-
Notifications
You must be signed in to change notification settings - Fork 500
feat: Add support for writing bloom filters #3265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,7 +181,12 @@ | |
| from pyiceberg.utils.config import Config | ||
| from pyiceberg.utils.datetime import millis_to_datetime | ||
| from pyiceberg.utils.decimal import unscaled_to_decimal | ||
| from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int | ||
| from pyiceberg.utils.properties import ( | ||
| get_first_property_value, | ||
| property_as_bool, | ||
| property_as_float, | ||
| property_as_int, | ||
| ) | ||
| from pyiceberg.utils.singleton import Singleton | ||
| from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string | ||
|
|
||
|
|
@@ -2474,6 +2479,120 @@ def parquet_path_to_id_mapping( | |
| return result | ||
|
|
||
|
|
||
| def id_to_parquet_path_mapping(schema: Schema) -> dict[int, str]: | ||
| """ | ||
| Compute the mapping of Iceberg column ID to parquet column path. | ||
|
|
||
| Args: | ||
| schema (pyiceberg.schema.Schema): The current table schema. | ||
| """ | ||
| result: dict[int, str] = {} | ||
| for pair in pre_order_visit(schema, ID2ParquetPathVisitor()): | ||
| result[pair.field_id] = pair.parquet_path | ||
| return result | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class BloomFilterOptions: | ||
| parquet_path: str | ||
| ndv: int | None | ||
| fpp: float | None | ||
|
|
||
|
|
||
| class BloomFilterOptionsCollector(PreOrderSchemaVisitor[list[BloomFilterOptions]]): | ||
| _field_id: int = 0 | ||
| _schema: Schema | ||
| _properties: dict[str, str] | ||
|
|
||
| def __init__(self, schema: Schema, properties: dict[str, str], id_to_parquet_path_mapping: dict[int, str]): | ||
| self._schema = schema | ||
| self._properties = properties | ||
| self._id_to_parquet_path_mapping = id_to_parquet_path_mapping | ||
|
|
||
| def schema( | ||
| self, schema: Schema, struct_result: Callable[[], builtins.list[BloomFilterOptions]] | ||
| ) -> builtins.list[BloomFilterOptions]: | ||
| return struct_result() | ||
|
|
||
| def struct( | ||
| self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[BloomFilterOptions]]] | ||
| ) -> builtins.list[BloomFilterOptions]: | ||
| return list(itertools.chain(*[result() for result in field_results])) | ||
|
|
||
| def field( | ||
| self, field: NestedField, field_result: Callable[[], builtins.list[BloomFilterOptions]] | ||
| ) -> builtins.list[BloomFilterOptions]: | ||
| self._field_id = field.field_id | ||
| return field_result() | ||
|
|
||
| def list( | ||
| self, list_type: ListType, element_result: Callable[[], builtins.list[BloomFilterOptions]] | ||
| ) -> builtins.list[BloomFilterOptions]: | ||
| self._field_id = list_type.element_id | ||
| return element_result() | ||
|
|
||
| def map( | ||
| self, | ||
| map_type: MapType, | ||
| key_result: Callable[[], builtins.list[BloomFilterOptions]], | ||
| value_result: Callable[[], builtins.list[BloomFilterOptions]], | ||
| ) -> builtins.list[BloomFilterOptions]: | ||
| self._field_id = map_type.key_id | ||
| k = key_result() | ||
| self._field_id = map_type.value_id | ||
| v = value_result() | ||
| return k + v | ||
|
|
||
| def primitive(self, primitive: PrimitiveType) -> builtins.list[BloomFilterOptions]: | ||
| from pyiceberg.table import TableProperties | ||
|
|
||
| column_name = self._schema.find_column_name(self._field_id) | ||
| if column_name is None: | ||
| return [] | ||
|
|
||
| parquet_path = self._id_to_parquet_path_mapping.get(self._field_id) | ||
| if parquet_path is None: | ||
| return [] | ||
|
|
||
| bloom_filter_enabled = property_as_bool( | ||
| self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.{column_name}", False | ||
| ) | ||
| if not bloom_filter_enabled: | ||
| return [] | ||
|
|
||
| bloom_filter_fpp = property_as_float( | ||
| self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_FPP_PREFIX}.{column_name}", None | ||
| ) | ||
| bloom_filter_ndv = property_as_int( | ||
| self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_NDV_PREFIX}.{column_name}", None | ||
| ) | ||
|
|
||
| return [BloomFilterOptions(parquet_path=parquet_path, ndv=bloom_filter_ndv, fpp=bloom_filter_fpp)] | ||
|
|
||
|
|
||
| def get_bloom_filter_options( | ||
| schema: Schema, | ||
| table_properties: dict[str, str], | ||
| ) -> dict[str, dict[str, Any]]: | ||
| """ | ||
| Get the bloom filter options from the table properties. | ||
|
|
||
| Args: | ||
| schema (pyiceberg.schema.Schema): The current table schema. | ||
| table_properties (dict[str, str]): The table properties. | ||
| """ | ||
| bloom_filter_options = pre_order_visit( | ||
| schema, BloomFilterOptionsCollector(schema, table_properties, id_to_parquet_path_mapping(schema)) | ||
| ) | ||
| result: dict[str, dict[str, Any]] = {} | ||
| for bf_opts in bloom_filter_options: | ||
| result[bf_opts.parquet_path] = { | ||
| **({"ndv": bf_opts.ndv} if bf_opts.ndv is not None else {}), | ||
| **({"fpp": bf_opts.fpp} if bf_opts.fpp is not None else {}), | ||
| } | ||
| return result | ||
|
|
||
|
|
||
| def data_file_statistics_from_parquet_metadata( | ||
| parquet_metadata: pq.FileMetaData, | ||
| stats_columns: dict[int, StatisticsCollector], | ||
|
|
@@ -2596,7 +2715,6 @@ def data_file_statistics_from_parquet_metadata( | |
| def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]: | ||
| from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, TableProperties | ||
|
|
||
| parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties) | ||
| row_group_size = property_as_int( | ||
| properties=table_metadata.properties, | ||
| property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT, | ||
|
|
@@ -2613,6 +2731,8 @@ def write_parquet(task: WriteTask) -> DataFile: | |
| else: | ||
| file_schema = table_schema | ||
|
|
||
| parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties, file_schema) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understand correctly, neither input varies per file: table_metadata.properties is table-level, and file_schema is derived solely from table_metadata.schema() (just sanitized), so it's identical for every task. Would it make sense to lift the file_schema derivation and this _get_parquet_writer_kwargs call back out of write_parquet, computing them once per write_file call instead of once per write_parquet call? |
||
|
|
||
| downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False | ||
| batches = [ | ||
| _to_requested_schema( | ||
|
|
@@ -2757,14 +2877,25 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa | |
| PYARROW_UNCOMPRESSED_CODEC = "none" | ||
|
|
||
|
|
||
| def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]: | ||
| def _get_parquet_writer_kwargs(table_properties: Properties, file_schema: Schema) -> dict[str, Any]: | ||
| from pyiceberg.table import TableProperties | ||
|
|
||
| for key_pattern in [ | ||
| unsupported_key_patterns = [ | ||
| TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, | ||
| TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES, | ||
| f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.*", | ||
| ]: | ||
| ] | ||
|
|
||
| from packaging import version | ||
|
|
||
| MIN_PYARROW_VERSION_SUPPORTING_BLOOM_FILTER_WRITES = "24.0.0" | ||
| if version.parse(pyarrow.__version__) < version.parse(MIN_PYARROW_VERSION_SUPPORTING_BLOOM_FILTER_WRITES): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be better to make this explicitly a different error since it is implemented but is gate on the dependency version? |
||
| unsupported_key_patterns += [ | ||
| f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.*", | ||
| f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_FPP_PREFIX}.*", | ||
| f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_NDV_PREFIX}.*", | ||
| ] | ||
|
|
||
| for key_pattern in unsupported_key_patterns: | ||
| if unsupported_keys := fnmatch.filter(table_properties, key_pattern): | ||
| warnings.warn(f"Parquet writer option(s) {unsupported_keys} not implemented", stacklevel=2) | ||
|
|
||
|
|
@@ -2777,6 +2908,8 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]: | |
| if compression_codec == ICEBERG_UNCOMPRESSED_CODEC: | ||
| compression_codec = PYARROW_UNCOMPRESSED_CODEC | ||
|
|
||
| bloom_filter_options = get_bloom_filter_options(file_schema, table_properties) | ||
|
|
||
| return { | ||
| "compression": compression_codec, | ||
| "compression_level": compression_level, | ||
|
|
@@ -2795,6 +2928,7 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]: | |
| property_name=TableProperties.PARQUET_PAGE_ROW_LIMIT, | ||
| default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT, | ||
| ), | ||
| **({"bloom_filter_options": bloom_filter_options} if bloom_filter_options else {}), | ||
| } | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,11 +30,13 @@ | |
| import fastavro | ||
| import pandas as pd | ||
| import pandas.testing | ||
| import pyarrow | ||
| import pyarrow as pa | ||
| import pyarrow.compute as pc | ||
| import pyarrow.parquet as pq | ||
| import pytest | ||
| import pytz | ||
| from packaging import version | ||
| from pyarrow.fs import S3FileSystem | ||
| from pydantic_core import ValidationError | ||
| from pyspark.sql import SparkSession | ||
|
|
@@ -68,6 +70,11 @@ | |
| from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion | ||
| from utils import TABLE_SCHEMA, _create_table | ||
|
|
||
| skip_if_bloom_filter_not_supported = pytest.mark.skipif( | ||
| version.parse(pyarrow.__version__) < version.parse("24.0.0"), | ||
| reason="Requires pyarrow version >= 24.0.0", | ||
| ) | ||
|
|
||
|
|
||
| @pytest.fixture(scope="session", autouse=True) | ||
| def table_v1_with_null(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None: | ||
|
|
@@ -695,7 +702,6 @@ def test_write_parquet_other_properties( | |
| "properties", | ||
| [ | ||
| {"write.parquet.row-group-size-bytes": "42"}, | ||
| {"write.parquet.bloom-filter-enabled.column.bool": "42"}, | ||
| {"write.parquet.bloom-filter-max-bytes": "42"}, | ||
| ], | ||
| ) | ||
|
|
@@ -712,6 +718,27 @@ def test_write_parquet_unsupported_properties( | |
| tbl.append(arrow_table_with_null) | ||
|
|
||
|
|
||
| @pytest.mark.integration | ||
| @skip_if_bloom_filter_not_supported | ||
| @pytest.mark.parametrize("format_version", [1, 2]) | ||
| def test_write_parquet_bloom_filter_properties( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it make sense to assert pq.ParquetWriter is called with bloom_filter_options? |
||
| spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int | ||
| ) -> None: | ||
| identifier = "default.write_parquet_bloom_filter_properties" | ||
|
|
||
| _create_table( | ||
| session_catalog, | ||
| identifier, | ||
| { | ||
| "format-version": format_version, | ||
| "write.parquet.bloom-filter-enabled.column.string": "true", | ||
| "write.parquet.bloom-filter-fpp.column.string": "0.1", | ||
| "write.parquet.bloom-filter-ndv.column.string": "100", | ||
| }, | ||
| [arrow_table_with_null], | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.integration | ||
| @pytest.mark.parametrize("format_version", [1, 2]) | ||
| def test_spark_writes_orc_pyiceberg_reads(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think parquet_path_to_id_mapping(file_schema) already walks this schema with ID2ParquetPathVisitor for stats. If I'm reading it right, the bloom path adds a couple more passes over the same schema. Would it make sense to do a single ID2ParquetPathVisitor pass and deriving both the bloom options and the stats mapping from it?