diff --git a/pyiceberg/environment_context.py b/pyiceberg/environment_context.py new file mode 100644 index 0000000000..8791ae247f --- /dev/null +++ b/pyiceberg/environment_context.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from importlib.metadata import version + + +class EnvironmentContext: + _PROPERTIES: dict[str, str] = { + "engine-name": "pyiceberg", + "engine-version": version("pyiceberg"), + } + + def __init__(self) -> None: + raise NotImplementedError("EnvironmentContext is a utility class and cannot be instantiated.") + + @classmethod + def get(cls) -> dict[str, str]: + """Return a read-only copy of all properties.""" + return cls._PROPERTIES.copy() + + @classmethod + def put(cls, key: str, value: str) -> None: + """Will add the given key/value pair in a global properties map.""" + cls._PROPERTIES[key] = value + + @classmethod + def remove(cls, key: str) -> str | None: + """Remove the key from the global properties map.""" + return cls._PROPERTIES.pop(key, None) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 7e4c6eb1ec..c18aeb00f8 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -25,6 +25,7 @@ from pydantic import Field, PrivateAttr, model_serializer +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.io import FileIO from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -402,6 +403,9 @@ def _update_totals(total_property: str, added_property: str, removed_property: s removed_property=REMOVED_EQUALITY_DELETES, ) + for key, value in EnvironmentContext.get().items(): + summary.__setitem__(key, value) + return summary diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index c49689b716..ca84d6f7ed 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -23,6 +23,7 @@ from pyspark.sql import SparkSession from pyiceberg.catalog.rest import RestCatalog +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.exceptions import NoSuchTableError from pyiceberg.expressions import AlwaysTrue, EqualTo, LessThanOrEqual from pyiceberg.manifest import ManifestEntryStatus @@ -480,6 +481,8 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio "total-files-size": snapshots[2].summary["total-files-size"], "total-position-deletes": "1", "total-records": "4", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), }, ) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 03d4437d18..cf6f484b60 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -27,6 +27,7 @@ from pytest_lazy_fixtures import lf from pyiceberg.catalog import Catalog +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.exceptions import NoSuchTableError from pyiceberg.expressions import ( And, @@ -277,6 +278,8 @@ def test_inspect_snapshots( ("total-files-size", str(file_size)), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), + ("engine-name", "pyiceberg"), + ("engine-version", EnvironmentContext.get().get("engine-version")), ] # Delete @@ -290,6 +293,8 @@ def test_inspect_snapshots( ("total-files-size", "0"), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), + ("engine-name", "pyiceberg"), + ("engine-version", EnvironmentContext.get().get("engine-version")), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 1d1488255f..abfb9bad10 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -25,6 +25,7 @@ from pyspark.sql import SparkSession from pyiceberg.catalog import Catalog +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema @@ -498,6 +499,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[1] == { @@ -511,6 +514,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": str(file_size * 2), "total-position-deletes": "0", "total-records": "6", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[2] == { "removed-files-size": str(file_size * 2), @@ -523,6 +528,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": "0", "total-data-files": "0", "total-records": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[3] == { "changed-partition-count": "3", @@ -535,6 +542,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": str(file_size), "total-data-files": "3", "total-records": "3", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[4] == { "changed-partition-count": "3", @@ -547,6 +556,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": str(file_size * 2), "total-data-files": "6", "total-records": "6", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert "removed-files-size" in summaries[5] assert "total-files-size" in summaries[5] @@ -561,6 +572,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": summaries[5]["total-files-size"], "total-data-files": "2", "total-records": "2", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert "added-files-size" in summaries[6] assert "total-files-size" in summaries[6] @@ -575,6 +588,8 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro "total-files-size": summaries[6]["total-files-size"], "total-data-files": "4", "total-records": "4", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 609c1863bc..806c1da13f 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -44,6 +44,7 @@ from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.catalog.hive import HiveCatalog from pyiceberg.catalog.sql import SqlCatalog +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.exceptions import CommitFailedException, NoSuchTableError from pyiceberg.expressions import And, EqualTo, GreaterThanOrEqual, In, LessThan, Not from pyiceberg.io.pyarrow import UnsupportedPyArrowTypeException, _dataframe_to_data_files @@ -231,6 +232,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } # Append @@ -244,6 +247,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi "total-files-size": str(file_size * 2), "total-position-deletes": "0", "total-records": "6", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } # Delete @@ -257,6 +262,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi "total-files-size": "0", "total-position-deletes": "0", "total-records": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } # Append @@ -270,6 +277,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } @@ -326,6 +335,8 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal "total-files-size": summaries[0]["total-files-size"], "total-position-deletes": "0", "total-records": "5", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } # Java produces: # { @@ -367,6 +378,8 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal "total-files-size": summaries[1]["total-files-size"], "total-position-deletes": "0", "total-records": "4", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert len(tbl.scan().to_pandas()) == 4 @@ -831,6 +844,8 @@ def test_summaries_with_only_nulls( "total-files-size": "0", "total-position-deletes": "0", "total-records": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[1] == { @@ -843,6 +858,8 @@ def test_summaries_with_only_nulls( "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "2", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[2] == { @@ -855,6 +872,8 @@ def test_summaries_with_only_nulls( "total-files-size": "0", "total-position-deletes": "0", "total-records": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert summaries[3] == { @@ -864,6 +883,8 @@ def test_summaries_with_only_nulls( "total-files-size": "0", "total-position-deletes": "0", "total-records": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } @@ -1156,6 +1177,8 @@ def test_inspect_snapshots( ("total-files-size", str(file_size)), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), + ("engine-name", "pyiceberg"), + ("engine-version", EnvironmentContext.get().get("engine-version")), ] # Delete @@ -1169,6 +1192,8 @@ def test_inspect_snapshots( ("total-files-size", "0"), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), + ("engine-name", "pyiceberg"), + ("engine-version", EnvironmentContext.get().get("engine-version")), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index 077027f7b9..485a541630 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -19,6 +19,7 @@ import pytest +from pyiceberg.environment_context import EnvironmentContext from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema @@ -315,6 +316,8 @@ def test_merge_snapshot_summaries_empty() -> None: "total-files-size": "0", "total-position-deletes": "0", "total-equality-deletes": "0", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), }, ) @@ -349,6 +352,8 @@ def test_merge_snapshot_summaries_new_summary() -> None: "total-files-size": "4", "total-position-deletes": "5", "total-equality-deletes": "3", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), }, ) @@ -391,6 +396,8 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None: "total-files-size": "5", "total-position-deletes": "6", "total-equality-deletes": "4", + "engine-name": "pyiceberg", + "engine-version": EnvironmentContext.get().get("engine-version"), } assert actual.additional_properties == expected diff --git a/tests/test_environment_context.py b/tests/test_environment_context.py new file mode 100644 index 0000000000..fe0b44fd75 --- /dev/null +++ b/tests/test_environment_context.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import re + +from pyiceberg.environment_context import EnvironmentContext + + +def test_default_value() -> None: + actual = EnvironmentContext.get() + assert len(actual) == 2 + assert actual["engine-name"] == "pyiceberg" + assert re.match(r"^\d+\.\d+\.\d+", actual["engine-version"]) + + +def test_put_and_remove() -> None: + EnvironmentContext.put("test-key", "test-value") + assert EnvironmentContext.get()["test-key"] == "test-value" + + EnvironmentContext.remove("test-key") + assert "test-key" not in EnvironmentContext.get()