From 5b5e1ecff9392acc41b033aa461dc7ad27a5b887 Mon Sep 17 00:00:00 2001 From: Decoder Date: Wed, 13 May 2026 04:25:48 -0700 Subject: [PATCH 1/2] =?UTF-8?q?spec(v1.0):=20rename=20@table=20=E2=86=92?= =?UTF-8?q?=20@dataset,=20add=20@proto,=20expand=20reserved=20names?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lockstep major-version cut alongside the protowire v1.0 spec freeze and protowire-cpp v1.0.0 (which this port wraps via nanobind). Hard cutover, no alias period. Python API rename surface: pxf.TableDirective → pxf.DatasetDirective pxf.TableReader → pxf.DatasetReader pxf.Result.tables → pxf.Result.datasets + pxf.ProtoDirective + pxf.ProtoShape (new) + Result.protos: tuple[ProtoDirective, ...] (new) FFI shape change: _protowire.pxf_unmarshal_full now returns a 6-tuple (raw, set_paths, null_paths, directives, datasets, protos). The cpp side surfaces ProtoDirective as a (shape, type_name, body) tuple; Python wraps it into the new frozen dataclass. Source files changed or renamed: src/_protowire/module.cc (PyTableReader → PyDatasetReader) src/protowire/pxf.py (class + helper renames) tests/test_pxf_table_reader.py → tests/test_pxf_dataset_reader.py New: tests/test_pxf_proto_directive.py with 16 cases covering all four @proto body shapes through the FFI, multi-@proto, nested braces, three error paths, parametrized reserved-name rejection, and a ProtoDirective dataclass check. Version bumps: pyproject.toml 0.75.0 → 1.0.0 src/protowire/__init__.py 0.75.0 → 1.0.0 pytest: 100 tests, 0 failures. 
--- CHANGELOG.md | 49 ++++++ pyproject.toml | 2 +- src/_protowire/module.cc | 89 ++++++---- src/protowire/__init__.py | 2 +- src/protowire/pxf.py | 92 ++++++---- ...e_reader.py => test_pxf_dataset_reader.py} | 82 ++++----- tests/test_pxf_directives.py | 34 ++-- tests/test_pxf_proto_directive.py | 158 ++++++++++++++++++ 8 files changed, 383 insertions(+), 125 deletions(-) rename tests/{test_pxf_table_reader.py => test_pxf_dataset_reader.py} (67%) create mode 100644 tests/test_pxf_proto_directive.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b901a8c..47ad4d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,55 @@ format changes. ## [Unreleased] +## [1.0.0] — 2026-05-13 + +First major-version cut. Implements the three one-time spec changes +from the [protowire v1.0 freeze line](https://github.com/trendvidia/protowire/releases/tag/v1.0.0) +in lockstep with the other v1.0 ports, by re-pinning to +[`protowire-cpp` v1.0.0](https://github.com/trendvidia/protowire-cpp/releases/tag/v1.0.0). +**Breaking** — there is no alias period; v1.0 is itself the major +bump. + +### Python API rename + +- `pxf.TableDirective` → `pxf.DatasetDirective` +- `pxf.TableReader` → `pxf.DatasetReader` +- `pxf.Result.tables` → `pxf.Result.datasets` +- Tuple cell type aliases are unchanged. + +Source files renamed: + +- `tests/test_pxf_table_reader.py` → `tests/test_pxf_dataset_reader.py` + +### Python API additions + +- `pxf.ProtoDirective` (frozen dataclass with `shape`, `type_name`, + `body`) — exposed in `Result.protos`. The `shape` field is the + string literal `"anonymous" | "named" | "source" | "descriptor"`, + matching the cpp enum. +- `pxf.ProtoShape` type alias for the literal union. + +### FFI shape + +`_protowire.pxf_unmarshal_full` now returns a 6-tuple (raw bytes, +set_paths, null_paths, directives, datasets, **protos**). Python's +`Result` gains a `protos: tuple[ProtoDirective, ...]` field. + +### Build + +- `pyproject.toml` version `0.75.0` → `1.0.0`. 
+- `__version__` in `src/protowire/__init__.py` bumped accordingly. +- Sibling-checkout dependency on `../protowire-cpp` resolves at the + v1.0.0 tag. + +### Tests + +- New `tests/test_pxf_proto_directive.py` with 16 cases covering all + four `@proto` body shapes via the FFI roundtrip, multi-`@proto`, + nested-brace bodies, three error paths, parametrized reserved- + directive-name rejection, and a `ProtoDirective` dataclass check. +- pytest: 100 tests, 0 failures. + ## [0.75.0] — 2026-05-12 First release after the v0.70.0 baseline. Wraps the diff --git a/pyproject.toml b/pyproject.toml index 52a0b47..b478e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build" # 2021 CLI tool). The import name stays `import protowire` — these two # names are independent (cf. python-dateutil → import dateutil). name = "protowire-python" -version = "0.75.0" +version = "1.0.0" description = "Python wrapper around protowire-cpp — PXF text, SBE binary, and envelope codecs." readme = "README.md" requires-python = ">=3.10" diff --git a/src/_protowire/module.cc b/src/_protowire/module.cc index c6e36fc..68c6355 100644 --- a/src/_protowire/module.cc +++ b/src/_protowire/module.cc @@ -82,7 +82,7 @@ const pbuf::Descriptor* FindDescriptor(const SchemaBundle& s, // CellToPyTuple converts a single AST cell value (or std::nullopt for an // absent cell) into the FFI shape consumed by pxf.py — `None` for absent, -// `(kind, value)` otherwise. Used by PxfUnmarshalFull for @table rows. +// `(kind, value)` otherwise. Used by PxfUnmarshalFull for @dataset rows. 
// // kind values mirror the AST variant tags: // "null" → nb::none() @@ -121,7 +121,7 @@ nb::object CellToPyTuple(const std::optional& cell) { } else if constexpr (std::is_same_v) { return nb::make_tuple(std::string("duration"), p->raw); } else { - // List / Block are rejected at @table cell-parse time, so this + // List / Block are rejected at @dataset cell-parse time, so this // branch is unreachable for cells. Surface as a clean error. return nb::make_tuple(std::string("unknown"), nb::none()); } @@ -156,14 +156,19 @@ nb::bytes PxfUnmarshal(nb::bytes text, nb::bytes fds_bytes, // Directive FFI shape: (name, prefixes, type, body, has_body, line, column). using PyDirective = std::tuple, std::string, nb::bytes, bool, int, int>; -// TableDirective FFI shape: (type, columns, rows) where rows is a list of +// DatasetDirective FFI shape: (type, columns, rows) where rows is a list of // lists of cells (each cell None or (kind, value); see CellToPyTuple). -using PyTableDirective = std::tuple, +using PyDatasetDirective = std::tuple, std::vector>>; +// ProtoDirective FFI shape: (shape, type_name, body) where shape is one of +// "anonymous" / "named" / "source" / "descriptor" (draft §3.4.5). +using PyProtoDirective = std::tuple; -// PXF text -> (binary proto bytes, set_paths, null_paths, directives, tables). +// PXF text -> (binary proto bytes, set_paths, null_paths, directives, +// datasets, protos). std::tuple, std::vector, - std::vector, std::vector> + std::vector, std::vector, + std::vector> PxfUnmarshalFull(nb::bytes text, nb::bytes fds_bytes, const std::string& full_name, bool discard_unknown, bool skip_validate) { @@ -193,10 +198,10 @@ PxfUnmarshalFull(nb::bytes text, nb::bytes fds_bytes, nb::bytes(d.body.data(), d.body.size()), d.has_body, d.pos.line, d.pos.column); } - // Marshal tables. - std::vector py_tables; - py_tables.reserve(r->Tables().size()); - for (const auto& t : r->Tables()) { + // Marshal datasets. 
+ std::vector py_datasets; + py_datasets.reserve(r->Datasets().size()); + for (const auto& t : r->Datasets()) { std::vector> py_rows; py_rows.reserve(t.rows.size()); for (const auto& row : t.rows) { @@ -205,13 +210,23 @@ PxfUnmarshalFull(nb::bytes text, nb::bytes fds_bytes, for (const auto& cell : row.cells) py_cells.push_back(CellToPyTuple(cell)); py_rows.push_back(std::move(py_cells)); } - py_tables.emplace_back(t.type, t.columns, std::move(py_rows)); + py_datasets.emplace_back(t.type, t.columns, std::move(py_rows)); + } + // Marshal protos. + std::vector py_protos; + py_protos.reserve(r->Protos().size()); + for (const auto& p : r->Protos()) { + py_protos.emplace_back( + std::string(protowire::pxf::ProtoShapeName(p.shape)), + p.type_name, + nb::bytes(p.body.data(), p.body.size())); } return {nb::bytes(out.data(), out.size()), r->SetFields(), r->NullFields(), std::move(py_dirs), - std::move(py_tables)}; + std::move(py_datasets), + std::move(py_protos)}; } // PXF schema reserved-name check (draft §3.13). Returns a list of @@ -236,21 +251,21 @@ PxfValidateDescriptor(nb::bytes fds_bytes, const std::string& full_name) { return out; } -// --- PyTableReader: streaming @table consumption ------------------------- +// --- PyDatasetReader: streaming @dataset consumption ------------------------- // -// Wraps protowire::pxf::TableReader. The reader takes a std::istream*; we +// Wraps protowire::pxf::DatasetReader. The reader takes a std::istream*; we // hold the istringstream alongside the reader so its lifetime is bound to // the Python object. Input is provided as bytes (PR-2 scope); a file-like // streambuf bridge is a possible follow-up. 
-class PyTableReader { +class PyDatasetReader { public: - static std::unique_ptr FromBytes(nb::bytes data) { - auto out = std::unique_ptr(new PyTableReader()); + static std::unique_ptr FromBytes(nb::bytes data) { + auto out = std::unique_ptr(new PyDatasetReader()); out->stream_ = std::make_unique( std::string(data.c_str(), data.size())); - auto tr = protowire::pxf::TableReader::Create(out->stream_.get()); + auto tr = protowire::pxf::DatasetReader::Create(out->stream_.get()); if (!tr.ok()) { - throw nb::value_error(("pxf.TableReader: " + tr.status().ToString()).c_str()); + throw nb::value_error(("pxf.DatasetReader: " + tr.status().ToString()).c_str()); } out->reader_ = std::move(*tr); // Marshal the side-channel directives once at construction; they're @@ -273,10 +288,10 @@ class PyTableReader { // Raises ValueError on parse error. nb::object NextOrNone() { if (reader_->Done()) return nb::none(); - protowire::pxf::TableRow row; + protowire::pxf::DatasetRow row; auto s = reader_->Next(&row); if (!s.ok()) { - throw nb::value_error(("pxf.TableReader.next: " + s.ToString()).c_str()); + throw nb::value_error(("pxf.DatasetReader.next: " + s.ToString()).c_str()); } if (reader_->Done()) return nb::none(); return RowToList(row); @@ -285,10 +300,10 @@ class PyTableReader { // Iterator protocol: __next__ raises StopIteration at EOF. nb::object Next() { if (reader_->Done()) throw nb::stop_iteration(); - protowire::pxf::TableRow row; + protowire::pxf::DatasetRow row; auto s = reader_->Next(&row); if (!s.ok()) { - throw nb::value_error(("pxf.TableReader.next: " + s.ToString()).c_str()); + throw nb::value_error(("pxf.DatasetReader.next: " + s.ToString()).c_str()); } if (reader_->Done()) throw nb::stop_iteration(); return RowToList(row); @@ -296,8 +311,8 @@ class PyTableReader { // Drains the remaining buffered + underlying bytes. 
Only meaningful // after Done(); the Python wrapper exposes this as a method that - // returns bytes so callers can chain a second TableReader on - // multi-@table documents. + // returns bytes so callers can chain a second DatasetReader on + // multi-@dataset documents. nb::bytes Tail() { auto t = reader_->Tail(); std::ostringstream buf; @@ -307,7 +322,7 @@ class PyTableReader { } private: - static nb::object RowToList(const protowire::pxf::TableRow& row) { + static nb::object RowToList(const protowire::pxf::DatasetRow& row) { std::vector cells; cells.reserve(row.cells.size()); for (const auto& cell : row.cells) cells.push_back(CellToPyTuple(cell)); @@ -315,7 +330,7 @@ class PyTableReader { } std::unique_ptr stream_; - std::unique_ptr reader_; + std::unique_ptr reader_; std::vector directives_; }; @@ -502,16 +517,16 @@ NB_MODULE(_protowire, m) { m.def("pxf_marshal", &PxfMarshal, "msg_bytes"_a, "fds"_a, "full_name"_a); m.def("pxf_validate_descriptor", &PxfValidateDescriptor, "fds"_a, "full_name"_a); - nb::class_(m, "PxfTableReader") - .def_static("from_bytes", &PyTableReader::FromBytes, "data"_a) - .def_prop_ro("type", &PyTableReader::Type) - .def_prop_ro("columns", &PyTableReader::Columns) - .def_prop_ro("directives", &PyTableReader::Directives) - .def_prop_ro("done", &PyTableReader::Done) - .def("next_or_none", &PyTableReader::NextOrNone) - .def("tail", &PyTableReader::Tail) - .def("__iter__", [](PyTableReader& self) -> PyTableReader& { return self; }) - .def("__next__", &PyTableReader::Next); + nb::class_(m, "PxfDatasetReader") + .def_static("from_bytes", &PyDatasetReader::FromBytes, "data"_a) + .def_prop_ro("type", &PyDatasetReader::Type) + .def_prop_ro("columns", &PyDatasetReader::Columns) + .def_prop_ro("directives", &PyDatasetReader::Directives) + .def_prop_ro("done", &PyDatasetReader::Done) + .def("next_or_none", &PyDatasetReader::NextOrNone) + .def("tail", &PyDatasetReader::Tail) + .def("__iter__", [](PyDatasetReader& self) -> PyDatasetReader& { return 
self; }) + .def("__next__", &PyDatasetReader::Next); nb::class_(m, "SbeCodec") .def_static("create", &SbeCodec::Create, "fds"_a, "file_names"_a) diff --git a/src/protowire/__init__.py b/src/protowire/__init__.py index ed2b8dc..e27eba9 100644 --- a/src/protowire/__init__.py +++ b/src/protowire/__init__.py @@ -5,4 +5,4 @@ from . import envelope, pxf, sbe __all__ = ["pxf", "sbe", "envelope"] -__version__ = "0.75.0" +__version__ = "1.0.0" diff --git a/src/protowire/pxf.py b/src/protowire/pxf.py index 81ca023..efc7713 100644 --- a/src/protowire/pxf.py +++ b/src/protowire/pxf.py @@ -21,7 +21,7 @@ # --- Directive surface (PXF v0.72+) -------------------------------------- -# A single `@table` row cell. `None` denotes an absent cell (no value between +# A single `@dataset` row cell. `None` denotes an absent cell (no value between # two commas, draft §3.4.4); a non-None Cell is a (kind, value) pair where # kind is one of the strings below. # @@ -61,11 +61,11 @@ class Directive: @dataclass(frozen=True) -class TableDirective: - """An `@table TYPE ( cols ) row*` directive at document root. +class DatasetDirective: + """An `@dataset TYPE ( cols ) row*` directive at document root. - Per draft §3.4.4 a document with any TableDirective MUST NOT have a - @type directive or top-level field entries — the @table header IS + Per draft §3.4.4 a document with any DatasetDirective MUST NOT have a + @type directive or top-level field entries — the @dataset header IS the document's type declaration. """ @@ -74,6 +74,32 @@ class TableDirective: rows: tuple[tuple[Cell, ...], ...] +ProtoShape = Literal["anonymous", "named", "source", "descriptor"] + + +@dataclass(frozen=True) +class ProtoDirective: + """An `@proto ` directive at document root (draft §3.4.5). + + Carries an embedded protobuf schema, making the PXF document + self-describing. `shape` distinguishes the four lexically-determined + body forms; `type_name` is non-empty only when `shape == "named"`. 
+ `body` carries raw bytes per shape: + + - `"anonymous"` / `"named"`: bytes between the opening `{` and matching + `}` (both exclusive). Protobuf message-body source. + - `"source"`: contents of the triple-quoted string (with leading-LF + stripping / common-prefix dedent already applied). A complete + ``.proto`` source file. + - `"descriptor"`: base64-decoded bytes of the bytes literal. A + serialised ``google.protobuf.FileDescriptorSet``. + """ + + shape: ProtoShape + type_name: str + body: bytes + + @dataclass(frozen=True) class Violation: """A schema reserved-name violation, draft §3.13. @@ -101,7 +127,8 @@ class Result: set_paths: frozenset[str] null_paths: frozenset[str] directives: tuple[Directive, ...] = field(default_factory=tuple) - tables: tuple[TableDirective, ...] = field(default_factory=tuple) + datasets: tuple[DatasetDirective, ...] = field(default_factory=tuple) + protos: tuple["ProtoDirective", ...] = field(default_factory=tuple) def is_set(self, path: str) -> bool: return path in self.set_paths and path not in self.null_paths @@ -171,10 +198,10 @@ def unmarshal_full_bytes( skip_validate: bool = False, ) -> tuple[bytes, Result]: text = data.encode("utf-8") if isinstance(data, str) else bytes(data) - raw, set_paths, null_paths, dirs, tables = _protowire.pxf_unmarshal_full( + raw, set_paths, null_paths, dirs, tables, protos = _protowire.pxf_unmarshal_full( text, bytes(fds), full_name, discard_unknown, skip_validate ) - return raw, _wrap_result(set_paths, null_paths, dirs, tables) + return raw, _wrap_result(set_paths, null_paths, dirs, tables, protos) # --- Decoders -------------------------------------------------------------- @@ -205,24 +232,24 @@ def unmarshal_full( skip_validate: bool = False, ) -> Result: """Decode PXF + return per-field presence (set/null) metadata and any - `@` / `@table` directives the decoder saw at the document root. + `@` / `@dataset` directives the decoder saw at the document root. Mirrors Go pxf.UnmarshalFull. 
""" text = data.encode("utf-8") if isinstance(data, str) else bytes(data) fds = fds_for_message(msg) - raw, set_paths, null_paths, dirs, tables = _protowire.pxf_unmarshal_full( + raw, set_paths, null_paths, dirs, tables, protos = _protowire.pxf_unmarshal_full( text, fds, msg.DESCRIPTOR.full_name, discard_unknown, skip_validate ) msg.Clear() msg.MergeFromString(raw) - return _wrap_result(set_paths, null_paths, dirs, tables) + return _wrap_result(set_paths, null_paths, dirs, tables, protos) # --- Internal helpers ---------------------------------------------------- -def _wrap_result(set_paths, null_paths, raw_dirs, raw_tables) -> Result: +def _wrap_result(set_paths, null_paths, raw_dirs, raw_tables, raw_protos) -> Result: dirs = tuple( Directive( name=name, @@ -236,7 +263,7 @@ def _wrap_result(set_paths, null_paths, raw_dirs, raw_tables) -> Result: for (name, prefixes, type_, body, has_body, line, column) in raw_dirs ) tables = tuple( - TableDirective( + DatasetDirective( type=type_, columns=tuple(columns), rows=tuple( @@ -245,11 +272,20 @@ def _wrap_result(set_paths, null_paths, raw_dirs, raw_tables) -> Result: ) for (type_, columns, rows) in raw_tables ) + protos = tuple( + ProtoDirective( + shape=shape, + type_name=type_name, + body=bytes(body), + ) + for (shape, type_name, body) in raw_protos + ) return Result( set_paths=frozenset(set_paths), null_paths=frozenset(null_paths), directives=dirs, - tables=tables, + datasets=tables, + protos=protos, ) @@ -264,21 +300,21 @@ def _normalize_cell(c) -> Cell: return c # already in the right shape -# --- TableReader (streaming @table consumption, draft §3.4.4) ------------ +# --- DatasetReader (streaming @dataset consumption, draft §3.4.4) ------------ -class TableReader: - """Streaming row reader for a single `@table` directive. +class DatasetReader: + """Streaming row reader for a single `@dataset` directive. 
- `unmarshal_full` materializes every row of a `@table` directive into + `unmarshal_full` materializes every row of a `@dataset` directive into - `Result.tables`. That works for small datasets and breaks for the + `Result.datasets`. That works for small datasets and breaks for the - CSV-replacement workload `@table` was designed for. `TableReader` + CSV-replacement workload `@dataset` was designed for. `DatasetReader` pulls one row at a time from an in-memory buffer; working-set memory is bounded by the size of the largest single row. Usage:: - reader = pxf.TableReader.from_bytes(open("trades.pxf", "rb").read()) + reader = pxf.DatasetReader.from_bytes(open("trades.pxf", "rb").read()) for row in reader: msg = TradeMsg() pxf.bind_row(msg, reader.columns, row) @@ -299,10 +335,10 @@ def __init__(self, native: object) -> None: self._native = native @classmethod - def from_bytes(cls, data: bytes | str) -> "TableReader": + def from_bytes(cls, data: bytes | str) -> "DatasetReader": if isinstance(data, str): data = data.encode("utf-8") - return cls(_protowire.PxfTableReader.from_bytes(bytes(data))) + return cls(_protowire.PxfDatasetReader.from_bytes(bytes(data))) @property def type(self) -> str: @@ -336,7 +372,7 @@ def next_or_none(self) -> tuple[Cell, ...] | None: cells = self._native.next_or_none() return None if cells is None else tuple(cells) - def __iter__(self) -> "TableReader": + def __iter__(self) -> "DatasetReader": return self def __next__(self) -> tuple[Cell, ...]: @@ -346,12 +382,12 @@ def tail(self) -> bytes: """Returns the bytes the reader has buffered but not consumed, followed by any remaining bytes from the underlying source. - Use to chain a second `TableReader` for documents containing - multiple `@table` directives:: + Use to chain a second `DatasetReader` for documents containing + multiple `@dataset` directives:: - tr1 = pxf.TableReader.from_bytes(data) + tr1 = pxf.DatasetReader.from_bytes(data) for _ in tr1: ... 
- tr2 = pxf.TableReader.from_bytes(tr1.tail()) + tr2 = pxf.DatasetReader.from_bytes(tr1.tail()) MUST only be called after iteration has exhausted (i.e. `done` is True). Calling earlier returns bytes the current reader @@ -380,7 +416,7 @@ def bind_row( *, skip_validate: bool = True, ) -> None: - """Bind a `@table` row's cells to `msg`'s fields by column name. + """Bind a `@dataset` row's cells to `msg`'s fields by column name. `columns` and `row` MUST have the same length. Cell-state semantics: diff --git a/tests/test_pxf_table_reader.py b/tests/test_pxf_dataset_reader.py similarity index 67% rename from tests/test_pxf_table_reader.py rename to tests/test_pxf_dataset_reader.py index bf3ad89..e844a9b 100644 --- a/tests/test_pxf_table_reader.py +++ b/tests/test_pxf_dataset_reader.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2026 TrendVidia, LLC. -"""Tests for pxf.TableReader (streaming @table consumption) and pxf.bind_row +"""Tests for pxf.DatasetReader (streaming @dataset consumption) and pxf.bind_row (per-row proto binding). 
PR 2 of the Python v0.72-v0.75 catch-up.""" from __future__ import annotations @@ -10,39 +10,39 @@ from protowire import pxf -# ---- TableReader.from_bytes header parsing ------------------------------- +# ---- DatasetReader.from_bytes header parsing ------------------------------- def test_reads_header_and_exposes_type_and_columns(): - src = b"@table trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n" - tr = pxf.TableReader.from_bytes(src) + src = b"@dataset trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n" + tr = pxf.DatasetReader.from_bytes(src) assert tr.type == "trades.v1.Trade" assert tr.columns == ("px", "qty") assert tr.directives == () def test_accepts_str_input(): - tr = pxf.TableReader.from_bytes("@table x.Row ( a )\n( 1 )\n") + tr = pxf.DatasetReader.from_bytes("@dataset x.Row ( a )\n( 1 )\n") assert tr.type == "x.Row" def test_no_table_raises(): - with pytest.raises(ValueError, match="no @table"): - pxf.TableReader.from_bytes(b"@type foo.Msg\nname = \"x\"\n") + with pytest.raises(ValueError, match="no @dataset"): + pxf.DatasetReader.from_bytes(b"@type foo.Msg\nname = \"x\"\n") def test_empty_input_raises(): with pytest.raises(ValueError): - pxf.TableReader.from_bytes(b"") + pxf.DatasetReader.from_bytes(b"") def test_leading_directives_preserved(): src = b'''@header pkg.Hdr { id = "h" } @frob alpha -@table trades.v1.Trade ( px, qty ) +@dataset trades.v1.Trade ( px, qty ) ( 1, 2 ) ''' - tr = pxf.TableReader.from_bytes(src) + tr = pxf.DatasetReader.from_bytes(src) assert len(tr.directives) == 2 assert tr.directives[0].name == "header" assert tr.directives[1].name == "frob" @@ -50,18 +50,18 @@ def test_leading_directives_preserved(): def test_header_oversize_rejected(): - # >64 KiB of leading directive bytes before any @table. - big = b"@frob " + (b"x " * 35000) + b"\n@table x.Row ( a )\n" + # >64 KiB of leading directive bytes before any @dataset. 
+ big = b"@frob " + (b"x " * 35000) + b"\n@dataset x.Row ( a )\n" with pytest.raises(ValueError, match="header exceeds"): - pxf.TableReader.from_bytes(big) + pxf.DatasetReader.from_bytes(big) # ---- iteration ---------------------------------------------------------- def test_iter_yields_rows_in_order(): - src = b"@table x.Row ( a, b )\n( 1, 2 )\n( 3, 4 )\n( 5, 6 )\n" - tr = pxf.TableReader.from_bytes(src) + src = b"@dataset x.Row ( a, b )\n( 1, 2 )\n( 3, 4 )\n( 5, 6 )\n" + tr = pxf.DatasetReader.from_bytes(src) rows = list(tr) assert rows == [ (("int", "1"), ("int", "2")), @@ -72,14 +72,14 @@ def test_iter_yields_rows_in_order(): def test_zero_rows_immediately_stops(): - tr = pxf.TableReader.from_bytes(b"@table x.Row ( a )\n") + tr = pxf.DatasetReader.from_bytes(b"@dataset x.Row ( a )\n") rows = list(tr) assert rows == [] assert tr.done def test_next_or_none_returns_none_at_eof(): - tr = pxf.TableReader.from_bytes(b"@table x.Row ( a )\n( 1 )\n") + tr = pxf.DatasetReader.from_bytes(b"@dataset x.Row ( a )\n( 1 )\n") first = tr.next_or_none() assert first == (("int", "1"),) assert tr.next_or_none() is None @@ -87,10 +87,10 @@ def test_next_or_none_returns_none_at_eof(): def test_cell_shapes_match_three_state_grammar(): - src = b"""@table x.Row ( a, b, c, d, e ) + src = b"""@dataset x.Row ( a, b, c, d, e ) ( 42, "hi", true, null, ) """ - tr = pxf.TableReader.from_bytes(src) + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) assert row[0] == ("int", "42") assert row[1] == ("string", "hi") @@ -100,15 +100,15 @@ def test_cell_shapes_match_three_state_grammar(): def test_arity_mismatch_raises(): - src = b"@table x.Row ( a, b )\n( 1, 2, 3 )\n" - tr = pxf.TableReader.from_bytes(src) + src = b"@dataset x.Row ( a, b )\n( 1, 2, 3 )\n" + tr = pxf.DatasetReader.from_bytes(src) with pytest.raises(ValueError, match="3 cells, expected 2"): next(iter(tr)) def test_parens_inside_string_not_row_boundary(): - src = b'@table x.Row ( a )\n( "hi ) there" )\n( "next" )\n' - tr = 
pxf.TableReader.from_bytes(src) + src = b'@dataset x.Row ( a )\n( "hi ) there" )\n( "next" )\n' + tr = pxf.DatasetReader.from_bytes(src) rows = list(tr) assert rows == [ (("string", "hi ) there"),), @@ -117,7 +117,7 @@ def test_parens_inside_string_not_row_boundary(): def test_comments_between_rows_ignored(): - src = b"""@table x.Row ( a ) + src = b"""@dataset x.Row ( a ) # leading ( 1 ) // mid @@ -126,7 +126,7 @@ def test_comments_between_rows_ignored(): comment */ ( 3 ) """ - tr = pxf.TableReader.from_bytes(src) + tr = pxf.DatasetReader.from_bytes(src) assert len(list(tr)) == 3 @@ -134,17 +134,17 @@ def test_comments_between_rows_ignored(): def test_tail_chains_to_second_table(): - src = b"""@table a.Row ( x ) + src = b"""@dataset a.Row ( x ) ( 1 ) ( 2 ) -@table b.Row ( y ) +@dataset b.Row ( y ) ( "p" ) ( "q" ) """ - tr1 = pxf.TableReader.from_bytes(src) + tr1 = pxf.DatasetReader.from_bytes(src) assert tr1.type == "a.Row" list(tr1) # drain - tr2 = pxf.TableReader.from_bytes(tr1.tail()) + tr2 = pxf.DatasetReader.from_bytes(tr1.tail()) assert tr2.type == "b.Row" rows = list(tr2) assert rows == [ @@ -157,8 +157,8 @@ def test_tail_chains_to_second_table(): def test_bind_row_sets_fields_by_column(all_types_cls): - src = b'@table test.v1.AllTypes ( string_field, int32_field )\n( "alpha", 42 )\n' - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( string_field, int32_field )\n( "alpha", 42 )\n' + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) msg = all_types_cls() pxf.bind_row(msg, tr.columns, row) @@ -167,8 +167,8 @@ def test_bind_row_sets_fields_by_column(all_types_cls): def test_scan_equivalent_to_next_plus_bind(all_types_cls): - src = b'@table test.v1.AllTypes ( string_field )\n( "row1" )\n( "row2" )\n' - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( string_field )\n( "row1" )\n( "row2" )\n' + tr = pxf.DatasetReader.from_bytes(src) seen = [] while True: msg = all_types_cls() @@ -181,8 +181,8 @@ def 
test_scan_equivalent_to_next_plus_bind(all_types_cls): def test_bind_row_absent_cell_leaves_default(all_types_cls): # proto3 string default is ""; absent cell shouldn't stamp a value. - src = b'@table test.v1.AllTypes ( string_field, int32_field )\n( , 7 )\n' - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( string_field, int32_field )\n( , 7 )\n' + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) msg = all_types_cls() pxf.bind_row(msg, tr.columns, row) @@ -192,8 +192,8 @@ def test_bind_row_absent_cell_leaves_default(all_types_cls): def test_bind_row_null_clears_wrapper(all_types_cls): # A `null` cell on a wrapper field clears it (draft §3.9). - src = b'@table test.v1.AllTypes ( nullable_string )\n( null )\n' - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( nullable_string )\n( null )\n' + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) msg = all_types_cls() msg.nullable_string.value = "stale" # populate to confirm clear @@ -204,8 +204,8 @@ def test_bind_row_null_clears_wrapper(all_types_cls): def test_bind_row_bytes_cell(all_types_cls): - src = b'@table test.v1.AllTypes ( bytes_field )\n( b"YWJj" )\n' # "abc" - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( bytes_field )\n( b"YWJj" )\n' # "abc" + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) msg = all_types_cls() pxf.bind_row(msg, tr.columns, row) @@ -227,8 +227,8 @@ def test_bind_row_unknown_column_errors(all_types_cls): def test_bind_row_string_escape(all_types_cls): # String values containing quotes and backslashes must round-trip # via the synthetic body formatter. 
- src = b'@table test.v1.AllTypes ( string_field )\n( "she said \\"hi\\"" )\n' - tr = pxf.TableReader.from_bytes(src) + src = b'@dataset test.v1.AllTypes ( string_field )\n( "she said \\"hi\\"" )\n' + tr = pxf.DatasetReader.from_bytes(src) (row,) = list(tr) msg = all_types_cls() pxf.bind_row(msg, tr.columns, row) diff --git a/tests/test_pxf_directives.py b/tests/test_pxf_directives.py index 0381bbc..c2cc905 100644 --- a/tests/test_pxf_directives.py +++ b/tests/test_pxf_directives.py @@ -2,7 +2,7 @@ # Copyright (c) 2026 TrendVidia, LLC. """Tests for the PXF v0.72+ surface exposed in v0.75.0: - - `Result.directives` and `Result.tables` populated by `unmarshal_full` + - `Result.directives` and `Result.datasets` populated by `unmarshal_full` - `pxf.validate_descriptor` and `pxf.Violation` - `skip_validate` opt-out on `unmarshal` / `unmarshal_full` """ @@ -21,7 +21,7 @@ def test_directives_empty_when_no_at_directives(all_types_cls): msg = all_types_cls() r = pxf.unmarshal_full('string_field = "x"', msg) assert r.directives == () - assert r.tables == () + assert r.datasets == () def test_bare_directive_recorded(all_types_cls): @@ -78,15 +78,15 @@ def test_at_type_does_not_leak_into_directives(all_types_cls): assert r.directives[0].name == "frob" -# ---- Result.tables ------------------------------------------------------ +# ---- Result.datasets ------------------------------------------------------ def test_table_recorded_with_columns_and_rows(all_types_cls): msg = all_types_cls() - src = "@table trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n" + src = "@dataset trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n" r = pxf.unmarshal_full(src, msg) - assert len(r.tables) == 1 - t = r.tables[0] + assert len(r.datasets) == 1 + t = r.datasets[0] assert t.type == "trades.v1.Trade" assert t.columns == ("px", "qty") assert len(t.rows) == 2 @@ -96,9 +96,9 @@ def test_table_recorded_with_columns_and_rows(all_types_cls): def test_table_cell_shapes(all_types_cls): msg = 
all_types_cls() - src = '@table x.Row ( a, b, c, d )\n( 42, "hello", true, null )\n' + src = '@dataset x.Row ( a, b, c, d )\n( 42, "hello", true, null )\n' r = pxf.unmarshal_full(src, msg) - row = r.tables[0].rows[0] + row = r.datasets[0].rows[0] assert row[0] == ("int", "42") assert row[1] == ("string", "hello") assert row[2] == ("bool", True) @@ -109,8 +109,8 @@ def test_three_state_cells(all_types_cls): msg = all_types_cls() # Empty cell = None (absent); null literal = ("null", None) (present-but-null); # value = ("", value) (present-with-value). - r = pxf.unmarshal_full("@table x.Row ( a, b, c )\n( 1, , null )\n", msg) - row = r.tables[0].rows[0] + r = pxf.unmarshal_full("@dataset x.Row ( a, b, c )\n( 1, , null )\n", msg) + row = r.datasets[0].rows[0] assert row[0] == ("int", "1") assert row[1] is None # absent assert row[2] == ("null", None) @@ -119,23 +119,23 @@ def test_three_state_cells(all_types_cls): def test_multiple_tables_in_order(all_types_cls): msg = all_types_cls() src = ( - "@table a.Row ( x )\n" + "@dataset a.Row ( x )\n" "( 1 )\n" - "@table b.Row ( y )\n" + "@dataset b.Row ( y )\n" '( "p" )\n' ) r = pxf.unmarshal_full(src, msg) - assert [t.type for t in r.tables] == ["a.Row", "b.Row"] + assert [t.type for t in r.datasets] == ["a.Row", "b.Row"] def test_directives_and_tables_can_coexist(all_types_cls): - # A doc with @table can NOT have @type or body entries, but can carry - # generic @s before the @table header. + # A doc with @dataset can NOT have @type or body entries, but can carry + # generic @s before the @dataset header. 
msg = all_types_cls() - src = '@header pkg.Hdr { id = "h" }\n@table x.Row ( a )\n( 1 )\n' + src = '@header pkg.Hdr { id = "h" }\n@dataset x.Row ( a )\n( 1 )\n' r = pxf.unmarshal_full(src, msg) assert len(r.directives) == 1 - assert len(r.tables) == 1 + assert len(r.datasets) == 1 assert r.directives[0].name == "header" diff --git a/tests/test_pxf_proto_directive.py b/tests/test_pxf_proto_directive.py new file mode 100644 index 0000000..70d2285 --- /dev/null +++ b/tests/test_pxf_proto_directive.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 TrendVidia, LLC. +"""Tests for the @proto directive (draft §3.4.5). + +Four body shapes lexically distinguished: anonymous, named, source, +descriptor. Plus reserved-directive-name rejection (draft §3.4.6). + +These exercise the FFI roundtrip through protowire-cpp via +unmarshal_full, since Python doesn't expose a pure Parse() entry +point — the AST-tier checks happen on the cpp side and surface to +Python through Result.protos. 
+""" + +import base64 + +import pytest + +from protowire import pxf + + +def test_anonymous_body(all_types_cls): + msg = all_types_cls() + r = pxf.unmarshal_full( + b"""@proto { + string symbol = 1; + double price = 2; +} +string_field = "hi" +""", + msg, + ) + assert len(r.protos) == 1 + p = r.protos[0] + assert p.shape == "anonymous" + assert p.type_name == "" + assert b"string symbol = 1;" in p.body + assert b"double price = 2;" in p.body + + +def test_named_body(all_types_cls): + msg = all_types_cls() + r = pxf.unmarshal_full( + b"""@proto trades.v1.Trade { + string symbol = 1; +} +string_field = "hi" +""", + msg, + ) + assert len(r.protos) == 1 + assert r.protos[0].shape == "named" + assert r.protos[0].type_name == "trades.v1.Trade" + + +def test_source_body(all_types_cls): + msg = all_types_cls() + r = pxf.unmarshal_full( + b'''@proto """ +syntax = "proto3"; +message Trade { string symbol = 1; } +""" +string_field = "hi" +''', + msg, + ) + assert len(r.protos) == 1 + assert r.protos[0].shape == "source" + assert b"message Trade" in r.protos[0].body + + +def test_descriptor_body(all_types_cls): + msg = all_types_cls() + raw = b"\x0a\x05hello" + b64 = base64.standard_b64encode(raw).decode() + r = pxf.unmarshal_full( + f'@proto b"{b64}"\nstring_field = "hi"\n'.encode(), + msg, + ) + assert len(r.protos) == 1 + assert r.protos[0].shape == "descriptor" + assert r.protos[0].body == raw + + +def test_multiple_protos(all_types_cls): + msg = all_types_cls() + r = pxf.unmarshal_full( + b"""@proto trades.v1.Trade { string symbol = 1; } +@proto orders.v1.Order { string id = 1; } +string_field = "hi" +""", + msg, + ) + assert len(r.protos) == 2 + assert [p.type_name for p in r.protos] == ["trades.v1.Trade", "orders.v1.Order"] + + +def test_nested_braces_in_body(all_types_cls): + msg = all_types_cls() + r = pxf.unmarshal_full( + b"""@proto { + message Side { + string label = 1; + } + Side side = 1; +} +string_field = "hi" +""", + msg, + ) + assert len(r.protos) == 1 + body 
= r.protos[0].body + assert b"message Side" in body + assert b"Side side = 1;" in body + + +def test_rejects_bad_shape(all_types_cls): + msg = all_types_cls() + with pytest.raises(ValueError, match="@proto"): + pxf.unmarshal_full(b"@proto 42\nstring_field = \"hi\"\n", msg) + + +def test_rejects_named_missing_brace(all_types_cls): + msg = all_types_cls() + with pytest.raises(ValueError, match=r"'\{'"): + pxf.unmarshal_full( + b"@proto trades.v1.Trade 42\nstring_field = \"hi\"\n", + msg, + ) + + +def test_rejects_anonymous_unmatched_brace(all_types_cls): + msg = all_types_cls() + with pytest.raises(ValueError, match="unmatched"): + pxf.unmarshal_full(b"@proto { string symbol = 1;\n", msg) + + +@pytest.mark.parametrize( + "name", ["table", "datasource", "view", "procedure", "function", "permissions"] +) +def test_rejects_reserved_directive_names(all_types_cls, name): + """Draft §3.4.6: v1 decoders MUST reject future-allocated names.""" + msg = all_types_cls() + with pytest.raises(ValueError, match="spec-reserved"): + pxf.unmarshal_full( + f"@{name} {{ x = 1 }}\nstring_field = \"hi\"\n".encode(), + msg, + ) + + +def test_proto_directive_dataclass_shape(): + """ProtoDirective is a frozen dataclass with shape/type_name/body.""" + pd = pxf.ProtoDirective(shape="named", type_name="pkg.T", body=b"hello") + assert pd.shape == "named" + assert pd.type_name == "pkg.T" + assert pd.body == b"hello" + # Frozen — assignment raises. 
+ with pytest.raises((AttributeError, Exception)): + pd.shape = "anonymous" # type: ignore[misc] From a989c4e78e035df7b0f1f3805c07a841d50f4d3c Mon Sep 17 00:00:00 2001 From: Decoder Date: Wed, 13 May 2026 04:33:10 -0700 Subject: [PATCH 2/2] =?UTF-8?q?ci:=20bump=20PROTOWIRE=5FCPP=5FREF=20v0.75.?= =?UTF-8?q?0=20=E2=86=92=20v1.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI / publish / codeql workflows pinned the sibling cpp checkout to v0.75.0 — the Python source compiles against the v1.0 API surface (Datasets / Protos / DatasetReader) and fails against the older TableReader / Tables surface. Local builds passed because the local protowire-cpp checkout is at main (v1.0.0); CI's pinned checkout was the v0.75.0 tag. --- .github/workflows/ci.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/publish.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 814b471..73408a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ env: # PXF v0.72-series feature set (@<name> / @entry / @table directive # grammar, schema validator, Result accessors, TableReader streaming) # the Python port wraps. Bump in lockstep with cpp release cuts. - PROTOWIRE_CPP_REF: v0.75.0 + PROTOWIRE_CPP_REF: v1.0.0 jobs: # --------------------------------------------------------------------- diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 89113eb..96bc456 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -17,7 +17,7 @@ permissions: env: # See ci.yml for the rationale on this pin.
- PROTOWIRE_CPP_REF: v0.75.0 + PROTOWIRE_CPP_REF: v1.0.0 jobs: analyze: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ddef699..72781e5 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -28,7 +28,7 @@ env: # feature set (@<name> / @entry / @table grammar, schema validator, # Result accessors, TableReader streaming) that this Python port # wraps. Bump in lockstep with cpp release cuts. - PROTOWIRE_CPP_REF: v0.75.0 + PROTOWIRE_CPP_REF: v1.0.0 jobs: # ---------------------------------------------------------------------