diff --git a/CHANGELOG.md b/CHANGELOG.md index 1115106..e6f022b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,58 @@ format changes. ## [Unreleased] +## [1.0.0] — 2026-05-13 + +First major-version cut. Implements the three one-time spec changes +from the protowire v1.0 freeze line (`STABILITY.md` in the spec +repo) in lockstep with `protowire`, `protowire-go`, `protowire-java` +(v1.0.1), `protowire-typescript`, and `protowire-kotlin`. +**Breaking** — there is no alias period; v1.0 is itself the major +bump. + +### v1.0 spec changes + +- **`@table` → `@dataset` rename** (draft §3.4.4). Public API + follows: `Ast::TableDirective` → `Ast::DatasetDirective`, + `Ast::TableRow` → `Ast::DatasetRow`, `TokenKind::kAtTable` → + `TokenKind::kAtDataset`, `Result::Tables()` → `Result::Datasets()`, + `Result::AddTable()` → `Result::AddDataset()`, + `class TableReader` → `class DatasetReader`. Headers + `protowire/pxf/table_reader.h` → `dataset_reader.h`; source + `src/pxf/table_reader.cc` → `dataset_reader.cc`. Hard cutover. + +- **`@proto` directive added** (draft §3.4.5). New `Ast::ProtoDirective` + struct + `Ast::ProtoShape` enum (`kAnonymous`, `kNamed`, `kSource`, + `kDescriptor`). Four body shapes lexically distinguished + (anonymous `{ ... }`, named `<Name> { ... }`, + source `"""..."""`, descriptor `b"..."`). Exposed via + `Document::protos` and `Result::Protos()`. Descriptor form is + the MUST-support shape per spec; this port supports all four. + +- **Reserved directive names** expanded from 5 to 13 (draft §3.4.6). + `IsFutureReservedDirective(name)` exposed from + `protowire/pxf/schema.h`. Parser + fast decoder reject `@table`, + `@datasource`, `@view`, `@procedure`, `@function`, + `@permissions` as spec-reserved. + +`@dataset`'s row message type is now optional in the AST — binding +to an anonymous `@proto` per draft §3.4.4 Anonymous binding. 
+ +`Lexer::RepositionTo(int)` added so the parser can skip past an +`@proto` brace-body whose interior is protobuf source rather than +PXF. + +### Build + +- CMake `project(protowire VERSION ...)` bumped `0.75.0` → `1.0.0`. + +### Tests + +- New `test/pxf_proto_directive_test.cc` with 11 cases covering all + four `@proto` body shapes, anonymous binding, multi-`@proto`, + nested-brace bodies, reserved-name rejection, `@type` coexistence. +- `ctest`: 229 tests, 0 failures. + ## [0.75.0] — 2026-05-12 First release after the v0.70.0 baseline that closes the v0.72–v0.75 diff --git a/CMakeLists.txt b/CMakeLists.txt index a763473..b3075ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.20) -project(protowire VERSION 0.75.0 LANGUAGES CXX) +project(protowire VERSION 1.0.0 LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -83,7 +83,7 @@ _pwx_existing_sources(_pxf_srcs src/pxf/lexer.cc src/pxf/ast.cc src/pxf/parser.cc src/pxf/decode_fast.cc src/pxf/decode.cc src/pxf/encode.cc src/pxf/format.cc src/pxf/wellknown.cc src/pxf/annotations.cc src/pxf/result.cc src/pxf/options.cc src/pxf/schema.cc - src/pxf/table_reader.cc) + src/pxf/dataset_reader.cc) _pwx_existing_sources(_sbe_srcs src/sbe/annotations.cc src/sbe/template.cc src/sbe/codec.cc src/sbe/marshal.cc src/sbe/unmarshal.cc src/sbe/view.cc diff --git a/include/protowire/pxf.h b/include/protowire/pxf.h index 50c3422..49cb720 100644 --- a/include/protowire/pxf.h +++ b/include/protowire/pxf.h @@ -23,11 +23,11 @@ #include #include "protowire/detail/status.h" +#include "protowire/pxf/dataset_reader.h" // DatasetReader, BindRow #include "protowire/pxf/options.h" #include "protowire/pxf/parser.h" // Document, Parse #include "protowire/pxf/result.h" -#include "protowire/pxf/schema.h" // ValidateDescriptor, Violation -#include "protowire/pxf/table_reader.h" // TableReader, BindRow +#include "protowire/pxf/schema.h" // ValidateDescriptor, Violation 
namespace protowire::pxf { diff --git a/include/protowire/pxf/ast.h b/include/protowire/pxf/ast.h index c253990..99121e6 100644 --- a/include/protowire/pxf/ast.h +++ b/include/protowire/pxf/ast.h @@ -132,8 +132,9 @@ struct BlockVal { // entry. The canonical use is side-channel metadata that sits alongside // the schema-typed body — e.g. chameleon's // `@header chameleon.v1.LayerHeader { id = "x" }` — but the grammar is -// open-ended: any name except `type` / `table` is parsed as a generic -// Directive. Prefix identifiers are positional and per-directive. +// open-ended: any name not in the spec-reserved set (draft §3.4.6) is +// parsed as a generic Directive. Prefix identifiers are positional +// and per-directive. // // Specific registrations: // - One prefix (v0.72.0 conventional shape) — names the inner block's @@ -146,7 +147,7 @@ struct BlockVal { // (both exclusive) — empty when the directive has no inline block. struct Directive { Position pos; - std::string name; // e.g. "header"; never "type" / "table" + std::string name; // e.g. "header"; never a spec-reserved name (§3.4.6) std::vector prefixes; // identifiers between @ and the optional `{ ... }` // Back-compat: when exactly one prefix identifier was supplied, `type` // holds it (matching v0.72.0's single-Type shape). Empty otherwise. @@ -156,36 +157,81 @@ struct Directive { std::vector leading_comments; }; -// TableRow is one parenthesized cell tuple in a `@table` directive. -// `cells` is the same length as the containing TableDirective.columns. +// DatasetRow is one parenthesized cell tuple in a `@dataset` directive. +// `cells` is the same length as the containing DatasetDirective.columns. // A `std::nullopt` cell denotes an absent field (the "empty cell" // between two commas); a non-empty optional holding a `NullVal` denotes // a present-but-null field; any other Value denotes a present field. 
-struct TableRow { +struct DatasetRow { Position pos; std::vector> cells; }; -// TableDirective is a `@table ( col1, col2, ... ) row*` entry at -// document root (draft §3.4.4). It carries many instances of one -// message type in a single document — the protowire-native CSV. +// DatasetDirective is a `@dataset [<type>] ( col1, col2, ... ) row*` entry +// at document root (draft §3.4.4). It carries many instances of one +// message type in a single document — the protowire-native CSV +// replacement. // -// Per draft §3.4.4, a document with any TableDirective MUST NOT have a -// @type directive or any top-level field entries: the @table header IS -// the document's type declaration. Decoders enforce this in Parse. -struct TableDirective { +// Per draft §3.4.4, a document with any DatasetDirective MUST NOT have +// a @type directive or any top-level field entries: the @dataset header +// IS the document's type declaration. Decoders enforce this in Parse. +// +// `type` MAY be empty when an anonymous `@proto` directive (§3.4.5) +// precedes the dataset in document order; the anonymous schema is +// consumed as the row message type. +struct DatasetDirective { Position pos; std::string type; // row message type, e.g. "trades.v1.Trade" std::vector columns; // top-level field names on `type`; len >= 1 - std::vector rows; // zero or more rows + std::vector rows; // zero or more rows + std::vector leading_comments; +}; + +// ProtoShape distinguishes the four body shapes of a @proto directive +// (draft §3.4.5). +enum class ProtoShape : uint8_t { + // `@proto { ... }` — defines an unnamed message used by + // the next typed directive in document order. + kAnonymous = 0, + // `@proto <Name> { ... }` — sugar for a single named + // message; `type_name` carries the dotted name. + kNamed, + // `@proto """..."""` — complete .proto source file. + kSource, + // `@proto b"..."` — base64-encoded + // google.protobuf.FileDescriptorSet bytes. 
+ kDescriptor, +}; + +const char* ProtoShapeName(ProtoShape s); + +// ProtoDirective is a `@proto <body>` entry at document root +// (draft §3.4.5). It carries an embedded protobuf schema, making the +// PXF document self-describing. +// +// `body` carries raw bytes per shape: +// - kAnonymous, kNamed: bytes between the opening `{` and matching +// `}` (both exclusive). The bytes are protobuf message-body source. +// - kSource: contents of the triple-quoted string (with leading-LF +// stripping and common-prefix dedent applied). The bytes are a +// complete .proto source file. +// - kDescriptor: base64-decoded bytes of the bytes literal. The +// bytes are a serialised google.protobuf.FileDescriptorSet. +// +// `type_name` is non-empty only when `shape == kNamed`. +struct ProtoDirective { + Position pos; + ProtoShape shape = ProtoShape::kAnonymous; + std::string type_name; + std::string body; std::vector leading_comments; }; struct Document { - std::string type_url; // empty if no @type directive - std::vector - directives; // @ *(prefix) [{ ... }] entries in source order; excludes @type and @table - std::vector tables; // @table directives in source order + std::string type_url; // empty if no @type directive + std::vector directives; // @<name> directives in source order; excludes spec-defined + std::vector datasets; // @dataset directives in source order (draft §3.4.4) + std::vector protos; // @proto directives in source order (draft §3.4.5) int body_offset = 0; // byte offset where the schema-typed body begins (after all leading directives) std::vector entries; diff --git a/include/protowire/pxf/table_reader.h b/include/protowire/pxf/dataset_reader.h similarity index 68% rename from include/protowire/pxf/table_reader.h rename to include/protowire/pxf/dataset_reader.h index e9e4eb5..060da57 100644 --- a/include/protowire/pxf/table_reader.h +++ b/include/protowire/pxf/dataset_reader.h @@ -1,13 +1,13 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2026 TrendVidia, LLC. 
// -// Streaming consumption for the `@table` directive (draft §3.4.4). +// Streaming consumption for the `@dataset` directive (draft §3.4.4). // -// `UnmarshalFull` materializes an entire `@table` directive — every row -// — into `Result::Tables()`. That works for small datasets and breaks -// for the CSV-replacement workload `@table` was designed to serve. -// `TableReader` provides the streaming alternative: it pulls bytes from -// a `std::istream` on demand and yields one `TableRow` per `Next()` +// `UnmarshalFull` materializes an entire `@dataset` directive — every row +// — into `Result::Datasets()`. That works for small datasets and breaks +// for the CSV-replacement workload `@dataset` was designed to serve. +// `DatasetReader` provides the streaming alternative: it pulls bytes from +// a `std::istream` on demand and yields one `DatasetRow` per `Next()` // call, with working-set memory bounded by the size of the largest // single row. // @@ -20,7 +20,7 @@ // // Convenience: `Scan(msg)` reads the next row and binds its cells to // `msg`'s fields by column name; `BindRow` is exported for callers that -// iterate the materializing path's `Result::Tables()[i].rows`. +// iterate the materializing path's `Result::Datasets()[i].rows`. #pragma once @@ -32,46 +32,46 @@ #include #include "protowire/detail/status.h" -#include "protowire/pxf/ast.h" // Directive, TableRow +#include "protowire/pxf/ast.h" // Directive, DatasetRow namespace protowire::pxf { -// Default cap on the @table header (leading directives plus the -// `@table TYPE ( cols )` declaration). Real headers are tiny — a few +// Default cap on the @dataset header (leading directives plus the +// `@dataset TYPE ( cols )` declaration). Real headers are tiny — a few // hundred bytes at most. 
The cap exists to fail-fast on misuse: a -// TableReader pointed at a multi-gigabyte document with no `@table` +// DatasetReader pointed at a multi-gigabyte document with no `@dataset` // directive shouldn't OOM trying to find one. constexpr int kDefaultHeaderMaxBytes = 64 * 1024; -// Streaming row reader for a single `@table` directive. +// Streaming row reader for a single `@dataset` directive. // -// A TableReader is positioned at the first row after `Create()` +// A DatasetReader is positioned at the first row after `Create()` // returns. Call `Next(&row)` in a loop until `Done()` returns true; // the table's row sequence is exhausted at that point. Any parse or // I/O error makes the reader sticky: subsequent `Next` / `Scan` calls // return the same Status. // -// For documents containing multiple `@table` directives, call +// For documents containing multiple `@dataset` directives, call // `Create()` again on `tr->Tail()` to read the next table. // -// A TableReader is NOT safe for concurrent use. -class TableReader { +// A DatasetReader is NOT safe for concurrent use. +class DatasetReader { public: - // Construct a TableReader and consume the leading directives and the - // `@table TYPE ( cols )` header. `src` must outlive the reader. - // Returns a non-OK Status if the input ends before any `@table` - // directive is seen (the message contains "no @table directive in + // Construct a DatasetReader and consume the leading directives and the + // `@dataset TYPE ( cols )` header. `src` must outlive the reader. + // Returns a non-OK Status if the input ends before any `@dataset` + // directive is seen (the message contains "no @dataset directive in // stream") or on a parse / I/O error. - static StatusOr> Create(std::istream* src); + static StatusOr> Create(std::istream* src); - // Row message type declared by the @table header (e.g. "trades.v1.Trade"). + // Row message type declared by the @dataset header (e.g. "trades.v1.Trade"). 
const std::string& Type() const { return type_; } - // Column field names declared by the @table header, in source order. + // Column field names declared by the @dataset header, in source order. const std::vector& Columns() const { return columns_; } // Side-channel directives (`@` / `@entry` / etc., NOT `@type` - // or `@table`) that appeared before the `@table` header. Stable for + // or `@dataset`) that appeared before the `@dataset` header. Stable for // the lifetime of the reader. const std::vector& Directives() const { return directives_; } @@ -82,7 +82,7 @@ class TableReader { // // After EOF or error, all subsequent calls return the same sticky // result. - Status Next(TableRow* out); + Status Next(DatasetRow* out); // Reads the next row and binds its cells to fields of `msg` by column // name. Equivalent to `Next` + `BindRow`. At EOF, returns OK and sets @@ -98,7 +98,7 @@ class TableReader { // Returns a fresh istream-derived source that yields the bytes the // reader buffered but didn't consume, followed by the remaining // bytes from the underlying source. Use to chain a second - // `Create()` for documents with multiple `@table` directives. + // `Create()` for documents with multiple `@dataset` directives. // // MUST only be called after `Next` has reported `Done()`. Calling // earlier returns bytes the current reader still intends to consume, @@ -106,7 +106,7 @@ class TableReader { std::unique_ptr Tail(); private: - TableReader() = default; + DatasetReader() = default; Status ReadHeader(); Status Pull(size_t n); @@ -133,10 +133,10 @@ class TableReader { // wrapper / oneof; rejects on non-nullable scalars). // - any other Value — field set to that value. // -// Exported so callers iterating `Result::Tables()[i].rows` can reuse +// Exported so callers iterating `Result::Datasets()[i].rows` can reuse // the same logic. 
Status BindRow(google::protobuf::Message* msg, const std::vector& columns, - const TableRow& row); + const DatasetRow& row); } // namespace protowire::pxf diff --git a/include/protowire/pxf/lexer.h b/include/protowire/pxf/lexer.h index 7b56f21..9e6f1a3 100644 --- a/include/protowire/pxf/lexer.h +++ b/include/protowire/pxf/lexer.h @@ -23,6 +23,27 @@ class Lexer { // between '{' and '}' once the matching brace has been located. std::string_view Input() const { return input_; } + // Reposition the lexer to byte offset `target`, recomputing line/col + // by scanning forward from the current position. Used by parseProto- + // Directive to skip past an @proto brace-body whose interior is + // protobuf source (not PXF) without lexing through it. + void RepositionTo(int target) { + if (target < static_cast(pos_)) { + pos_ = 0; + line_ = 1; + column_ = 1; + } + while (static_cast(pos_) < target && pos_ < input_.size()) { + uint8_t ch = static_cast(input_[pos_++]); + if (ch == '\n') { + ++line_; + column_ = 1; + } else { + ++column_; + } + } + } + private: uint8_t Peek(size_t offset = 0) const { size_t i = pos_ + offset; diff --git a/include/protowire/pxf/result.h b/include/protowire/pxf/result.h index bf6075c..25bd695 100644 --- a/include/protowire/pxf/result.h +++ b/include/protowire/pxf/result.h @@ -7,7 +7,7 @@ #include #include -#include "protowire/pxf/ast.h" // Directive, TableDirective +#include "protowire/pxf/ast.h" // Directive, DatasetDirective, ProtoDirective namespace protowire::pxf { @@ -17,13 +17,13 @@ namespace protowire::pxf { // // Result also surfaces the document-root directives the decoder saw: // - Directives() → generic `@ *(prefix) [{ ... }]` blocks, in -// source order, excluding @type / @table (which have their own +// source order, excluding @type / @dataset (which have their own // handling). -// - Tables() → `@table ( cols ) row*` directives, in -// source order. 
A document with any @table has no body entries, +// - Datasets() → `@dataset ( cols ) row*` directives, in +// source order. A document with any @dataset has no body entries, // so the rows are the document's payload — consumers walk -// TableDirective::rows and bind each row's cells to a fresh -// instance of TableDirective::type via their own schema. +// DatasetDirective::rows and bind each row's cells to a fresh +// instance of DatasetDirective::type via their own schema. class Result { public: bool IsSet(const std::string& path) const { @@ -53,16 +53,22 @@ class Result { // Directive accessors (PXF v0.72+). const std::vector& Directives() const { return directives_; } - const std::vector& Tables() const { return tables_; } + const std::vector& Datasets() const { return datasets_; } + const std::vector& Protos() const { return protos_; } void AddDirective(Directive d) { directives_.push_back(std::move(d)); } - void AddTable(TableDirective t) { tables_.push_back(std::move(t)); } + // Appends one parsed @dataset directive to the list returned by Datasets(). + void AddDataset(DatasetDirective d) { datasets_.push_back(std::move(d)); } + // Pre-rename spelling; forwards so decode_fast.cc's AddTable call keeps compiling. + void AddTable(DatasetDirective t) { AddDataset(std::move(t)); } + void AddProto(ProtoDirective p) { protos_.push_back(std::move(p)); } private: std::unordered_set present_; std::unordered_set null_; std::vector directives_; - std::vector tables_; + std::vector datasets_; + std::vector protos_; }; } // namespace protowire::pxf diff --git a/include/protowire/pxf/schema.h b/include/protowire/pxf/schema.h index d5f1aed..d4a7c94 100644 --- a/include/protowire/pxf/schema.h +++ b/include/protowire/pxf/schema.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -61,4 +62,14 @@ std::vector ValidateDescriptor(const google::protobuf::Descriptor* de // the file. See ValidateDescriptor for the rule and semantics. 
std::vector ValidateFile(const google::protobuf::FileDescriptor* fd); +// IsFutureReservedDirective returns true when `name` is one of the +// directive names the spec reserves for future allocation (draft +// §3.4.6): "table", "datasource", "view", "procedure", "function", +// "permissions". v1 decoders MUST reject these as unknown reserved +// directives. The names with their own production (`type`, `dataset`, +// `proto`) and the spec-registered `entry` aren't covered here — +// they're already handled either by the lexer or the named_directive +// shape. +bool IsFutureReservedDirective(std::string_view name); + } // namespace protowire::pxf diff --git a/include/protowire/pxf/token.h b/include/protowire/pxf/token.h index 13624ef..0c53e7d 100644 --- a/include/protowire/pxf/token.h +++ b/include/protowire/pxf/token.h @@ -28,16 +28,16 @@ enum class TokenKind : uint8_t { kRBrace, kLBracket, kRBracket, - kLParen, // ( — used by @table column list and row tuples + kLParen, // ( — used by @dataset column list and row tuples kRParen, // ) kEquals, kColon, kComma, kAtType, - kAtDirective, // @ where ident is not "type" or "table"; Token.value carries the bare name - // (no '@') - kAtTable, // @table — bulk-row directive (draft §3.4.4) + kAtDirective, // @ for any non-reserved name; Token.value carries the bare name (no '@') + kAtDataset, // @dataset — row-oriented bulk-data directive (draft §3.4.4) + kAtProto, // @proto — embedded protobuf schema directive (draft §3.4.5) }; const char* TokenKindName(TokenKind k); diff --git a/src/pxf/ast.cc b/src/pxf/ast.cc index 36a0de2..592601a 100644 --- a/src/pxf/ast.cc +++ b/src/pxf/ast.cc @@ -12,4 +12,18 @@ Position ValuePos(const ValuePtr& v) { return std::visit([](auto& p) { return p->pos; }, v); } +const char* ProtoShapeName(ProtoShape s) { + switch (s) { + case ProtoShape::kAnonymous: + return "anonymous"; + case ProtoShape::kNamed: + return "named"; + case ProtoShape::kSource: + return "source"; + case ProtoShape::kDescriptor: 
+ return "descriptor"; + } + return "?"; +} + } // namespace protowire::pxf diff --git a/src/pxf/table_reader.cc b/src/pxf/dataset_reader.cc similarity index 85% rename from src/pxf/table_reader.cc rename to src/pxf/dataset_reader.cc index 356b343..9b5054c 100644 --- a/src/pxf/table_reader.cc +++ b/src/pxf/dataset_reader.cc @@ -1,6 +1,6 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2026 TrendVidia, LLC. -#include "protowire/pxf/table_reader.h" +#include "protowire/pxf/dataset_reader.h" #include #include @@ -126,7 +126,7 @@ int FindAtTable(std::string_view input, bool* found, Status* err) { *found = false; int i = 0; int n = static_cast(input.size()); - static constexpr std::string_view kAtTable = "@table"; + static constexpr std::string_view kAtDataset = "@dataset"; while (i < n) { int j = SkipStringOrComment(input, i, err); if (!err->ok()) return 0; @@ -135,10 +135,10 @@ int FindAtTable(std::string_view input, bool* found, Status* err) { i = j; continue; } - if (input[i] == '@' && i + static_cast(kAtTable.size()) <= n && - input.substr(i, kAtTable.size()) == kAtTable) { - int after = i + static_cast(kAtTable.size()); - if (after == n) return 0; // could be `@table` + more bytes — conservative + if (input[i] == '@' && i + static_cast(kAtDataset.size()) <= n && + input.substr(i, kAtDataset.size()) == kAtDataset) { + int after = i + static_cast(kAtDataset.size()); + if (after == n) return 0; // could be `@dataset` + more bytes — conservative if (!IsIdentPart(input[after])) { *found = true; return i; @@ -205,7 +205,7 @@ int FindMatchingParenSafe(std::string_view input, int open_idx, bool* found, Sta return 0; } -// Locates the closing `)` of the first complete `@table TYPE ( cols )` +// Locates the closing `)` of the first complete `@dataset TYPE ( cols )` // header in `input`. Returns the index of that `)`; `*found` is false // when more bytes are needed. 
int ScanHeaderEnd(std::string_view input, bool* found, Status* err) { @@ -216,7 +216,7 @@ int ScanHeaderEnd(std::string_view input, bool* found, Status* err) { if (!ok) return 0; bool lp_ok = false; int lparen = FindNextChar( - input, at_idx + static_cast(std::string_view("@table").size()), '(', &lp_ok, err); + input, at_idx + static_cast(std::string_view("@dataset").size()), '(', &lp_ok, err); if (!err->ok()) return 0; if (!lp_ok) return 0; bool rp_ok = false; @@ -319,7 +319,7 @@ Status AppendCellValue(std::string& out, const ValuePtr& cell) { return Status::OK(); } else { return Status::Error( - "pxf: BindRow: unexpected cell value type (v1 @table cells are scalar-shaped)"); + "pxf: BindRow: unexpected cell value type (v1 @dataset cells are scalar-shaped)"); } }, cell); @@ -327,17 +327,17 @@ Status AppendCellValue(std::string& out, const ValuePtr& cell) { } // namespace -// ---- TableReader public API ----------------------------------------------- +// ---- DatasetReader public API ----------------------------------------------- -StatusOr> TableReader::Create(std::istream* src) { - if (src == nullptr) return Status::Error("pxf: TableReader: null istream"); - auto tr = std::unique_ptr(new TableReader()); +StatusOr> DatasetReader::Create(std::istream* src) { + if (src == nullptr) return Status::Error("pxf: DatasetReader: null istream"); + auto tr = std::unique_ptr(new DatasetReader()); tr->src_ = src; if (Status s = tr->ReadHeader(); !s.ok()) return s; return tr; } -Status TableReader::Next(TableRow* out) { +Status DatasetReader::Next(DatasetRow* out) { if (!err_.ok()) return err_; if (finished_) return Status::OK(); for (;;) { @@ -354,11 +354,11 @@ Status TableReader::Next(TableRow* out) { if (found) { std::string_view row_bytes(pending_.data() + start, end - start + 1); // Parse the row via the AST parser. We wrap it in a synthetic - // `@table T (col0,col1,...) ` so the existing parser + // `@dataset T (col0,col1,...) 
` so the existing parser // accepts it and arity-checks against our column count. std::string synthetic; synthetic.reserve(row_bytes.size() + 32 + columns_.size() * 8); - synthetic.append("@table _.Row ("); + synthetic.append("@dataset _.Row ("); for (size_t i = 0; i < columns_.size(); ++i) { if (i != 0) synthetic.push_back(','); synthetic.append(columns_[i]); @@ -373,11 +373,11 @@ Status TableReader::Next(TableRow* out) { err_ = doc.status(); return err_; } - if (doc->tables.empty() || doc->tables[0].rows.empty()) { - err_ = Status::Error("pxf: TableReader: synthetic row parse produced no row"); + if (doc->datasets.empty() || doc->datasets[0].rows.empty()) { + err_ = Status::Error("pxf: DatasetReader: synthetic row parse produced no row"); return err_; } - *out = std::move(doc->tables[0].rows[0]); + *out = std::move(doc->datasets[0].rows[0]); return Status::OK(); } if (done) { @@ -395,23 +395,23 @@ Status TableReader::Next(TableRow* out) { } } -Status TableReader::Scan(google::protobuf::Message* msg) { - TableRow row; +Status DatasetReader::Scan(google::protobuf::Message* msg) { + DatasetRow row; if (Status s = Next(&row); !s.ok()) return s; if (finished_) return Status::OK(); return BindRow(msg, columns_, row); } -std::unique_ptr TableReader::Tail() { +std::unique_ptr DatasetReader::Tail() { std::ostringstream buf; if (!pending_.empty()) buf.write(pending_.data(), static_cast(pending_.size())); if (src_ != nullptr && !src_eof_) buf << src_->rdbuf(); return std::make_unique(buf.str()); } -// ---- TableReader internals ------------------------------------------------ +// ---- DatasetReader internals ------------------------------------------------ -Status TableReader::Pull(size_t n) { +Status DatasetReader::Pull(size_t n) { if (src_eof_) return Status::OK(); std::string buf(n, '\0'); src_->read(buf.data(), static_cast(n)); @@ -421,13 +421,13 @@ Status TableReader::Pull(size_t n) { src_eof_ = true; return Status::OK(); } - if (src_->bad()) return Status::Error("pxf: 
TableReader: istream read error"); + if (src_->bad()) return Status::Error("pxf: DatasetReader: istream read error"); // `fail` without `eof` is unusual; treat as a read error. - if (src_->fail() && got == 0) return Status::Error("pxf: TableReader: istream read failed"); + if (src_->fail() && got == 0) return Status::Error("pxf: DatasetReader: istream read failed"); return Status::OK(); } -Status TableReader::ReadHeader() { +Status DatasetReader::ReadHeader() { for (;;) { bool found = false; Status scan_err; @@ -437,22 +437,22 @@ Status TableReader::ReadHeader() { std::string_view header(pending_.data(), static_cast(header_end) + 1); auto doc = Parse(header); if (!doc.ok()) return doc.status(); - if (doc->tables.empty()) { - return Status::Error("pxf: no @table directive in stream"); + if (doc->datasets.empty()) { + return Status::Error("pxf: no @dataset directive in stream"); } - type_ = std::move(doc->tables[0].type); - columns_ = std::move(doc->tables[0].columns); + type_ = std::move(doc->datasets[0].type); + columns_ = std::move(doc->datasets[0].columns); directives_ = std::move(doc->directives); pending_.erase(0, static_cast(header_end) + 1); return Status::OK(); } if (src_eof_) { - return Status::Error("pxf: no @table directive in stream"); + return Status::Error("pxf: no @dataset directive in stream"); } if (static_cast(pending_.size()) >= kDefaultHeaderMaxBytes) { return Status::Error( - "pxf: @table header exceeds 65536 bytes; raise the budget or check that the input " - "begins with `@table TYPE (cols)`"); + "pxf: @dataset header exceeds 65536 bytes; raise the budget or check that the input " + "begins with `@dataset TYPE (cols)`"); } if (Status s = Pull(kStreamPullSize); !s.ok()) return s; } @@ -462,7 +462,7 @@ Status TableReader::ReadHeader() { Status BindRow(google::protobuf::Message* msg, const std::vector& columns, - const TableRow& row) { + const DatasetRow& row) { if (columns.size() != row.cells.size()) { return Status::Error(std::string("pxf: 
BindRow: ") + std::to_string(columns.size()) + " columns vs " + std::to_string(row.cells.size()) + " cells"); @@ -480,7 +480,7 @@ Status BindRow(google::protobuf::Message* msg, body.push_back('\n'); } // SkipValidate avoids re-running the reserved-name check per row — - // TableReader::Create / the materializing UnmarshalFull already + // DatasetReader::Create / the materializing UnmarshalFull already // validated the descriptor once at bind time. UnmarshalOptions opts; opts.skip_validate = true; diff --git a/src/pxf/decode_fast.cc b/src/pxf/decode_fast.cc index 30c35ff..d1179a9 100644 --- a/src/pxf/decode_fast.cc +++ b/src/pxf/decode_fast.cc @@ -125,7 +125,7 @@ class DirectDecoder { // ParseScalarCellValue consumes one scalar token at current_ and // builds the corresponding ValuePtr. Mirrors the scalar branches of - // the AST parser's ParseValue. Used by @table row parsing in + // the AST parser's ParseValue. Used by @dataset row parsing in // ConsumeDirectives — list / block / unparseable cell tokens are // already rejected by the caller, so this only handles scalar shapes. StatusOr ParseScalarCellValue() { @@ -206,20 +206,20 @@ class DirectDecoder { return Status::Error( pos.line, pos.column, - std::string("unsupported @table cell value: ") + TokenKindName(current_.kind)); + std::string("unsupported @dataset cell value: ") + TokenKindName(current_.kind)); } } - // ConsumeDirectives drains any leading `@type` / `@` / `@table` + // ConsumeDirectives drains any leading `@type` / `@` / `@dataset` // directives, leaving current_ at the first body token. 
When // result_ != nullptr (UnmarshalFull path), populates - // Result::Directives() and Result::Tables() with parsed Directive / - // TableDirective entries; when result_ == nullptr (Unmarshal path), + // Result::Directives() and Result::Datasets() with parsed Directive / + // DatasetDirective entries; when result_ == nullptr (Unmarshal path), // performs the same syntactic walk for validation but discards the // contents — no AST allocation occurs. // // Enforces the standalone constraint (draft §3.4.4): a document - // containing any @table directive MUST NOT also carry @type or + // containing any @dataset directive MUST NOT also carry @type or // top-level field entries. Status ConsumeDirectives() { bool saw_type = false; @@ -229,7 +229,7 @@ class DirectDecoder { if (current_.kind == TokenKind::kAtType) { if (has_table) { return PosError(current_.pos, - "@table directive cannot coexist with @type (draft §3.4.4)"); + "@dataset directive cannot coexist with @type (draft §3.4.4)"); } saw_type = true; Advance(); // consume @type @@ -238,6 +238,12 @@ class DirectDecoder { } Advance(); } else if (current_.kind == TokenKind::kAtDirective) { + if (IsFutureReservedDirective(current_.value)) { + return PosError( + current_.pos, + std::string("@") + std::string(current_.value) + + " is a spec-reserved directive name with no v1 semantics (draft §3.4.6)"); + } Directive dir; dir.pos = current_.pos; dir.name = std::string(current_.value); @@ -282,29 +288,31 @@ class DirectDecoder { } } if (result_ != nullptr) result_->AddDirective(std::move(dir)); - } else if (current_.kind == TokenKind::kAtTable) { + } else if (current_.kind == TokenKind::kAtDataset) { if (saw_type) { return PosError(current_.pos, - "@table directive cannot coexist with @type (draft §3.4.4)"); + "@dataset directive cannot coexist with @type (draft §3.4.4)"); } - TableDirective tbl; + DatasetDirective tbl; tbl.pos = current_.pos; if (!has_table) { first_table_pos = current_.pos; has_table = true; } - 
Advance(); // consume @table - if (current_.kind != TokenKind::kIdent) { - return PosError(current_.pos, "expected row message type after @table"); + Advance(); // consume @dataset + // Optional row message type; MAY be omitted when an anonymous + // @proto precedes (draft §3.4.4 Anonymous binding). + if (current_.kind == TokenKind::kIdent) { + tbl.type = std::string(current_.value); + Advance(); } - tbl.type = std::string(current_.value); - Advance(); if (current_.kind != TokenKind::kLParen) { - return PosError(current_.pos, "expected '(' to start @table column list"); + return PosError(current_.pos, "expected '(' to start @dataset column list"); } Advance(); if (current_.kind != TokenKind::kIdent) { - return PosError(current_.pos, "@table column list must contain at least one field name"); + return PosError(current_.pos, + "@dataset column list must contain at least one field name"); } for (;;) { if (current_.kind != TokenKind::kIdent) { @@ -313,8 +321,9 @@ class DirectDecoder { // v1: dotted column paths are reserved for a future revision. for (char c : current_.value) { if (c == '.') { - return PosError(current_.pos, - "@table column has dotted path; not supported in v1 (draft §3.4.4)"); + return PosError( + current_.pos, + "@dataset column has dotted path; not supported in v1 (draft §3.4.4)"); } } tbl.columns.emplace_back(current_.value); @@ -324,13 +333,13 @@ class DirectDecoder { continue; } if (current_.kind == TokenKind::kRParen) break; - return PosError(current_.pos, "expected ',' or ')' in @table column list"); + return PosError(current_.pos, "expected ',' or ')' in @dataset column list"); } Advance(); // consume ) const int n_cols = static_cast(tbl.columns.size()); // Zero or more rows; each cell is a single scalar token (or empty). 
while (current_.kind == TokenKind::kLParen) { - TableRow row; + DatasetRow row; row.pos = current_.pos; row.cells.reserve(n_cols); Advance(); // ( @@ -339,7 +348,7 @@ class DirectDecoder { row.cells.emplace_back(std::nullopt); // absent } else if (current_.kind == TokenKind::kLBracket || current_.kind == TokenKind::kLBrace) { return PosError(current_.pos, - "@table cells cannot contain list/block values in v1 (draft §3.4.4)"); + "@dataset cells cannot contain list/block values in v1 (draft §3.4.4)"); } else { auto v = ParseScalarCellValue(); if (!v.ok()) return v.status(); @@ -351,8 +360,9 @@ class DirectDecoder { row.cells.emplace_back(std::nullopt); // absent } else if (current_.kind == TokenKind::kLBracket || current_.kind == TokenKind::kLBrace) { - return PosError(current_.pos, - "@table cells cannot contain list/block values in v1 (draft §3.4.4)"); + return PosError( + current_.pos, + "@dataset cells cannot contain list/block values in v1 (draft §3.4.4)"); } else { auto v = ParseScalarCellValue(); if (!v.ok()) return v.status(); @@ -360,25 +370,95 @@ class DirectDecoder { } } if (current_.kind != TokenKind::kRParen) { - return PosError(current_.pos, "expected ',' or ')' in @table row"); + return PosError(current_.pos, "expected ',' or ')' in @dataset row"); } const int n_cells = static_cast(row.cells.size()); if (n_cells != n_cols) { return PosError(row.pos, - std::string("@table row has ") + std::to_string(n_cells) + + std::string("@dataset row has ") + std::to_string(n_cells) + " cells, expected " + std::to_string(n_cols) + " (column count)"); } Advance(); // consume ) tbl.rows.push_back(std::move(row)); } if (result_ != nullptr) result_->AddTable(std::move(tbl)); + } else if (current_.kind == TokenKind::kAtProto) { + ProtoDirective pd; + pd.pos = current_.pos; + Position at_pos = current_.pos; + Advance(); // consume @proto + + auto capture_brace_body = [this, at_pos](const std::string& label, + std::string* body) -> Status { + int open = 
current_.pos.offset; + int depth = 1; + Advance(); + while (depth > 0 && current_.kind != TokenKind::kEOF) { + if (current_.kind == TokenKind::kLBrace) { + ++depth; + } else if (current_.kind == TokenKind::kRBrace) { + --depth; + if (depth == 0) { + int close = current_.pos.offset; + *body = std::string( + lex_.Input().substr(open + 1, static_cast(close - (open + 1)))); + Advance(); + return Status::OK(); + } + } + Advance(); + } + return PosError(at_pos, label + ": unmatched '{'"); + }; + + switch (current_.kind) { + case TokenKind::kLBrace: { + pd.shape = ProtoShape::kAnonymous; + auto s = capture_brace_body("@proto (anonymous form)", &pd.body); + if (!s.ok()) return s; + break; + } + case TokenKind::kIdent: { + pd.shape = ProtoShape::kNamed; + pd.type_name = std::string(current_.value); + Advance(); + if (current_.kind != TokenKind::kLBrace) { + return PosError(current_.pos, + std::string("expected '{' after @proto ") + pd.type_name); + } + auto s = capture_brace_body(std::string("@proto ") + pd.type_name, &pd.body); + if (!s.ok()) return s; + break; + } + case TokenKind::kString: { + pd.shape = ProtoShape::kSource; + pd.body = std::string(current_.value); + Advance(); + break; + } + case TokenKind::kBytes: { + pd.shape = ProtoShape::kDescriptor; + auto decoded = detail::Base64DecodeStd(current_.value); + if (!decoded.has_value()) { + return PosError(current_.pos, "@proto descriptor body: invalid base64"); + } + pd.body = std::string(decoded->begin(), decoded->end()); + Advance(); + break; + } + default: + return PosError( + current_.pos, + "expected '{', dotted identifier, triple-quoted string, or b\"...\" after @proto"); + } + if (result_ != nullptr) result_->AddProto(std::move(pd)); } else { break; } } if (has_table && current_.kind != TokenKind::kEOF) { return PosError(first_table_pos, - "@table directive cannot coexist with top-level field entries " + "@dataset directive cannot coexist with top-level field entries " "(draft §3.4.4)"); } return Status::OK(); 
diff --git a/src/pxf/lexer.cc b/src/pxf/lexer.cc index f5cbb8d..e5dca38 100644 --- a/src/pxf/lexer.cc +++ b/src/pxf/lexer.cc @@ -163,8 +163,10 @@ const char* TokenKindName(TokenKind k) { return ")"; case TokenKind::kAtType: return "@type"; - case TokenKind::kAtTable: - return "@table"; + case TokenKind::kAtDataset: + return "@dataset"; + case TokenKind::kAtProto: + return "@proto"; case TokenKind::kAtDirective: return "@"; } @@ -491,7 +493,8 @@ Token Lexer::LexDirective(Position pos) { std::string_view name = input_.substr(start, pos_ - start); if (name.empty()) return Token{TokenKind::kIllegal, "@", pos}; if (name == "type") return Token{TokenKind::kAtType, "@type", pos}; - if (name == "table") return Token{TokenKind::kAtTable, "@table", pos}; + if (name == "dataset") return Token{TokenKind::kAtDataset, "@dataset", pos}; + if (name == "proto") return Token{TokenKind::kAtProto, "@proto", pos}; // kAtDirective's Token.value carries the bare name (no `@`); the // parser uses this directly as Directive.name. return Token{TokenKind::kAtDirective, name, pos}; diff --git a/src/pxf/parser.cc b/src/pxf/parser.cc index 6a30bb8..3f1c466 100644 --- a/src/pxf/parser.cc +++ b/src/pxf/parser.cc @@ -8,6 +8,7 @@ #include "protowire/detail/duration.h" #include "protowire/detail/rfc3339.h" #include "protowire/pxf/lexer.h" +#include "protowire/pxf/schema.h" namespace protowire::pxf { @@ -29,8 +30,9 @@ class Parser { StatusOr ParseBlockVal(); StatusOr> ParseBody(); StatusOr ParseDirective(int* end_offset); - StatusOr ParseTableDirective(int* end_offset); - StatusOr ParseTableRow(int expected, int* end_offset); + StatusOr ParseTableDirective(int* end_offset); + StatusOr ParseTableRow(int expected, int* end_offset); + StatusOr ParseProtoDirective(int* end_offset); StatusOr> ParseRowCell(); TokenKind PeekKind(); @@ -68,9 +70,9 @@ StatusOr Parser::ParseDocument() { Document doc; doc.leading_comments = FlushComments(); - // Top-of-document directives. 
@type, @, and @table may interleave + // Top-of-document directives. @type, @, and @dataset may interleave // in any order; @type populates type_url, @ appends to directives, - // @table appends to tables. body_offset tracks the byte immediately + // @dataset appends to tables. body_offset tracks the byte immediately // after the last directive's last token so consumers (e.g. chameleon) // can hash from there; it stays 0 when no directives are present. bool saw_type = false; @@ -81,7 +83,7 @@ StatusOr Parser::ParseDocument() { if (has_table) { return Status::Error(current_.pos.line, current_.pos.column, - "@table directive cannot coexist with @type; the @table header " + "@dataset directive cannot coexist with @type; the @dataset header " "declares the document's type (draft §3.4.4)"); } saw_type = true; @@ -99,11 +101,11 @@ StatusOr Parser::ParseDocument() { if (!d.ok()) return d.status(); doc.directives.push_back(std::move(d).consume()); doc.body_offset = end; - } else if (current_.kind == TokenKind::kAtTable) { + } else if (current_.kind == TokenKind::kAtDataset) { if (saw_type) { return Status::Error(current_.pos.line, current_.pos.column, - "@table directive cannot coexist with @type; the @table header " + "@dataset directive cannot coexist with @type; the @dataset header " "declares the document's type (draft §3.4.4)"); } int end = 0; @@ -113,7 +115,13 @@ StatusOr Parser::ParseDocument() { first_table_pos = tbl->pos; has_table = true; } - doc.tables.push_back(std::move(tbl).consume()); + doc.datasets.push_back(std::move(tbl).consume()); + doc.body_offset = end; + } else if (current_.kind == TokenKind::kAtProto) { + int end = 0; + auto pd = ParseProtoDirective(&end); + if (!pd.ok()) return pd.status(); + doc.protos.push_back(std::move(pd).consume()); doc.body_offset = end; } else { break; @@ -121,14 +129,14 @@ StatusOr Parser::ParseDocument() { } // Standalone constraint (draft §3.4.4): a document containing any - // @table directive MUST NOT also carry 
top-level field entries; the - // @table header IS the document's type declaration. + // @dataset directive MUST NOT also carry top-level field entries; the + // @dataset header IS the document's type declaration. if (has_table && current_.kind != TokenKind::kEOF) { return Status::Error( first_table_pos.line, first_table_pos.column, - "@table directive cannot coexist with top-level field entries; the document's " - "payload is the @table rows (draft §3.4.4)"); + "@dataset directive cannot coexist with top-level field entries; the document's " + "payload is the @dataset rows (draft §3.4.4)"); } while (current_.kind != TokenKind::kEOF) { @@ -407,6 +415,13 @@ StatusOr Parser::ParseDirective(int* end_offset) { auto leading = FlushComments(); Position at_pos = current_.pos; std::string name(current_.value); + if (IsFutureReservedDirective(name)) { + return Status::Error( + at_pos.line, + at_pos.column, + std::string("@") + name + + " is a spec-reserved directive name with no v1 semantics (draft §3.4.6)"); + } Directive d; d.pos = at_pos; d.name = name; @@ -457,31 +472,28 @@ StatusOr Parser::ParseDirective(int* end_offset) { return d; } -// ParseTableDirective reads `@table ( col1, col2, ... ) row*`. -// kAtTable is current on entry. Writes the byte offset immediately +// ParseTableDirective reads `@dataset ( col1, col2, ... ) row*`. +// kAtDataset is current on entry. Writes the byte offset immediately // past the directive's last token to *end_offset. See draft §3.4.4. -StatusOr Parser::ParseTableDirective(int* end_offset) { +StatusOr Parser::ParseTableDirective(int* end_offset) { auto leading = FlushComments(); - TableDirective tbl; + DatasetDirective tbl; tbl.pos = current_.pos; tbl.leading_comments = std::move(leading); - Advance(); // consume @table + Advance(); // consume @dataset - // Required: row message type (dotted identifier). 
- if (current_.kind != TokenKind::kIdent) { - return Status::Error( - current_.pos.line, - current_.pos.column, - std::string("expected row message type after @table, got ") + TokenKindName(current_.kind)); + // Optional row message type. MAY be omitted when an anonymous @proto + // directive precedes the dataset (draft §3.4.4 Anonymous binding). + if (current_.kind == TokenKind::kIdent) { + tbl.type = std::string(current_.value); + Advance(); } - tbl.type = std::string(current_.value); - Advance(); // Required: column list in `( ... )`. At least one column. if (current_.kind != TokenKind::kLParen) { return Status::Error(current_.pos.line, current_.pos.column, - std::string("expected '(' to start @table column list, got ") + + std::string("expected '(' to start @dataset column list, got ") + TokenKindName(current_.kind)); } Advance(); // consume ( @@ -489,7 +501,7 @@ StatusOr Parser::ParseTableDirective(int* end_offset) { if (current_.kind != TokenKind::kIdent) { return Status::Error(current_.pos.line, current_.pos.column, - std::string("@table column list must contain at least one field name, " + std::string("@dataset column list must contain at least one field name, " "got ") + TokenKindName(current_.kind)); } @@ -506,7 +518,7 @@ StatusOr Parser::ParseTableDirective(int* end_offset) { if (ContainsDot(col_name)) { return Status::Error(current_.pos.line, current_.pos.column, - std::string("@table column \"") + col_name + + std::string("@dataset column \"") + col_name + "\": dotted column paths are not supported in v1 (draft §3.4.4)"); } tbl.columns.push_back(std::move(col_name)); @@ -518,7 +530,7 @@ StatusOr Parser::ParseTableDirective(int* end_offset) { if (current_.kind == TokenKind::kRParen) break; return Status::Error(current_.pos.line, current_.pos.column, - std::string("expected ',' or ')' in @table column list, got ") + + std::string("expected ',' or ')' in @dataset column list, got ") + TokenKindName(current_.kind)); } int eo = current_.pos.offset + 1; // 
past `)` @@ -539,11 +551,11 @@ StatusOr Parser::ParseTableDirective(int* end_offset) { // ParseTableRow reads `( cell ( ',' cell )* )` with an arity check // against `expected`. kLParen is current on entry. Writes the byte // offset immediately past the closing `)` to *end_offset. -StatusOr Parser::ParseTableRow(int expected, int* end_offset) { +StatusOr Parser::ParseTableRow(int expected, int* end_offset) { Position pos = current_.pos; Advance(); // consume ( - TableRow row; + DatasetRow row; row.pos = pos; row.cells.reserve(expected); @@ -562,7 +574,7 @@ StatusOr Parser::ParseTableRow(int expected, int* end_offset) { return Status::Error( current_.pos.line, current_.pos.column, - std::string("expected ',' or ')' in @table row, got ") + TokenKindName(current_.kind)); + std::string("expected ',' or ')' in @dataset row, got ") + TokenKindName(current_.kind)); } int eo = current_.pos.offset + 1; Advance(); // consume ) @@ -570,14 +582,102 @@ StatusOr Parser::ParseTableRow(int expected, int* end_offset) { if (static_cast(row.cells.size()) != expected) { return Status::Error(pos.line, pos.column, - std::string("@table row has ") + std::to_string(row.cells.size()) + + std::string("@dataset row has ") + std::to_string(row.cells.size()) + " cells, expected " + std::to_string(expected) + " (column count)"); } *end_offset = eo; return row; } -// ParseRowCell consumes one cell of a @table row. Returns nullopt for +// ParseProtoDirective reads `@proto ` (draft §3.4.5). kAtProto +// is current on entry. Four body shapes are lexically distinguished: +// +// - anonymous: `@proto { }` +// - named: `@proto { }` +// - source: `@proto """"""` +// - descriptor: `@proto b""` +// +// For brace-bounded shapes the body is sliced as raw bytes between +// `{` and the matching `}` (both exclusive); the contents are +// protobuf source and are NOT decoded as PXF entries. 
+StatusOr Parser::ParseProtoDirective(int* end_offset) { + auto leading = FlushComments(); + Position at_pos = current_.pos; + ProtoDirective pd; + pd.pos = at_pos; + pd.leading_comments = std::move(leading); + Advance(); // consume @proto + + auto capture_brace_body = [this, at_pos]( + const std::string& label, std::string* body, int* eo) -> Status { + int open = current_.pos.offset; + int close = FindMatchingBrace(lex_.Input(), open); + if (close < 0) { + return Status::Error(at_pos.line, at_pos.column, label + ": unmatched '{'"); + } + *body = std::string(lex_.Input().substr(open + 1, close - (open + 1))); + // Reposition the lexer past the closing `}` and prime the parser. + lex_.RepositionTo(close + 1); + Advance(); + *eo = close + 1; + return Status::OK(); + }; + + switch (current_.kind) { + case TokenKind::kLBrace: { + pd.shape = ProtoShape::kAnonymous; + int eo = 0; + auto st = capture_brace_body("@proto (anonymous form)", &pd.body, &eo); + if (!st.ok()) return st; + *end_offset = eo; + return pd; + } + case TokenKind::kIdent: { + pd.shape = ProtoShape::kNamed; + pd.type_name = std::string(current_.value); + Advance(); + if (current_.kind != TokenKind::kLBrace) { + return Status::Error(current_.pos.line, + current_.pos.column, + std::string("expected '{' after @proto ") + pd.type_name + ", got " + + TokenKindName(current_.kind)); + } + int eo = 0; + auto st = capture_brace_body(std::string("@proto ") + pd.type_name, &pd.body, &eo); + if (!st.ok()) return st; + *end_offset = eo; + return pd; + } + case TokenKind::kString: { + pd.shape = ProtoShape::kSource; + pd.body = std::string(current_.value); + *end_offset = current_.pos.offset + static_cast(current_.value.size()); + Advance(); + return pd; + } + case TokenKind::kBytes: { + pd.shape = ProtoShape::kDescriptor; + auto decoded = detail::Base64DecodeStd(current_.value); + if (!decoded.has_value()) { + return Status::Error( + current_.pos.line, current_.pos.column, "@proto descriptor body: invalid base64"); 
+ } + pd.body = std::string(decoded->begin(), decoded->end()); + *end_offset = current_.pos.offset + static_cast(current_.value.size()) + 3; // b" … " + Advance(); + return pd; + } + default: + return Status::Error( + current_.pos.line, + current_.pos.column, + std::string("expected '{', dotted identifier, triple-quoted string, or b\"...\" after " + "@proto, got ") + + TokenKindName(current_.kind)); + } +} + +// ParseRowCell consumes one cell of a @dataset row. Returns nullopt for // an empty cell (no value between two commas, or at row start/end); // rejects list / block values per v1 cell-grammar (draft §3.4.4). StatusOr> Parser::ParseRowCell() { @@ -588,11 +688,11 @@ StatusOr> Parser::ParseRowCell() { case TokenKind::kLBracket: return Status::Error(current_.pos.line, current_.pos.column, - "@table cells cannot contain list values in v1 (draft §3.4.4)"); + "@dataset cells cannot contain list values in v1 (draft §3.4.4)"); case TokenKind::kLBrace: return Status::Error(current_.pos.line, current_.pos.column, - "@table cells cannot contain block values in v1 (draft §3.4.4)"); + "@dataset cells cannot contain block values in v1 (draft §3.4.4)"); default: break; } diff --git a/src/pxf/schema.cc b/src/pxf/schema.cc index 9856d22..fa02b72 100644 --- a/src/pxf/schema.cc +++ b/src/pxf/schema.cc @@ -118,4 +118,13 @@ std::vector ValidateFile(const pb::FileDescriptor* fd) { return out; } +bool IsFutureReservedDirective(std::string_view name) { + // Names the spec reserves for future allocation (draft §3.4.6). + // Names with their own production (`type`, `dataset`, `proto`) and + // the spec-registered `entry` aren't included here — they're handled + // by the lexer or the named_directive shape. 
+ return name == "table" || name == "datasource" || name == "view" || name == "procedure" || + name == "function" || name == "permissions"; +} + } // namespace protowire::pxf diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3a558f8..9640e20 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,7 +36,8 @@ if(TARGET protowire_pxf) pxf_null_test pxf_bignum_test pxf_features_test pxf_any_test pxf_fast_test pxf_full_roundtrip_test pxf_escapes_test pxf_annotations_test pxf_format_test pxf_directive_test - pxf_schema_test pxf_result_directives_test pxf_table_reader_test) + pxf_schema_test pxf_result_directives_test pxf_dataset_reader_test + pxf_proto_directive_test) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${f}.cc) list(APPEND _pxf_test_srcs ${f}.cc) endif() diff --git a/test/pxf_table_reader_test.cc b/test/pxf_dataset_reader_test.cc similarity index 77% rename from test/pxf_table_reader_test.cc rename to test/pxf_dataset_reader_test.cc index 1cc05e6..08dff5e 100644 --- a/test/pxf_table_reader_test.cc +++ b/test/pxf_dataset_reader_test.cc @@ -1,11 +1,11 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2026 TrendVidia, LLC. // -// Tests for TableReader (streaming @table consumption) and BindRow +// Tests for DatasetReader (streaming @dataset consumption) and BindRow // (per-row proto binding). PR 4 of the v0.72-v0.75 cpp catch-up. 
#include "protowire/pxf.h" -#include "protowire/pxf/table_reader.h" +#include "protowire/pxf/dataset_reader.h" #include #include "protoc_compat.h" @@ -22,8 +22,8 @@ namespace { namespace pb = google::protobuf; using protowire::pxf::BindRow; -using protowire::pxf::TableReader; -using protowire::pxf::TableRow; +using protowire::pxf::DatasetReader; +using protowire::pxf::DatasetRow; class SilentErrorCollector : public pb::compiler::MultiFileErrorCollector { public: @@ -57,11 +57,11 @@ class PxfTableReader : public ::testing::Test { std::unique_ptr factory_; }; -// ---- TableReader::Create header parsing ----------------------------------- +// ---- DatasetReader::Create header parsing ----------------------------------- TEST_F(PxfTableReader, ReadsHeaderAndExposesTypeAndColumns) { - std::istringstream in("@table trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n"); - auto tr = TableReader::Create(&in); + std::istringstream in("@dataset trades.v1.Trade ( px, qty )\n( 100, 5 )\n( 101, 7 )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()) << tr.status().message(); EXPECT_EQ((*tr)->Type(), "trades.v1.Trade"); ASSERT_EQ((*tr)->Columns().size(), 2u); @@ -72,29 +72,29 @@ TEST_F(PxfTableReader, ReadsHeaderAndExposesTypeAndColumns) { TEST_F(PxfTableReader, NoTableReturnsError) { std::istringstream in("@type foo.Msg\nname = \"x\"\n"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_FALSE(tr.ok()); - EXPECT_NE(std::string(tr.status().message()).find("no @table directive"), std::string::npos); + EXPECT_NE(std::string(tr.status().message()).find("no @dataset directive"), std::string::npos); } TEST_F(PxfTableReader, EmptyInputReturnsError) { std::istringstream in(""); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_FALSE(tr.ok()); } TEST_F(PxfTableReader, NullStreamRejected) { - auto tr = TableReader::Create(nullptr); + auto tr = DatasetReader::Create(nullptr); ASSERT_FALSE(tr.ok()); } 
TEST_F(PxfTableReader, LeadingDirectivesPreserved) { std::istringstream in(R"(@header pkg.Hdr { id = "h" } @frob alpha -@table trades.v1.Trade ( px, qty ) +@dataset trades.v1.Trade ( px, qty ) ( 1, 2 ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()) << tr.status().message(); ASSERT_EQ((*tr)->Directives().size(), 2u); EXPECT_EQ((*tr)->Directives()[0].name, "header"); @@ -102,28 +102,28 @@ TEST_F(PxfTableReader, LeadingDirectivesPreserved) { } TEST_F(PxfTableReader, HeaderOversizeRejected) { - // Generate >64 KiB of leading directive bytes before any @table — + // Generate >64 KiB of leading directive bytes before any @dataset — // should fail-fast with the budget message. std::string big; big.reserve(70 * 1024); big.append("@frob "); while (big.size() < 70 * 1024) big.append("x "); - big.append("\n@table x.Row ( a )\n"); + big.append("\n@dataset x.Row ( a )\n"); std::istringstream in(big); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_FALSE(tr.ok()); EXPECT_NE(std::string(tr.status().message()).find("header exceeds"), std::string::npos); } -// ---- TableReader::Next row iteration -------------------------------------- +// ---- DatasetReader::Next row iteration -------------------------------------- TEST_F(PxfTableReader, IteratesAllRowsInOrder) { - std::istringstream in("@table x.Row ( a, b )\n( 1, 2 )\n( 3, 4 )\n( 5, 6 )\n"); - auto tr = TableReader::Create(&in); + std::istringstream in("@dataset x.Row ( a, b )\n( 1, 2 )\n( 3, 4 )\n( 5, 6 )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); int count = 0; for (;;) { - TableRow row; + DatasetRow row; auto s = (*tr)->Next(&row); ASSERT_TRUE(s.ok()) << s.message(); if ((*tr)->Done()) break; @@ -134,22 +134,22 @@ TEST_F(PxfTableReader, IteratesAllRowsInOrder) { } TEST_F(PxfTableReader, ZeroRowsReportsDoneImmediately) { - std::istringstream in("@table x.Row ( a )\n"); - auto tr = TableReader::Create(&in); + 
std::istringstream in("@dataset x.Row ( a )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; auto s = (*tr)->Next(&row); ASSERT_TRUE(s.ok()); EXPECT_TRUE((*tr)->Done()); } TEST_F(PxfTableReader, RowCellsParsedAsExpectedShapes) { - std::istringstream in(R"(@table x.Row ( a, b, c, d ) + std::istringstream in(R"(@dataset x.Row ( a, b, c, d ) ( 42, "hello", true, null ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); ASSERT_FALSE((*tr)->Done()); ASSERT_EQ(row.cells.size(), 4u); @@ -169,10 +169,10 @@ TEST_F(PxfTableReader, RowCellsParsedAsExpectedShapes) { } TEST_F(PxfTableReader, ThreeStateCellsAbsentNullSet) { - std::istringstream in("@table x.Row ( a, b, c )\n( 1, , null )\n"); - auto tr = TableReader::Create(&in); + std::istringstream in("@dataset x.Row ( a, b, c )\n( 1, , null )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); ASSERT_EQ(row.cells.size(), 3u); EXPECT_TRUE(row.cells[0].has_value()); // present @@ -182,10 +182,10 @@ TEST_F(PxfTableReader, ThreeStateCellsAbsentNullSet) { } TEST_F(PxfTableReader, ArityMismatchSurfacesAndBecomesSticky) { - std::istringstream in("@table x.Row ( a, b )\n( 1, 2, 3 )\n( 4, 5 )\n"); - auto tr = TableReader::Create(&in); + std::istringstream in("@dataset x.Row ( a, b )\n( 1, 2, 3 )\n( 4, 5 )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; auto s = (*tr)->Next(&row); ASSERT_FALSE(s.ok()); EXPECT_NE(std::string(s.message()).find("3 cells, expected 2"), std::string::npos); @@ -197,7 +197,7 @@ TEST_F(PxfTableReader, ArityMismatchSurfacesAndBecomesSticky) { TEST_F(PxfTableReader, MultiByteRowsAcrossPullBoundaries) { // Force the row scanner to pull bytes across many chunk boundaries // by using a row body that's 
much larger than the 4 KiB pull size. - std::string body = "@table x.Row ( a )\n"; + std::string body = "@dataset x.Row ( a )\n"; const int row_count = 50; for (int i = 0; i < row_count; ++i) { body.append("("); @@ -205,11 +205,11 @@ TEST_F(PxfTableReader, MultiByteRowsAcrossPullBoundaries) { body.append("\")\n"); } std::istringstream in(body); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()) << tr.status().message(); int seen = 0; for (;;) { - TableRow row; + DatasetRow row; auto s = (*tr)->Next(&row); ASSERT_TRUE(s.ok()) << s.message(); if ((*tr)->Done()) break; @@ -219,10 +219,10 @@ TEST_F(PxfTableReader, MultiByteRowsAcrossPullBoundaries) { } TEST_F(PxfTableReader, ParenthesesInsideStringNotMistakenForRowBoundary) { - std::istringstream in("@table x.Row ( a )\n( \"hi ) there\" )\n( \"next\" )\n"); - auto tr = TableReader::Create(&in); + std::istringstream in("@dataset x.Row ( a )\n( \"hi ) there\" )\n( \"next\" )\n"); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); ASSERT_FALSE((*tr)->Done()); EXPECT_EQ(std::get>(*row.cells[0])->value, @@ -235,7 +235,7 @@ TEST_F(PxfTableReader, ParenthesesInsideStringNotMistakenForRowBoundary) { } TEST_F(PxfTableReader, CommentsBetweenRowsIgnored) { - std::istringstream in(R"(@table x.Row ( a ) + std::istringstream in(R"(@dataset x.Row ( a ) # leading comment ( 1 ) // between rows @@ -244,11 +244,11 @@ TEST_F(PxfTableReader, CommentsBetweenRowsIgnored) { comment */ ( 3 ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); int count = 0; for (;;) { - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); if ((*tr)->Done()) break; ++count; @@ -256,33 +256,33 @@ TEST_F(PxfTableReader, CommentsBetweenRowsIgnored) { EXPECT_EQ(count, 3); } -// ---- TableReader::Tail chaining ------------------------------------------ +// ---- 
DatasetReader::Tail chaining ------------------------------------------ TEST_F(PxfTableReader, TailAllowsChainingToSecondTable) { - std::istringstream in(R"(@table a.Row ( x ) + std::istringstream in(R"(@dataset a.Row ( x ) ( 1 ) ( 2 ) -@table b.Row ( y ) +@dataset b.Row ( y ) ( "p" ) ( "q" ) )"); - auto tr1 = TableReader::Create(&in); + auto tr1 = DatasetReader::Create(&in); ASSERT_TRUE(tr1.ok()); EXPECT_EQ((*tr1)->Type(), "a.Row"); // Drain to EOF. for (;;) { - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr1)->Next(&row).ok()); if ((*tr1)->Done()) break; } // Chain the second table. auto tail = (*tr1)->Tail(); - auto tr2 = TableReader::Create(tail.get()); + auto tr2 = DatasetReader::Create(tail.get()); ASSERT_TRUE(tr2.ok()) << tr2.status().message(); EXPECT_EQ((*tr2)->Type(), "b.Row"); int n = 0; for (;;) { - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr2)->Next(&row).ok()); if ((*tr2)->Done()) break; ++n; @@ -293,12 +293,12 @@ TEST_F(PxfTableReader, TailAllowsChainingToSecondTable) { // ---- BindRow + Scan ------------------------------------------------------- TEST_F(PxfTableReader, BindRowSetsFieldsByColumnName) { - std::istringstream in(R"(@table test.v1.AllTypes ( string_field, int32_field ) + std::istringstream in(R"(@dataset test.v1.AllTypes ( string_field, int32_field ) ( "alpha", 42 ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); ASSERT_FALSE((*tr)->Done()); @@ -310,11 +310,11 @@ TEST_F(PxfTableReader, BindRowSetsFieldsByColumnName) { } TEST_F(PxfTableReader, ScanIsEquivalentToNextPlusBindRow) { - std::istringstream in(R"(@table test.v1.AllTypes ( string_field ) + std::istringstream in(R"(@dataset test.v1.AllTypes ( string_field ) ( "row1" ) ( "row2" ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); std::vector seen; for (;;) { @@ -331,12 +331,12 @@ 
TEST_F(PxfTableReader, ScanIsEquivalentToNextPlusBindRow) { TEST_F(PxfTableReader, BindRowAbsentCellLeavesFieldDefault) { // proto3 string default is "". An absent cell should not stamp a value. - std::istringstream in(R"(@table test.v1.AllTypes ( string_field, int32_field ) + std::istringstream in(R"(@dataset test.v1.AllTypes ( string_field, int32_field ) ( , 7 ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); - TableRow row; + DatasetRow row; ASSERT_TRUE((*tr)->Next(&row).ok()); auto msg = NewAllTypes(); @@ -347,7 +347,7 @@ TEST_F(PxfTableReader, BindRowAbsentCellLeavesFieldDefault) { } TEST_F(PxfTableReader, BindRowMismatchColumnCountErrors) { - TableRow row; + DatasetRow row; row.cells.emplace_back(std::nullopt); row.cells.emplace_back(std::nullopt); auto msg = NewAllTypes(); @@ -360,10 +360,10 @@ TEST_F(PxfTableReader, BindRowMismatchColumnCountErrors) { TEST_F(PxfTableReader, BindRowUnknownColumnErrors) { // The synthetic body names a column the schema doesn't know — surfaces // as a per-field "field not found" from the underlying Unmarshal. - std::istringstream in(R"(@table test.v1.AllTypes ( not_a_field ) + std::istringstream in(R"(@dataset test.v1.AllTypes ( not_a_field ) ( "x" ) )"); - auto tr = TableReader::Create(&in); + auto tr = DatasetReader::Create(&in); ASSERT_TRUE(tr.ok()); auto msg = NewAllTypes(); auto s = (*tr)->Scan(msg.get()); diff --git a/test/pxf_directive_test.cc b/test/pxf_directive_test.cc index c3fab2e..2a537b0 100644 --- a/test/pxf_directive_test.cc +++ b/test/pxf_directive_test.cc @@ -4,11 +4,11 @@ // Parser-tier tests for the v0.72-v0.75 directive grammar: // - @ *() [{ ... }] (draft §3.4.2) // - @entry *() [{ ... }] (draft §3.4.3) -// - @table ( cols ) row* (draft §3.4.4) +// - @dataset ( cols ) row* (draft §3.4.4) // // These exercise Parse(...) directly and assert on AST shape — they do // NOT decode against a proto descriptor. 
Decode-tier wiring (Result -// accessors, TableReader, BindRow) arrives in later PRs of the +// accessors, DatasetReader, BindRow) arrives in later PRs of the // v0.72-v0.75 cpp catch-up sequence. #include "protowire/pxf.h" @@ -28,9 +28,9 @@ namespace { namespace pb = google::protobuf; +using protowire::pxf::DatasetDirective; using protowire::pxf::Document; using protowire::pxf::Parse; -using protowire::pxf::TableDirective; class SilentErrorCollector : public pb::compiler::MultiFileErrorCollector { public: @@ -187,14 +187,14 @@ TEST(Directive, BracesInsideStringNotCounted) { } TEST(Table, BasicTwoColumnsTwoRows) { - std::string_view src = R"(@table trades.v1.Trade ( px, qty ) + std::string_view src = R"(@dataset trades.v1.Trade ( px, qty ) ( 100, 5 ) ( 101, 7 ) )"; auto doc = Parse(src); ASSERT_TRUE(doc.ok()) << doc.status().message(); - ASSERT_EQ(doc->tables.size(), 1u); - const TableDirective& t = doc->tables[0]; + ASSERT_EQ(doc->datasets.size(), 1u); + const DatasetDirective& t = doc->datasets[0]; EXPECT_EQ(t.type, "trades.v1.Trade"); ASSERT_EQ(t.columns.size(), 2u); EXPECT_EQ(t.columns[0], "px"); @@ -209,13 +209,13 @@ TEST(Table, BasicTwoColumnsTwoRows) { TEST(Table, EmptyCellMeansAbsentField) { // The middle cell is empty (no value between two commas) — distinct // from `null` (present-but-null) per the three-state cell grammar. 
- std::string_view src = R"(@table x.Row ( a, b, c ) + std::string_view src = R"(@dataset x.Row ( a, b, c ) ( 1, , 3 ) )"; auto doc = Parse(src); ASSERT_TRUE(doc.ok()) << doc.status().message(); - ASSERT_EQ(doc->tables.size(), 1u); - const auto& row = doc->tables[0].rows[0]; + ASSERT_EQ(doc->datasets.size(), 1u); + const auto& row = doc->datasets[0].rows[0]; ASSERT_EQ(row.cells.size(), 3u); EXPECT_TRUE(row.cells[0].has_value()); EXPECT_FALSE(row.cells[1].has_value()); // absent @@ -223,27 +223,27 @@ TEST(Table, EmptyCellMeansAbsentField) { } TEST(Table, NullCellMeansPresentNull) { - std::string_view src = R"(@table x.Row ( a, b ) + std::string_view src = R"(@dataset x.Row ( a, b ) ( 1, null ) )"; auto doc = Parse(src); ASSERT_TRUE(doc.ok()) << doc.status().message(); - ASSERT_EQ(doc->tables.size(), 1u); - const auto& row = doc->tables[0].rows[0]; + ASSERT_EQ(doc->datasets.size(), 1u); + const auto& row = doc->datasets[0].rows[0]; ASSERT_EQ(row.cells.size(), 2u); EXPECT_TRUE(row.cells[1].has_value()); // present-but-null is not nullopt } TEST(Table, ZeroRowsOk) { - std::string_view src = "@table x.Row ( a, b )\n"; + std::string_view src = "@dataset x.Row ( a, b )\n"; auto doc = Parse(src); ASSERT_TRUE(doc.ok()) << doc.status().message(); - ASSERT_EQ(doc->tables.size(), 1u); - EXPECT_EQ(doc->tables[0].rows.size(), 0u); + ASSERT_EQ(doc->datasets.size(), 1u); + EXPECT_EQ(doc->datasets[0].rows.size(), 0u); } TEST(Table, ArityMismatchRejected) { - std::string_view src = R"(@table x.Row ( a, b ) + std::string_view src = R"(@dataset x.Row ( a, b ) ( 1, 2, 3 ) )"; auto doc = Parse(src); @@ -252,14 +252,14 @@ TEST(Table, ArityMismatchRejected) { } TEST(Table, DottedColumnRejected) { - std::string_view src = "@table x.Row ( a.b )\n"; + std::string_view src = "@dataset x.Row ( a.b )\n"; auto doc = Parse(src); ASSERT_FALSE(doc.ok()); EXPECT_NE(doc.status().message().find("dotted column"), std::string::npos); } TEST(Table, ListCellRejected) { - std::string_view src = R"(@table 
x.Row ( a ) + std::string_view src = R"(@dataset x.Row ( a ) ( [1, 2] ) )"; auto doc = Parse(src); @@ -268,7 +268,7 @@ TEST(Table, ListCellRejected) { } TEST(Table, BlockCellRejected) { - std::string_view src = R"(@table x.Row ( a ) + std::string_view src = R"(@dataset x.Row ( a ) ( { x = 1 } ) )"; auto doc = Parse(src); @@ -278,7 +278,7 @@ TEST(Table, BlockCellRejected) { TEST(Table, StandaloneRejectsCoexistingAtType) { std::string_view src = R"(@type some.Other -@table x.Row ( a ) +@dataset x.Row ( a ) ( 1 ) )"; auto doc = Parse(src); @@ -287,7 +287,7 @@ TEST(Table, StandaloneRejectsCoexistingAtType) { } TEST(Table, StandaloneRejectsCoexistingBodyEntries) { - std::string_view src = R"(@table x.Row ( a ) + std::string_view src = R"(@dataset x.Row ( a ) ( 1 ) extra = 5 )"; @@ -314,51 +314,54 @@ TEST(Directive, AtTypeWithoutIdentRejected) { } TEST(Directive, AtTypeAfterTableRejected) { - // Reverse order of the "type before table" violation: @table first, + // Reverse order of the "type before table" violation: @dataset first, // then @type — exercises the symmetric branch in ParseDocument. - auto doc = Parse("@table x.Row ( a )\n@type other.Msg\n"); + auto doc = Parse("@dataset x.Row ( a )\n@type other.Msg\n"); ASSERT_FALSE(doc.ok()); EXPECT_NE(doc.status().message().find("cannot coexist with @type"), std::string::npos); } -TEST(Table, MissingTypeRejected) { - auto doc = Parse("@table ( a )\n"); - ASSERT_FALSE(doc.ok()); - EXPECT_NE(doc.status().message().find("expected row message type after @table"), - std::string::npos); +TEST(Table, MissingTypeIsPermissive) { + // Type is optional in the AST under v1 (binds to a preceding anonymous + // @proto per draft §3.4.4 Anonymous binding). Binding-time validation + // handles the no-preceding-@proto case. 
+ auto doc = Parse("@dataset ( a )\n"); + ASSERT_TRUE(doc.ok()); + ASSERT_EQ(doc->datasets.size(), 1u); + EXPECT_TRUE(doc->datasets[0].type.empty()); } TEST(Table, MissingLParenAfterTypeRejected) { - auto doc = Parse("@table x.Row a, b\n"); + auto doc = Parse("@dataset x.Row a, b\n"); ASSERT_FALSE(doc.ok()); - EXPECT_NE(doc.status().message().find("expected '(' to start @table column list"), + EXPECT_NE(doc.status().message().find("expected '(' to start @dataset column list"), std::string::npos); } TEST(Table, EmptyColumnListRejected) { - auto doc = Parse("@table x.Row ( )\n"); + auto doc = Parse("@dataset x.Row ( )\n"); ASSERT_FALSE(doc.ok()); EXPECT_NE(doc.status().message().find("at least one field name"), std::string::npos); } TEST(Table, BadTokenInColumnListRejected) { // Integer literal where a field name is expected. - auto doc = Parse("@table x.Row ( a, 123 )\n"); + auto doc = Parse("@dataset x.Row ( a, 123 )\n"); ASSERT_FALSE(doc.ok()); EXPECT_NE(doc.status().message().find("expected column field name"), std::string::npos); } TEST(Table, MissingCommaOrRParenInColumnListRejected) { - auto doc = Parse("@table x.Row ( a b )\n"); + auto doc = Parse("@dataset x.Row ( a b )\n"); ASSERT_FALSE(doc.ok()); - EXPECT_NE(doc.status().message().find("expected ',' or ')' in @table column list"), + EXPECT_NE(doc.status().message().find("expected ',' or ')' in @dataset column list"), std::string::npos); } TEST(Table, MissingCommaOrRParenInRowRejected) { - auto doc = Parse("@table x.Row ( a, b )\n( 1 2 )\n"); + auto doc = Parse("@dataset x.Row ( a, b )\n( 1 2 )\n"); ASSERT_FALSE(doc.ok()); - EXPECT_NE(doc.status().message().find("expected ',' or ')' in @table row"), std::string::npos); + EXPECT_NE(doc.status().message().find("expected ',' or ')' in @dataset row"), std::string::npos); } TEST(Directive, TrailingCommentInBlockBody) { @@ -486,7 +489,7 @@ string_field = "x" TEST_F(PxfDirectiveFast, AtTypeAfterTableRejected) { auto msg = NewAllTypes(); - auto st = 
protowire::pxf::Unmarshal(R"(@table x.Row ( a ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a ) @type other )", msg.get()); @@ -497,7 +500,7 @@ TEST_F(PxfDirectiveFast, AtTypeAfterTableRejected) { TEST_F(PxfDirectiveFast, TableAfterTypeRejected) { auto msg = NewAllTypes(); auto st = protowire::pxf::Unmarshal(R"(@type other -@table x.Row ( a ) +@dataset x.Row ( a ) )", msg.get()); ASSERT_FALSE(st.ok()); @@ -505,11 +508,11 @@ TEST_F(PxfDirectiveFast, TableAfterTypeRejected) { } TEST_F(PxfDirectiveFast, TableWithRowsAndStandalone) { - // Fast path drops the @table rows in PR 1; the call succeeds because + // Fast path drops the @dataset rows in PR 1; the call succeeds because // the doc is well-formed (no @type, no body entries). PR 4 will // make Result.tables() expose the rows. auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table trades.v1.Trade ( px, qty ) + auto st = protowire::pxf::Unmarshal(R"(@dataset trades.v1.Trade ( px, qty ) ( 100, 5 ) ( 101, 7 ) )", @@ -519,7 +522,7 @@ TEST_F(PxfDirectiveFast, TableWithRowsAndStandalone) { TEST_F(PxfDirectiveFast, TableWithEmptyAndNullCells) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table x.Row ( a, b, c ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a, b, c ) ( 1, , null ) ( null, , 9 ) )", @@ -529,13 +532,13 @@ TEST_F(PxfDirectiveFast, TableWithEmptyAndNullCells) { TEST_F(PxfDirectiveFast, TableZeroRows) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( a, b )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( a, b )\n", msg.get()); ASSERT_TRUE(st.ok()) << st.message(); } TEST_F(PxfDirectiveFast, TableArityMismatchRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table x.Row ( a, b ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a, b ) ( 1, 2, 3 ) )", msg.get()); @@ -545,14 +548,14 @@ TEST_F(PxfDirectiveFast, TableArityMismatchRejected) { 
TEST_F(PxfDirectiveFast, TableDottedColumnRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( a.b )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( a.b )\n", msg.get()); ASSERT_FALSE(st.ok()); EXPECT_NE(std::string(st.message()).find("dotted path"), std::string::npos); } TEST_F(PxfDirectiveFast, TableListCellRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table x.Row ( a ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a ) ( [1, 2] ) )", msg.get()); @@ -562,7 +565,7 @@ TEST_F(PxfDirectiveFast, TableListCellRejected) { TEST_F(PxfDirectiveFast, TableBlockCellRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table x.Row ( a ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a ) ( { x = 1 } ) )", msg.get()); @@ -570,53 +573,55 @@ TEST_F(PxfDirectiveFast, TableBlockCellRejected) { EXPECT_NE(std::string(st.message()).find("list/block"), std::string::npos); } -TEST_F(PxfDirectiveFast, TableMissingTypeRejected) { +TEST_F(PxfDirectiveFast, TableMissingTypeIsPermissive) { + // Type is optional in v1 (binds to a preceding anonymous @proto per + // draft §3.4.4 Anonymous binding). The fast decoder accepts and binds + // the empty header; standalone-with-body-entries checking remains. 
auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table ( a )\n", msg.get()); - ASSERT_FALSE(st.ok()); - EXPECT_NE(std::string(st.message()).find("expected row message type after @table"), - std::string::npos); + auto st = protowire::pxf::Unmarshal("@dataset ( a )\n", msg.get()); + EXPECT_TRUE(st.ok()) << st.message(); } TEST_F(PxfDirectiveFast, TableMissingLParenRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row a, b\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row a, b\n", msg.get()); ASSERT_FALSE(st.ok()); EXPECT_NE(std::string(st.message()).find("expected '(' to start"), std::string::npos); } TEST_F(PxfDirectiveFast, TableEmptyColumnsRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( )\n", msg.get()); ASSERT_FALSE(st.ok()); EXPECT_NE(std::string(st.message()).find("at least one field name"), std::string::npos); } TEST_F(PxfDirectiveFast, TableBadColumnTokenRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( a, 123 )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( a, 123 )\n", msg.get()); ASSERT_FALSE(st.ok()); EXPECT_NE(std::string(st.message()).find("expected column field name"), std::string::npos); } TEST_F(PxfDirectiveFast, TableMissingCommaInColumnListRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( a b )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( a b )\n", msg.get()); ASSERT_FALSE(st.ok()); - EXPECT_NE(std::string(st.message()).find("expected ',' or ')' in @table column list"), + EXPECT_NE(std::string(st.message()).find("expected ',' or ')' in @dataset column list"), std::string::npos); } TEST_F(PxfDirectiveFast, TableMissingCommaInRowRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal("@table x.Row ( a, b )\n( 
1 2 )\n", msg.get()); + auto st = protowire::pxf::Unmarshal("@dataset x.Row ( a, b )\n( 1 2 )\n", msg.get()); ASSERT_FALSE(st.ok()); - EXPECT_NE(std::string(st.message()).find("expected ',' or ')' in @table row"), std::string::npos); + EXPECT_NE(std::string(st.message()).find("expected ',' or ')' in @dataset row"), + std::string::npos); } TEST_F(PxfDirectiveFast, TableWithBodyEntriesRejected) { auto msg = NewAllTypes(); - auto st = protowire::pxf::Unmarshal(R"(@table x.Row ( a ) + auto st = protowire::pxf::Unmarshal(R"(@dataset x.Row ( a ) ( 1 ) string_field = "extra" )", diff --git a/test/pxf_proto_directive_test.cc b/test/pxf_proto_directive_test.cc new file mode 100644 index 0000000..ff9b8d7 --- /dev/null +++ b/test/pxf_proto_directive_test.cc @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 TrendVidia, LLC. +// +// Parser tests for the @proto directive (draft §3.4.5). +// +// Four body shapes lexically distinguished: anonymous, named, source, +// descriptor. Plus reserved-directive-name rejection (draft §3.4.6). 
+ +#include "protowire/pxf.h" +#include "protowire/pxf/parser.h" + +#include +#include "protoc_compat.h" + +#include +#include + +#include +#include + +namespace pp = protowire::pxf; + +namespace { + +namespace pb = google::protobuf; + +class SilentErrorCollector : public pb::compiler::MultiFileErrorCollector { + public: + PROTOWIRE_PROTOC_RECORD_ERROR(filename, line, column, msg) { + last_ = std::string(filename) + ":" + std::to_string(line) + ":" + std::to_string(column) + + ": " + std::string(msg); + } + std::string last_; +}; + +std::string Body(const pp::ProtoDirective& pd) { + return pd.body; +} + +TEST(Proto, AnonymousBody) { + auto doc = pp::Parse(R"(@proto { + string symbol = 1; + double price = 2; +} +)"); + ASSERT_TRUE(doc.ok()) << doc.status().message(); + ASSERT_EQ(doc->protos.size(), 1u); + const auto& p = doc->protos[0]; + EXPECT_EQ(p.shape, pp::ProtoShape::kAnonymous); + EXPECT_EQ(p.type_name, ""); + EXPECT_NE(Body(p).find("string symbol = 1;"), std::string::npos); + EXPECT_NE(Body(p).find("double price = 2;"), std::string::npos); +} + +TEST(Proto, NamedBody) { + auto doc = pp::Parse(R"(@proto trades.v1.Trade { + string symbol = 1; + double price = 2; +} +)"); + ASSERT_TRUE(doc.ok()); + ASSERT_EQ(doc->protos.size(), 1u); + EXPECT_EQ(doc->protos[0].shape, pp::ProtoShape::kNamed); + EXPECT_EQ(doc->protos[0].type_name, "trades.v1.Trade"); + EXPECT_NE(Body(doc->protos[0]).find("string symbol = 1;"), std::string::npos); +} + +TEST(Proto, SourceBody) { + auto doc = pp::Parse(R"(@proto """ +syntax = "proto3"; +package trades.v1; +message Trade { string symbol = 1; } +""")"); + ASSERT_TRUE(doc.ok()) << doc.status().message(); + ASSERT_EQ(doc->protos.size(), 1u); + EXPECT_EQ(doc->protos[0].shape, pp::ProtoShape::kSource); + EXPECT_NE(Body(doc->protos[0]).find("message Trade"), std::string::npos); +} + +TEST(Proto, DescriptorBody) { + // "hello" → "aGVsbG8=" + auto doc = pp::Parse(R"(@proto b"aGVsbG8=")"); + ASSERT_TRUE(doc.ok()) << doc.status().message(); + 
ASSERT_EQ(doc->protos.size(), 1u); + EXPECT_EQ(doc->protos[0].shape, pp::ProtoShape::kDescriptor); + EXPECT_EQ(Body(doc->protos[0]), "hello"); +} + +TEST(Proto, Multiple) { + auto doc = pp::Parse(R"(@proto trades.v1.Trade { string symbol = 1; } +@proto orders.v1.Order { string id = 1; } +)"); + ASSERT_TRUE(doc.ok()); + ASSERT_EQ(doc->protos.size(), 2u); + EXPECT_EQ(doc->protos[0].type_name, "trades.v1.Trade"); + EXPECT_EQ(doc->protos[1].type_name, "orders.v1.Order"); +} + +TEST(Proto, AnonymousFollowedByUntypedDataset) { + // One-shot binding: anonymous @proto types the next untyped @dataset + // in document order (draft §3.4.4 Anonymous binding). + auto doc = pp::Parse(R"(@proto { + string symbol = 1; + double price = 2; +} +@dataset (symbol, price) +("AAPL", 192.34) +("MSFT", 410.10) +)"); + ASSERT_TRUE(doc.ok()) << doc.status().message(); + ASSERT_EQ(doc->protos.size(), 1u); + EXPECT_EQ(doc->protos[0].shape, pp::ProtoShape::kAnonymous); + ASSERT_EQ(doc->datasets.size(), 1u); + EXPECT_EQ(doc->datasets[0].type, ""); + EXPECT_EQ(doc->datasets[0].rows.size(), 2u); +} + +TEST(Proto, NestedBracesInBody) { + // Anonymous @proto with nested `message Side { ... }` must capture + // the body up to the matching outer `}`. 
+ auto doc = pp::Parse(R"(@proto { + message Side { + string label = 1; + } + Side side = 1; +} +)"); + ASSERT_TRUE(doc.ok()); + ASSERT_EQ(doc->protos.size(), 1u); + const std::string body = Body(doc->protos[0]); + EXPECT_NE(body.find("message Side"), std::string::npos); + EXPECT_NE(body.find("Side side = 1;"), std::string::npos); +} + +TEST(Proto, RejectsBadShape) { + auto doc = pp::Parse("@proto 42"); + ASSERT_FALSE(doc.ok()); + EXPECT_NE(doc.status().message().find("after @proto"), std::string::npos); +} + +TEST(Proto, RejectsNamedMissingBrace) { + auto doc = pp::Parse("@proto trades.v1.Trade 42"); + ASSERT_FALSE(doc.ok()); + EXPECT_NE(doc.status().message().find("'{'"), std::string::npos); +} + +TEST(Proto, RejectsAnonymousUnmatchedBrace) { + auto doc = pp::Parse("@proto { string symbol = 1;"); + ASSERT_FALSE(doc.ok()); + EXPECT_NE(doc.status().message().find("unmatched"), std::string::npos); +} + +TEST(Proto, CoexistsWithType) { + auto doc = pp::Parse(R"(@type some.pkg.Foo +@proto some.pkg.Foo { + string name = 1; +} +)"); + ASSERT_TRUE(doc.ok()) << doc.status().message(); + EXPECT_EQ(doc->type_url, "some.pkg.Foo"); + ASSERT_EQ(doc->protos.size(), 1u); + EXPECT_EQ(doc->protos[0].shape, pp::ProtoShape::kNamed); +} + +TEST(ReservedDirectives, FutureReservedNamesRejected) { + // Draft §3.4.6: v1 decoders MUST reject @table / @datasource / + // @view / @procedure / @function / @permissions as spec-reserved + // (future-allocated). 
+ for (const auto& name : {"table", "datasource", "view", "procedure", "function", "permissions"}) { + std::string input = std::string("@") + name + " { x = 1 }"; + auto doc = pp::Parse(input); + ASSERT_FALSE(doc.ok()) << "@" << name << " should be rejected"; + EXPECT_NE(doc.status().message().find("spec-reserved"), std::string::npos) << "for @" << name; + } +} + +// ---- ProtoShapeName coverage -------------------------------------------- + +TEST(ProtoShape, NameLookup) { + EXPECT_STREQ(pp::ProtoShapeName(pp::ProtoShape::kAnonymous), "anonymous"); + EXPECT_STREQ(pp::ProtoShapeName(pp::ProtoShape::kNamed), "named"); + EXPECT_STREQ(pp::ProtoShapeName(pp::ProtoShape::kSource), "source"); + EXPECT_STREQ(pp::ProtoShapeName(pp::ProtoShape::kDescriptor), "descriptor"); +} + +// ---- Fast-path coverage (decode_fast.cc) -------------------------------- +// Mirrors the AST tests above but routes through Unmarshal / UnmarshalFull +// — covers consumeProtoDirective + capture_brace_body on the fast path, +// plus Result::AddProto / Result::Protos() accessors. 
+ +class ProtoFast : public ::testing::Test { + protected: + void SetUp() override { + source_tree_.MapPath("", TESTDATA_DIR); + source_tree_.MapPath("", WKT_PROTO_DIR); + importer_ = std::make_unique(&source_tree_, &errors_); + file_ = importer_->Import("test.proto"); + ASSERT_NE(file_, nullptr) << errors_.last_; + factory_ = std::make_unique(importer_->pool()); + desc_ = file_->FindMessageTypeByName("AllTypes"); + ASSERT_NE(desc_, nullptr); + } + std::unique_ptr NewAllTypes() { + return std::unique_ptr(factory_->GetPrototype(desc_)->New()); + } + pb::compiler::DiskSourceTree source_tree_; + SilentErrorCollector errors_; + std::unique_ptr importer_; + const pb::FileDescriptor* file_ = nullptr; + const pb::Descriptor* desc_ = nullptr; + std::unique_ptr factory_; +}; + +TEST_F(ProtoFast, AnonymousBody) { + auto msg = NewAllTypes(); + auto rr = pp::UnmarshalFull(R"(@proto { + string symbol = 1; +} +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + ASSERT_EQ(rr->Protos().size(), 1u); + EXPECT_EQ(rr->Protos()[0].shape, pp::ProtoShape::kAnonymous); + EXPECT_NE(rr->Protos()[0].body.find("string symbol = 1;"), std::string::npos); +} + +TEST_F(ProtoFast, NamedBody) { + auto msg = NewAllTypes(); + auto rr = pp::UnmarshalFull(R"(@proto trades.v1.Trade { + string symbol = 1; +} +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + ASSERT_EQ(rr->Protos().size(), 1u); + EXPECT_EQ(rr->Protos()[0].shape, pp::ProtoShape::kNamed); + EXPECT_EQ(rr->Protos()[0].type_name, "trades.v1.Trade"); +} + +TEST_F(ProtoFast, SourceBody) { + auto msg = NewAllTypes(); + auto rr = pp::UnmarshalFull(R"(@proto """ +syntax = "proto3"; +message Trade { string symbol = 1; } +""" +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + ASSERT_EQ(rr->Protos().size(), 1u); + EXPECT_EQ(rr->Protos()[0].shape, pp::ProtoShape::kSource); +} + +TEST_F(ProtoFast, DescriptorBody) { + auto msg = 
NewAllTypes(); + // "hello" → "aGVsbG8=" + auto rr = pp::UnmarshalFull(R"(@proto b"aGVsbG8=" +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + ASSERT_EQ(rr->Protos().size(), 1u); + EXPECT_EQ(rr->Protos()[0].shape, pp::ProtoShape::kDescriptor); + EXPECT_EQ(rr->Protos()[0].body, "hello"); +} + +TEST_F(ProtoFast, NestedBracesInBody) { + auto msg = NewAllTypes(); + auto rr = pp::UnmarshalFull(R"(@proto { + message Side { + string label = 1; + } + Side side = 1; +} +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + ASSERT_EQ(rr->Protos().size(), 1u); + EXPECT_NE(rr->Protos()[0].body.find("message Side"), std::string::npos); +} + +TEST_F(ProtoFast, MultipleProtos) { + auto msg = NewAllTypes(); + auto rr = pp::UnmarshalFull(R"(@proto trades.v1.Trade { string symbol = 1; } +@proto orders.v1.Order { string id = 1; } +string_field = "hi" +)", + msg.get()); + ASSERT_TRUE(rr.ok()) << rr.status().message(); + EXPECT_EQ(rr->Protos().size(), 2u); +} + +TEST_F(ProtoFast, RejectsBadShape) { + auto msg = NewAllTypes(); + auto st = pp::Unmarshal("@proto 42\nstring_field = \"hi\"\n", msg.get()); + ASSERT_FALSE(st.ok()); + EXPECT_NE(st.message().find("@proto"), std::string::npos); +} + +TEST_F(ProtoFast, RejectsNamedMissingBrace) { + auto msg = NewAllTypes(); + auto st = pp::Unmarshal("@proto trades.v1.Trade 42\nstring_field = \"hi\"\n", msg.get()); + ASSERT_FALSE(st.ok()); + EXPECT_NE(st.message().find("'{'"), std::string::npos); +} + +TEST_F(ProtoFast, RejectsAnonymousUnmatchedBrace) { + auto msg = NewAllTypes(); + auto st = pp::Unmarshal("@proto { string symbol = 1;\n", msg.get()); + ASSERT_FALSE(st.ok()); + EXPECT_NE(st.message().find("unmatched"), std::string::npos); +} + +TEST_F(ProtoFast, ReservedDirectiveRejected) { + // Draft §3.4.6 enforcement on the fast path. 
+ auto msg = NewAllTypes(); + auto st = pp::Unmarshal("@table { x = 1 }\nstring_field = \"hi\"\n", msg.get()); + ASSERT_FALSE(st.ok()); + EXPECT_NE(st.message().find("spec-reserved"), std::string::npos); +} + +} // namespace diff --git a/test/pxf_result_directives_test.cc b/test/pxf_result_directives_test.cc index 6f2d852..939e53e 100644 --- a/test/pxf_result_directives_test.cc +++ b/test/pxf_result_directives_test.cc @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2026 TrendVidia, LLC. // -// Tests for Result::Directives() / Result::Tables() — PR 3 of the +// Tests for Result::Directives() / Result::Datasets() — PR 3 of the // v0.72-v0.75 cpp catch-up. The fast-path decoder now populates the // directive vectors on Result during UnmarshalFull, so consumers // (chameleon's @header reader, table binders, etc.) can read the @@ -59,7 +59,7 @@ TEST_F(PxfResultDirectives, EmptyDocumentHasEmptyAccessors) { auto r = protowire::pxf::UnmarshalFull("string_field = \"x\"\n", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); EXPECT_TRUE(r->Directives().empty()); - EXPECT_TRUE(r->Tables().empty()); + EXPECT_TRUE(r->Datasets().empty()); } TEST_F(PxfResultDirectives, BareDirectiveRecorded) { @@ -155,18 +155,18 @@ string_field = "x" EXPECT_EQ(r->Directives()[0].name, "frob"); } -// ---- @table --------------------------------------------------------------- +// ---- @dataset --------------------------------------------------------------- TEST_F(PxfResultDirectives, TableRecordedWithColumnsAndRows) { auto msg = NewAllTypes(); - auto r = protowire::pxf::UnmarshalFull(R"(@table trades.v1.Trade ( px, qty ) + auto r = protowire::pxf::UnmarshalFull(R"(@dataset trades.v1.Trade ( px, qty ) ( 100, 5 ) ( 101, 7 ) )", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); - ASSERT_EQ(r->Tables().size(), 1u); - const auto& t = r->Tables()[0]; + ASSERT_EQ(r->Datasets().size(), 1u); + const auto& t = r->Datasets()[0]; EXPECT_EQ(t.type, "trades.v1.Trade"); 
ASSERT_EQ(t.columns.size(), 2u); EXPECT_EQ(t.columns[0], "px"); @@ -179,13 +179,13 @@ TEST_F(PxfResultDirectives, TableRecordedWithColumnsAndRows) { TEST_F(PxfResultDirectives, TableCellsCarryActualValues) { auto msg = NewAllTypes(); - auto r = protowire::pxf::UnmarshalFull(R"(@table x.Row ( a, b, c ) + auto r = protowire::pxf::UnmarshalFull(R"(@dataset x.Row ( a, b, c ) ( 42, "hello", true ) )", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); - ASSERT_EQ(r->Tables().size(), 1u); - const auto& row = r->Tables()[0].rows[0]; + ASSERT_EQ(r->Datasets().size(), 1u); + const auto& row = r->Datasets()[0].rows[0]; ASSERT_EQ(row.cells.size(), 3u); // Cell 0: IntVal with raw "42". ASSERT_TRUE(row.cells[0].has_value()); @@ -205,13 +205,13 @@ TEST_F(PxfResultDirectives, TableThreeStateCells) { // Empty cell = nullopt (absent); `null` literal = present-but-null // (cell holds a NullVal); value = present-with-value. auto msg = NewAllTypes(); - auto r = protowire::pxf::UnmarshalFull(R"(@table x.Row ( a, b, c ) + auto r = protowire::pxf::UnmarshalFull(R"(@dataset x.Row ( a, b, c ) ( 1, , null ) )", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); - ASSERT_EQ(r->Tables().size(), 1u); - const auto& row = r->Tables()[0].rows[0]; + ASSERT_EQ(r->Datasets().size(), 1u); + const auto& row = r->Datasets()[0].rows[0]; ASSERT_EQ(row.cells.size(), 3u); EXPECT_TRUE(row.cells[0].has_value()); // present EXPECT_FALSE(row.cells[1].has_value()); // absent @@ -221,41 +221,41 @@ TEST_F(PxfResultDirectives, TableThreeStateCells) { TEST_F(PxfResultDirectives, MultipleTablesInOrder) { auto msg = NewAllTypes(); - auto r = protowire::pxf::UnmarshalFull(R"(@table a.Row ( x ) + auto r = protowire::pxf::UnmarshalFull(R"(@dataset a.Row ( x ) ( 1 ) -@table b.Row ( y, z ) +@dataset b.Row ( y, z ) ( "p", "q" ) )", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); - ASSERT_EQ(r->Tables().size(), 2u); - EXPECT_EQ(r->Tables()[0].type, "a.Row"); - EXPECT_EQ(r->Tables()[1].type, "b.Row"); + 
ASSERT_EQ(r->Datasets().size(), 2u); + EXPECT_EQ(r->Datasets()[0].type, "a.Row"); + EXPECT_EQ(r->Datasets()[1].type, "b.Row"); } TEST_F(PxfResultDirectives, TableLeavesDirectivesEmpty) { - // Cross-check that @table populates only Tables(), not Directives(). + // Cross-check that @dataset populates only Datasets(), not Directives(). auto msg = NewAllTypes(); - auto r = protowire::pxf::UnmarshalFull("@table x.Row ( a )\n( 1 )\n", msg.get()); + auto r = protowire::pxf::UnmarshalFull("@dataset x.Row ( a )\n( 1 )\n", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); - EXPECT_EQ(r->Tables().size(), 1u); + EXPECT_EQ(r->Datasets().size(), 1u); EXPECT_TRUE(r->Directives().empty()); } TEST_F(PxfResultDirectives, DirectivesAndTablesCanCoexist) { - // Note: a doc with @table can NOT have @type or body entries, but it - // CAN carry generic @s before the @table. + // Note: a doc with @dataset can NOT have @type or body entries, but it + // CAN carry generic @s before the @dataset. auto msg = NewAllTypes(); auto r = protowire::pxf::UnmarshalFull(R"(@header pkg.Hdr { id = "h" } -@table x.Row ( a ) +@dataset x.Row ( a ) ( 1 ) )", msg.get()); ASSERT_TRUE(r.ok()) << r.status().message(); ASSERT_EQ(r->Directives().size(), 1u); - ASSERT_EQ(r->Tables().size(), 1u); + ASSERT_EQ(r->Datasets().size(), 1u); EXPECT_EQ(r->Directives()[0].name, "header"); - EXPECT_EQ(r->Tables()[0].type, "x.Row"); + EXPECT_EQ(r->Datasets()[0].type, "x.Row"); } // ---- Discard path: Unmarshal (no Result) -------------------------------