diff --git a/Makefile b/Makefile index 6df1bde..8cedff2 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ CXXFLAGS = -std=c++17 -Wall -Wextra -O2 -I. BUILD_DIR = build DBMS_OBJS = $(BUILD_DIR)/main.o \ + $(BUILD_DIR)/src/util/string.o \ $(BUILD_DIR)/src/sql/parser.o \ $(BUILD_DIR)/src/storage/disk_manager.o \ $(BUILD_DIR)/src/storage/buffer_pool.o \ @@ -27,6 +28,7 @@ TEST_OBJS = $(BUILD_DIR)/tests/sql/test_parser.o \ $(BUILD_DIR)/tests/sql/test_analyzer.o \ $(BUILD_DIR)/tests/sql/test_executor.o \ $(BUILD_DIR)/tests/sql/test_operators.o \ + $(BUILD_DIR)/src/util/string.o \ $(BUILD_DIR)/src/sql/parser.o \ $(BUILD_DIR)/src/storage/disk_manager.o \ $(BUILD_DIR)/src/storage/buffer_pool.o \ diff --git a/README.md b/README.md index 704b34f..d5f3968 100644 --- a/README.md +++ b/README.md @@ -199,8 +199,8 @@ twice in one query. ### What's not built yet -- DML (`INSERT` / `UPDATE` / `DELETE`) at the SQL surface — rows are - inserted today via `HeapFile::insert` + `TupleCodec::encode`, not SQL. +- `UPDATE` / `DELETE` at the SQL surface (`CREATE TABLE` and `INSERT` + are wired through Parser → Analyzer → Executor — see `grammar.md`). - `ORDER BY`, `LIMIT`, aggregates, expressions in the SELECT list, and table aliases. - Alternative operators (`HashJoin`, `IndexScan`, ...) and a real @@ -233,11 +233,12 @@ Run the demo (`main.cpp`): `main.cpp` is one end-to-end flow that exercises every layer: -1. open a fresh database file, create the catalog, declare a `users` and - a `posts` table; -2. seed both tables (5 rows each) via `TupleCodec` + `HeapFile`, flush; +1. open a fresh database file and bootstrap the catalog (system-table + pages 0/1); +2. issue `CREATE TABLE` + multi-row `INSERT` for `users` and `posts` + through Parser → Analyzer → Executor; flush; 3. *cold-reopen* the file with a brand-new `BufferPool` and `Catalog`; -4. run a handful of SQL strings (including filters and a join) +4. 
run a handful of SELECT strings (including filters and a join) through Parser → Analyzer → Planner → Executor and print each result as a padded table. diff --git a/grammar.md b/grammar.md new file mode 100644 index 0000000..80bfd40 --- /dev/null +++ b/grammar.md @@ -0,0 +1,116 @@ +# Supported SQL grammar + +This is the surface grammar accepted by `Parser` (`src/sql/parser.{h,cpp}`). +Notation: `<name>` is a non-terminal, `[x]` is optional, `{x}` is zero-or-more, +`UPPERCASE` keywords are case-insensitive, single-quoted strings are literal +text. Whitespace between tokens is insignificant. + +The parser is a single-statement parser: each call to `Parser::parse()` +consumes exactly one statement and rejects trailing tokens. + +## Top-level + +``` +<statement> ::= <select> + | <create_table> + | <insert> +``` + +`Parser::parse()` returns a `Statement` variant whose alternative reflects +the production matched. + +## SELECT + +``` +<select> ::= SELECT <columns> + FROM <identifier> + {<join>} + [WHERE <condition>] + +<columns> ::= '*' + | <column_ref> {',' <column_ref>} + +<column_ref> ::= <identifier> ['.' <identifier>] + +<join> ::= JOIN <identifier> ON <column_ref> '=' <column_ref> + +<condition> ::= <column_ref> <op> <literal> + +<op> ::= '=' | '!=' | '<' | '>' | '<=' | '>=' + +<literal> ::= <number> | <string> +``` + +Notes: +- `JOIN` is inner join only; ON accepts one equality between two columns. +- `WHERE` is a single predicate — no `AND` / `OR` yet. +- `WHERE`'s right-hand side must be a literal; column-vs-column is rejected. + +## CREATE TABLE + +``` +<create_table> + ::= CREATE TABLE <identifier> '(' <column_def> {',' <column_def>} ')' + +<column_def> ::= <identifier> <type_name> [NOT NULL] + +<type_name> ::= <identifier> +``` + +Notes: +- The column list must contain at least one column. +- `<type_name>` is any identifier; the analyzer is responsible for + mapping the surface text (e.g. `INT`, `BIGINT`, `BOOL`, `TEXT`) to a + concrete `Type`. The parser preserves the original casing. +- Columns are nullable by default. `NOT NULL` is the only column + constraint accepted. + +## INSERT + +``` +<insert> ::= INSERT INTO <identifier> + [<insert_column_list>] + VALUES <insert_row> {',' <insert_row>} + +<insert_column_list> + ::= '(' <identifier> {',' <identifier>} ')' + +<insert_row> ::= '(' <insert_literal> {',' <insert_literal>} ')' + +<insert_literal> + ::= <number> | <string> | NULL +``` + +Notes: +- The optional column list scopes the values to specific columns; when + omitted, the analyzer treats the row values as schema-column order. 
+- At least one row is required after `VALUES`. +- `NULL` is a literal in this position only (the analyzer rejects it + against a non-nullable column). Numeric and string literals carry the + same flags as in `WHERE`. + +## Lexical rules + +- **Identifiers**: letter or `_`, then letters, digits, or `_`. +- **Numbers**: one or more decimal digits. No sign, no decimal point. +- **Strings**: single-quoted; no escape sequences (a `'` ends the string). +- **Keywords** (case-insensitive, reserved): `SELECT`, `FROM`, `WHERE`, + `JOIN`, `ON`, `CREATE`, `TABLE`, `INSERT`, `INTO`, `VALUES`, `NOT`, + `NULL`. +- **Punctuation**: `,` `*` `.` `(` `)`. +- **Operators**: `=` `!=` `<` `>` `<=` `>=`. + +## Not yet supported + +- DML: `UPDATE`, `DELETE`. +- DDL beyond `CREATE TABLE`: `DROP`, `ALTER`, indexes, constraints other + than `NOT NULL`. +- `ORDER BY`, `LIMIT`, `GROUP BY`, aggregates. +- Expressions in the SELECT list (only column references are accepted). +- Compound `WHERE` predicates (`AND` / `OR`), parenthesised conditions, + `IN`, `LIKE`, `BETWEEN`, `IS NULL`. +- Table aliases (so a table cannot appear twice in one query). +- `INSERT ... SELECT`. +- `TRUE` / `FALSE` literals (use `0` / `1` against `Bool` columns). +- Numeric types beyond integer: floats, decimals, dates. +- Comments (`--`, `/* */`). diff --git a/main.cpp b/main.cpp index 2337ef6..abe4352 100644 --- a/main.cpp +++ b/main.cpp @@ -5,78 +5,26 @@ #include "src/sql/tuple.h" #include "src/storage/buffer_pool.h" #include "src/storage/disk_manager.h" -#include "src/storage/heap_file.h" #include #include #include #include #include -#include #include #include // ============================================================================= -// End-to-end demo: seed a small users + posts dataset, cold-reopen the -// database, then run a handful of SQL strings through Parser → Analyzer → -// Executor and print the rows that come back. 
+// End-to-end demo: run a CREATE TABLE, an INSERT, and a SELECT through +// Parser → Analyzer → Executor against a fresh database to exercise the +// full statement cycle. // ============================================================================= namespace { const std::string kDbPath = "/tmp/dbms_demo.db"; -void seedUsers(BufferPool& bp, const Catalog::TableInfo& info) { - const std::vector> rows = { - {1, "alice", 30}, - {2, "bob", 25}, - {3, "carol", 40}, - {4, "dave", 19}, - {5, "eve", 33}, - }; - HeapFile hf(&bp, info.root_page); - for (const auto& [id, name, age] : rows) { - const auto bytes = TupleCodec::encode(info.schema, { - Value::Int32(id), - Value::Text(name), - Value::Int32(age), - }); - hf.insert(bytes.data(), bytes.size()); - } -} - -void seedPosts(BufferPool& bp, const Catalog::TableInfo& info) { - // (id, title, user_id) — user_id matches the users table above. - const std::vector> rows = { - {100, "hello world", 1}, - {101, "second post", 1}, - {102, "carol's musings", 3}, - {103, "eve at midnight", 5}, - {104, "bob's silence", 2}, - }; - HeapFile hf(&bp, info.root_page); - for (const auto& [id, title, user_id] : rows) { - const auto bytes = TupleCodec::encode(info.schema, { - Value::Int32(id), - Value::Text(title), - Value::Int32(user_id), - }); - hf.insert(bytes.data(), bytes.size()); - } -} - -std::string valueToString(const Value& v) { - if (v.is_null) return "NULL"; - switch (v.type) { - case Type::Int32: return std::to_string(v.i32); - case Type::Int64: return std::to_string(v.i64); - case Type::Bool: return v.b ? "true" : "false"; - case Type::Text: return v.text; - } - return ""; -} - -void printResult(const ExecResult& r) { +void printSelectResult(const ExecResult& r) { // Column widths: max of header length and any value length, with a // small floor so single-char columns aren't crammed. std::vector widths(r.column_names.size()); @@ -117,76 +65,54 @@ void printResult(const ExecResult& r) { << (r.rows.size() == 1 ? 
"" : "s") << ")\n"; } -void runQuery(const Catalog& cat, BufferPool& bp, const std::string& sql) { +// Run one statement end-to-end. Picks a render based on the parsed +// statement kind: SELECT prints a padded table, CREATE TABLE / INSERT +// print a Postgres-style command tag. +void runStatement(Catalog& cat, BufferPool& bp, const std::string& sql) { std::cout << "\nSQL: " << sql << "\n"; try { Parser p(sql); - SelectQuery q = p.parse(); + Statement stmt = p.parse(); Analyzer az(cat); - BoundSelect bs = az.analyze(q); - Executor ex(&bp); - ExecResult r = ex.execute(std::move(bs)); - printResult(r); + BoundStatement bound = az.analyzeStatement(stmt); + Executor ex(&bp, &cat); + ExecResult r = ex.execute(std::move(bound)); + + if (std::holds_alternative(stmt)) { + printSelectResult(r); + } else if (std::holds_alternative(stmt)) { + std::cout << " CREATE TABLE\n"; + } else { + std::cout << " INSERT " << r.rows_affected << "\n"; + } } catch (const std::exception& e) { std::cout << " error: " << e.what() << "\n"; } } -void seedFreshDatabase() { - std::error_code ec; - std::filesystem::remove(kDbPath, ec); - - DiskManager dm(kDbPath); - BufferPool bp(8, &dm); - Catalog cat = Catalog::create(&bp); - - const Schema users_schema{{ - {"id", Type::Int32, false}, - {"name", Type::Text, false}, - {"age", Type::Int32, false}, - }}; - const Schema posts_schema{{ - {"id", Type::Int32, false}, - {"title", Type::Text, false}, - {"user_id", Type::Int32, false}, - }}; - cat.createTable("users", users_schema); - cat.createTable("posts", posts_schema); - - seedUsers(bp, *cat.getTable("users")); - seedPosts(bp, *cat.getTable("posts")); - bp.flushAll(); - - std::cout << "[seed] wrote users + posts to " << kDbPath - << " (" << std::filesystem::file_size(kDbPath) << " bytes)\n"; -} - } // namespace int main() { - seedFreshDatabase(); + std::error_code ec; + std::filesystem::remove(kDbPath, ec); - // Cold reopen — nothing is shared with the seeding phase except the file. 
DiskManager dm(kDbPath); BufferPool bp(8, &dm); - Catalog cat(&bp); + // Catalog::create allocates the system-table bootstrap pages + // (__tables at page 0, __columns at page 1) — user tables are + // built via SQL below. + Catalog cat = Catalog::create(&bp); - std::cout << "\n[query] reopened db; tables:"; - for (const auto& n : cat.tableNames()) std::cout << " " << n; - std::cout << "\n"; + runStatement(cat, bp, + "CREATE TABLE users (id INT NOT NULL, " + "name TEXT NOT NULL, age INT NOT NULL)"); + runStatement(cat, bp, + "INSERT INTO users VALUES " + "(1, 'alice', 30), (2, 'bob', 25), (3, 'carol', 40)"); + runStatement(cat, bp, "SELECT * FROM users"); - runQuery(cat, bp, "SELECT * FROM users"); - runQuery(cat, bp, "SELECT name, age FROM users WHERE age > 25"); - runQuery(cat, bp, "SELECT name FROM users WHERE name = 'alice'"); - runQuery(cat, bp, - "SELECT users.name, posts.title " - "FROM users JOIN posts ON users.id = posts.user_id"); - runQuery(cat, bp, - "SELECT users.name, posts.title " - "FROM users JOIN posts ON users.id = posts.user_id " - "WHERE users.age > 25"); + bp.flushAll(); - std::error_code ec; std::filesystem::remove(kDbPath, ec); return 0; } diff --git a/src/sql/analyzer.cpp b/src/sql/analyzer.cpp index 65bcee5..2d1771d 100644 --- a/src/sql/analyzer.cpp +++ b/src/sql/analyzer.cpp @@ -4,7 +4,10 @@ #include #include #include +#include +#include #include +#include Type resultTypeOf(const BoundExpr& e) { return std::visit( @@ -222,3 +225,131 @@ BoundSelect Analyzer::analyze(const SelectQuery& q) const { return out; } + +Value Analyzer::analyzeInsertCell(const InsertLiteral& lit, + const Column& col) const { + if (lit.is_null) { + if (!col.nullable) { + throw std::runtime_error( + "INSERT: NULL value for non-nullable column '" + col.name + "'"); + } + return Value::Null(col.type); + } + // Reuse the WHERE literal pipeline: same parsing rules, same range + // checks. analyzeLiteral throws on type mismatch (e.g. 
string for an + // Int32 column) which is exactly the error we want here. + BoundLiteral bl = analyzeLiteral(lit.text, lit.is_string, col.type); + return std::move(bl.value); +} + +BoundCreateTable Analyzer::analyze(const CreateTableStmt& s) const { + if (s.table.empty()) { + throw std::runtime_error("CREATE TABLE: empty table name"); + } + if (cat_.hasTable(s.table)) { + throw std::runtime_error( + "CREATE TABLE: table '" + s.table + "' already exists"); + } + if (s.columns.empty()) { + // The parser already rejects this, but reasserting here keeps + // the analyzer's preconditions self-contained. + throw std::runtime_error( + "CREATE TABLE: at least one column is required"); + } + + BoundCreateTable out; + out.name = s.table; + out.schema.columns.reserve(s.columns.size()); + + std::unordered_set seen; + for (const auto& cd : s.columns) { + if (!seen.insert(cd.name).second) { + throw std::runtime_error( + "CREATE TABLE: duplicate column name '" + cd.name + "'"); + } + out.schema.columns.push_back( + Column{cd.name, typeFromName(cd.type_name), cd.nullable}); + } + return out; +} + +BoundInsert Analyzer::analyze(const InsertStmt& s) const { + const Catalog::TableInfo* info = cat_.getTable(s.table); + if (info == nullptr) { + throw std::runtime_error("INSERT: no such table '" + s.table + "'"); + } + const Schema& schema = info->schema; + const size_t ncols = schema.columns.size(); + + // Build a permutation over schema columns: + // col_to_user_pos[c] = index into the user's row for schema column c, + // or kNotFound if the user didn't list that column. + // The default (no column list) is the identity mapping. 
+ std::vector col_to_user_pos(ncols, Schema::kNotFound); + + if (s.columns.empty()) { + for (size_t c = 0; c < ncols; ++c) col_to_user_pos[c] = c; + } else { + std::unordered_set seen; + for (size_t i = 0; i < s.columns.size(); ++i) { + const std::string& name = s.columns[i]; + if (!seen.insert(name).second) { + throw std::runtime_error( + "INSERT: column '" + name + "' listed more than once"); + } + const size_t c = schema.indexOf(name); + if (c == Schema::kNotFound) { + throw std::runtime_error( + "INSERT: no such column '" + name + + "' in table '" + s.table + "'"); + } + col_to_user_pos[c] = i; + } + } + + // The user-side row width is what we validate each VALUES row against. + const size_t expected_user_n = s.columns.empty() ? ncols : s.columns.size(); + + BoundInsert out; + out.table = info; + out.rows.reserve(s.rows.size()); + + for (size_t r = 0; r < s.rows.size(); ++r) { + const auto& row = s.rows[r]; + if (row.size() != expected_user_n) { + throw std::runtime_error( + "INSERT row " + std::to_string(r) + + ": " + std::to_string(row.size()) + + " values, expected " + std::to_string(expected_user_n)); + } + + std::vector values; + values.reserve(ncols); + for (size_t c = 0; c < ncols; ++c) { + const Column& col = schema.columns[c]; + const size_t pos = col_to_user_pos[c]; + if (pos == Schema::kNotFound) { + // Column omitted from the INSERT column list — defaults + // to NULL, which only flies if the column is nullable. 
+ if (!col.nullable) { + throw std::runtime_error( + "INSERT: column '" + col.name + + "' is not nullable and no value was provided"); + } + values.push_back(Value::Null(col.type)); + } else { + values.push_back(analyzeInsertCell(row[pos], col)); + } + } + out.rows.push_back(std::move(values)); + } + return out; +} + +BoundStatement Analyzer::analyzeStatement(const Statement& s) const { + return std::visit( + [this](const auto& alt) -> BoundStatement { + return this->analyze(alt); + }, + s); +} diff --git a/src/sql/analyzer.h b/src/sql/analyzer.h index 752bf7b..19e993b 100644 --- a/src/sql/analyzer.h +++ b/src/sql/analyzer.h @@ -77,6 +77,32 @@ struct BoundSelect { bool select_all = false; }; +// Bound CREATE TABLE: surface type names (e.g. "INT", "TEXT") have been +// mapped to Type, and column names checked unique. The catalog is *not* +// touched at bind time — the executor performs the actual creation, so +// the analyzer also pre-checks that the table name is free. +struct BoundCreateTable { + std::string name; + Schema schema; +}; + +// Bound INSERT: every literal cell has been parsed into a typed Value +// (NULL becomes Value::Null(col.type)). `rows` is in schema column +// order — if the user wrote a column list, the analyzer has already +// reordered each row and filled unspecified columns with NULL. Each +// row has exactly `table->schema.columns.size()` values. +struct BoundInsert { + const Catalog::TableInfo* table; + std::vector> rows; +}; + +// A bound (analyzed) top-level SQL statement. Mirrors `Statement` in +// parser.h: each parser variant alternative maps to one bound variant +// alternative. +using BoundStatement = std::variant; + // Walks a parsed SelectQuery against a Catalog, resolving every name and // type-checking every operator. Throws std::runtime_error on any name // resolution failure or type mismatch. 
Pure: no I/O of its own beyond the @@ -85,7 +111,15 @@ class Analyzer { public: explicit Analyzer(const Catalog& cat) : cat_(cat) {} - BoundSelect analyze(const SelectQuery& q) const; + // Top-level entry: dispatches on the parser's variant alternative. + BoundStatement analyzeStatement(const Statement& s) const; + + // Per-statement entry points. Overloaded so analyzeStatement can + // dispatch with a single visit, and so call sites that already know + // the statement kind (e.g. existing tests) can stay terse. + BoundSelect analyze(const SelectQuery& q) const; + BoundCreateTable analyze(const CreateTableStmt& s) const; + BoundInsert analyze(const InsertStmt& s) const; private: // Set of tables visible to expression resolution. For now, just the @@ -103,5 +137,12 @@ class Analyzer { const Scope& scope) const; Type checkBinaryOp(Op op, Type lhs, Type rhs) const; + // Convert one INSERT cell against the column it lands in. Handles + // NULL specially (rejected when the column is non-nullable); for + // non-NULL it delegates to analyzeLiteral. Type-name resolution + // for CREATE TABLE lives in tuple.h's free `typeFromName`. + Value analyzeInsertCell(const InsertLiteral& lit, + const Column& col) const; + const Catalog& cat_; }; diff --git a/src/sql/executor.cpp b/src/sql/executor.cpp index 7efdaa6..2396cfe 100644 --- a/src/sql/executor.cpp +++ b/src/sql/executor.cpp @@ -2,6 +2,7 @@ #include "src/sql/plan_node.h" #include "src/sql/planner.h" +#include "src/storage/heap_file.h" #include #include @@ -42,3 +43,35 @@ ExecResult Executor::execute(BoundSelect bs) const { return out; } + +ExecResult Executor::execute(BoundCreateTable bs) const { + // The analyzer already pre-checked that the table name is free and + // every column type resolved. Catalog::createTable still re-validates + // the duplicate-name guard (defense in depth) and is responsible for + // allocating the heap file plus the __tables / __columns rows. 
+ cat_->createTable(bs.name, std::move(bs.schema)); + return ExecResult{}; +} + +ExecResult Executor::execute(BoundInsert bs) const { + // BoundInsert::rows are already type-checked and in schema column + // order; this loop is just encode + heap-insert. + HeapFile hf(bp_, bs.table->root_page); + const Schema& schema = bs.table->schema; + for (const auto& row : bs.rows) { + const auto bytes = TupleCodec::encode(schema, row); + hf.insert(bytes.data(), bytes.size()); + } + + ExecResult out; + out.rows_affected = bs.rows.size(); + return out; +} + +ExecResult Executor::execute(BoundStatement bs) const { + return std::visit( + [this](auto&& alt) -> ExecResult { + return this->execute(std::move(alt)); + }, + std::move(bs)); +} diff --git a/src/sql/executor.h b/src/sql/executor.h index 804512b..fc20181 100644 --- a/src/sql/executor.h +++ b/src/sql/executor.h @@ -1,33 +1,61 @@ #pragma once #include "src/sql/analyzer.h" +#include "src/sql/catalog.h" #include "src/sql/tuple.h" #include "src/storage/buffer_pool.h" +#include #include #include -// Materialized result of running a BoundSelect: column metadata plus -// one vector per output row, in scan order. Every row has -// exactly column_names.size() values. +// Materialized result of running a bound statement. +// +// SELECT populates `column_names`, `column_types`, and `rows`; for +// SELECT, `rows_affected` is left at 0 (the row count is already in +// `rows.size()`). +// +// CREATE TABLE / INSERT leave the column/row vectors empty and use +// `rows_affected` as a Postgres-style command tag — 0 for CREATE, +// the inserted row count for INSERT. struct ExecResult { std::vector column_names; std::vector column_types; std::vector> rows; + std::size_t rows_affected = 0; }; -// Thin coordinator: builds a Volcano-style plan via Planner, drives -// the root operator's open/next/close, and applies the SELECT list to -// each row to produce a flat-row ExecResult. +// Thin coordinator over the analyzer's bound IR. 
+// +// SELECT path: builds a Volcano-style plan via Planner, drives the +// root operator's open/next/close, and applies the SELECT list to each +// row to produce a flat-row ExecResult. // -// `bs` is consumed — the planner moves the WHERE expression out of it. -// Pass an rvalue at the call site (std::move). +// CREATE TABLE: forwards the bound schema to the catalog. +// INSERT: opens a HeapFile at the bound table's root page and inserts +// one encoded tuple per BoundInsert row. +// +// All execute() overloads consume their argument (the planner moves +// the WHERE expression out of a BoundSelect; CREATE moves the schema +// into the catalog). Pass rvalues at the call site (std::move). +// +// `bp` and `cat` are non-owning. The catalog is mutable because +// CREATE TABLE updates it; mutating-through-pointer keeps the +// execute() methods const-correct. class Executor { public: - explicit Executor(BufferPool* bp) : bp_(bp) {} + Executor(BufferPool* bp, Catalog* cat) : bp_(bp), cat_(cat) {} + + // Top-level entry: dispatches on the analyzer's variant alternative. + ExecResult execute(BoundStatement bs) const; - ExecResult execute(BoundSelect bs) const; + // Per-statement primitives. Tests and call sites that already know + // the statement kind use these directly. + ExecResult execute(BoundSelect bs) const; + ExecResult execute(BoundCreateTable bs) const; + ExecResult execute(BoundInsert bs) const; private: BufferPool* bp_; + Catalog* cat_; }; diff --git a/src/sql/parser.cpp b/src/sql/parser.cpp index a300ed4..74e2123 100644 --- a/src/sql/parser.cpp +++ b/src/sql/parser.cpp @@ -1,5 +1,7 @@ #include "parser.h" +#include "src/util/string.h" + #include #include #include @@ -16,13 +18,6 @@ const char* opToString(Op op) { return "?"; } -// Used to fold keywords case-insensitively (so `select` and `SELECT` match). 
-static std::string toUpper(const std::string& s) { - std::string out = s; - for (char& c : out) c = static_cast(std::toupper(static_cast(c))); - return out; -} - // Tokenize as soon as constructed // so parse() can just read the tokens Parser::Parser(std::string sql) : src_(std::move(sql)) { @@ -48,12 +43,19 @@ void Parser::tokenize() { size_t start = i; while (i < n && isIdentCont(src_[i])) ++i; std::string word = src_.substr(start, i - start); - std::string upper = toUpper(word); - if (upper == "SELECT") tokens_.push_back({Tok::Select, word, Op::Eq}); + std::string upper = util::toUpper(word); + if (upper == "SELECT") tokens_.push_back({Tok::Select, word, Op::Eq}); else if (upper == "FROM") tokens_.push_back({Tok::From, word, Op::Eq}); else if (upper == "WHERE") tokens_.push_back({Tok::Where, word, Op::Eq}); else if (upper == "JOIN") tokens_.push_back({Tok::Join, word, Op::Eq}); else if (upper == "ON") tokens_.push_back({Tok::On, word, Op::Eq}); + else if (upper == "CREATE") tokens_.push_back({Tok::Create, word, Op::Eq}); + else if (upper == "TABLE") tokens_.push_back({Tok::Table, word, Op::Eq}); + else if (upper == "INSERT") tokens_.push_back({Tok::Insert, word, Op::Eq}); + else if (upper == "INTO") tokens_.push_back({Tok::Into, word, Op::Eq}); + else if (upper == "VALUES") tokens_.push_back({Tok::Values, word, Op::Eq}); + else if (upper == "NOT") tokens_.push_back({Tok::Not, word, Op::Eq}); + else if (upper == "NULL") tokens_.push_back({Tok::Null, word, Op::Eq}); else tokens_.push_back({Tok::Identifier, word, Op::Eq}); continue; } @@ -79,9 +81,11 @@ void Parser::tokenize() { } // Single-char punctuation. 
- if (c == ',') { tokens_.push_back({Tok::Comma, ",", Op::Eq}); ++i; continue; } - if (c == '*') { tokens_.push_back({Tok::Star, "*", Op::Eq}); ++i; continue; } - if (c == '.') { tokens_.push_back({Tok::Dot, ".", Op::Eq}); ++i; continue; } + if (c == ',') { tokens_.push_back({Tok::Comma, ",", Op::Eq}); ++i; continue; } + if (c == '*') { tokens_.push_back({Tok::Star, "*", Op::Eq}); ++i; continue; } + if (c == '.') { tokens_.push_back({Tok::Dot, ".", Op::Eq}); ++i; continue; } + if (c == '(') { tokens_.push_back({Tok::Lparen, "(", Op::Eq}); ++i; continue; } + if (c == ')') { tokens_.push_back({Tok::Rparen, ")", Op::Eq}); ++i; continue; } // Comparison operators. `<` and `>` may be 1 or 2 chars (`<` vs `<=`), // so peek ahead before committing to a length. @@ -122,9 +126,25 @@ const Parser::Token& Parser::expect(Tok kind, const char* what) { return consume(); } -// Top-level production: -// SELECT FROM {JOIN
ON = } [WHERE ] -SelectQuery Parser::parse() { +// Top-level dispatch: SELECT / CREATE TABLE / INSERT INTO. The first +// keyword picks the production; each sub-parser leaves the tokens +// positioned just past its statement, and parse() rejects trailing input. +Statement Parser::parse() { + Statement out; + switch (peek().kind) { + case Tok::Select: out = parseSelect(); break; + case Tok::Create: out = parseCreateTable(); break; + case Tok::Insert: out = parseInsert(); break; + default: + throw std::runtime_error( + "expected SELECT, CREATE, or INSERT at start of statement"); + } + expect(Tok::End, "end of input"); + return out; +} + +// SELECT FROM
{JOIN
ON = } [WHERE ] +SelectQuery Parser::parseSelect() { SelectQuery q; expect(Tok::Select, "SELECT"); @@ -140,9 +160,6 @@ SelectQuery Parser::parse() { consume(); parseWhere(q); } - - // Reject trailing tokens — the whole input must be a single statement. - expect(Tok::End, "end of input"); return q; } @@ -218,3 +235,111 @@ void Parser::parseWhere(SelectQuery& q) { q.where = c; } + +// CREATE TABLE ({, }) +CreateTableStmt Parser::parseCreateTable() { + CreateTableStmt s; + expect(Tok::Create, "CREATE"); + expect(Tok::Table, "TABLE"); + s.table = expect(Tok::Identifier, "table name").text; + + expect(Tok::Lparen, "'(' before column list"); + // At least one column is required. + s.columns.push_back(parseColumnDef()); + while (peek().kind == Tok::Comma) { + consume(); + s.columns.push_back(parseColumnDef()); + } + expect(Tok::Rparen, "')' after column list"); + return s; +} + +// One CREATE TABLE column: [NOT NULL] +// The type keyword tokenizes as Identifier; the analyzer maps the surface +// text (e.g. "INT", "BIGINT") to a Type, mirroring how WHERE literals +// stay as raw text until the analyzer types them. +ColumnDef Parser::parseColumnDef() { + ColumnDef c; + c.name = expect(Tok::Identifier, "column name").text; + c.type_name = expect(Tok::Identifier, "column type").text; + + // Optional `NOT NULL`. NULL alone is not accepted as a constraint — + // columns are nullable by default, so writing it would only be noise. + if (peek().kind == Tok::Not) { + consume(); + expect(Tok::Null, "NULL after NOT"); + c.nullable = false; + } + return c; +} + +// INSERT INTO
[({,})] VALUES ({,}){, (...)} +InsertStmt Parser::parseInsert() { + InsertStmt s; + expect(Tok::Insert, "INSERT"); + expect(Tok::Into, "INTO"); + s.table = expect(Tok::Identifier, "table name").text; + + // Optional column list. A bare `(` here means the user is naming target + // columns; otherwise we go straight to VALUES. + if (peek().kind == Tok::Lparen) { + s.columns = parseInsertColumnList(); + } + + expect(Tok::Values, "VALUES"); + // At least one row is required. + s.rows.push_back(parseInsertRow()); + while (peek().kind == Tok::Comma) { + consume(); + s.rows.push_back(parseInsertRow()); + } + return s; +} + +// `(col{, col})` — used only for INSERT's optional target column list. +// Only bare identifiers; qualified `t.c` form makes no sense for a target. +std::vector Parser::parseInsertColumnList() { + std::vector out; + expect(Tok::Lparen, "'(' before INSERT column list"); + out.push_back(expect(Tok::Identifier, "column name").text); + while (peek().kind == Tok::Comma) { + consume(); + out.push_back(expect(Tok::Identifier, "column name").text); + } + expect(Tok::Rparen, "')' after INSERT column list"); + return out; +} + +// `({, })` — one row of values for INSERT. +std::vector Parser::parseInsertRow() { + std::vector out; + expect(Tok::Lparen, "'(' before VALUES row"); + out.push_back(parseInsertLiteral()); + while (peek().kind == Tok::Comma) { + consume(); + out.push_back(parseInsertLiteral()); + } + expect(Tok::Rparen, "')' after VALUES row"); + return out; +} + +// One literal cell: number, single-quoted string, or NULL keyword. 
+InsertLiteral Parser::parseInsertLiteral() { + const Token& t = peek(); + InsertLiteral lit; + if (t.kind == Tok::Number) { + lit.text = t.text; + consume(); + } else if (t.kind == Tok::String) { + lit.text = t.text; + lit.is_string = true; + consume(); + } else if (t.kind == Tok::Null) { + lit.is_null = true; + consume(); + } else { + throw std::runtime_error( + "expected number, string, or NULL in VALUES"); + } + return lit; +} diff --git a/src/sql/parser.h b/src/sql/parser.h index 0440e67..b0372e8 100644 --- a/src/sql/parser.h +++ b/src/sql/parser.h @@ -2,6 +2,7 @@ #include #include +#include #include // Comparison operators allowed in a WHERE clause. @@ -40,21 +41,65 @@ struct SelectQuery { std::optional where; }; -// Parses a single SELECT statement from a SQL string. -// Construction tokenizes; parse() walks the tokens to produce a SelectQuery. +// One column declaration inside a CREATE TABLE column list. +// `type_name` is the surface keyword as written (e.g. "INT", "BIGINT", +// "TEXT") — the analyzer maps it to a Type. `nullable` defaults to true; +// `NOT NULL` flips it to false. +struct ColumnDef { + std::string name; + std::string type_name; + bool nullable = true; +}; + +// AST for `CREATE TABLE ({, })`. +struct CreateTableStmt { + std::string table; + std::vector columns; +}; + +// One literal cell in an INSERT VALUES list. Mirrors the WHERE pattern: +// the parser stores the raw text plus enough flags for the analyzer to +// pick the right typed Value. When `is_null` is true the other fields +// are unused. +struct InsertLiteral { + std::string text; + bool is_string = false; + bool is_null = false; +}; + +// AST for `INSERT INTO [()] VALUES ({,}){, (...)}`. +// `columns` is empty when the user omitted the column list, meaning +// "values are in schema column order"; otherwise it lists the explicit +// target columns in source order. `rows` is non-empty (the parser +// rejects a trailing VALUES with no row). 
+struct InsertStmt { + std::string table; + std::vector columns; + std::vector> rows; +}; + +// A parsed top-level SQL statement. The parser dispatches on the first +// keyword and produces exactly one of these alternatives. +using Statement = std::variant; + +// Parses a single SQL statement from a SQL string. +// Construction tokenizes; parse() walks the tokens to produce a Statement. // Throws std::runtime_error on any lex or parse error. class Parser { public: explicit Parser(std::string sql); - SelectQuery parse(); + Statement parse(); private: - // Token kinds the lexer emits. Keywords (Select/From/Where) are - // separated from generic Identifier so the parser can match on kind alone. + // Token kinds the lexer emits. Keywords are separated from generic + // Identifier so the parser can match on kind alone. enum class Tok { Select, From, Where, Join, On, + Create, Table, Insert, Into, Values, + Not, Null, Identifier, Number, String, Comma, Star, Dot, Op, + Lparen, Rparen, End }; @@ -77,7 +122,12 @@ class Parser { const Token& consume(); const Token& expect(Tok kind, const char* what); - // Grammar productions; each consumes tokens and writes into `q`. + // Top-level dispatch: SELECT / CREATE TABLE / INSERT. + SelectQuery parseSelect(); + CreateTableStmt parseCreateTable(); + InsertStmt parseInsert(); + + // SELECT sub-productions. void parseColumns(SelectQuery& q); void parseJoins(SelectQuery& q); void parseWhere(SelectQuery& q); @@ -86,4 +136,12 @@ class Parser { // the joined surface form (e.g. "id" or "users.id"). Used everywhere a // column may appear so qualified names work uniformly. std::string parseColumnRef(); + + // CREATE TABLE sub-productions. + ColumnDef parseColumnDef(); + + // INSERT sub-productions. 
+ std::vector parseInsertColumnList(); + std::vector parseInsertRow(); + InsertLiteral parseInsertLiteral(); }; diff --git a/src/sql/tuple.cpp b/src/sql/tuple.cpp index f963a45..9abf752 100644 --- a/src/sql/tuple.cpp +++ b/src/sql/tuple.cpp @@ -1,5 +1,7 @@ #include "src/sql/tuple.h" +#include "src/util/string.h" + #include #include #include @@ -98,6 +100,29 @@ Type typeFromCode(int32_t c) { } } +Type typeFromName(const std::string& name) { + const std::string up = util::toUpper(name); + if (up == "INT" || up == "INTEGER") return Type::Int32; + if (up == "BIGINT") return Type::Int64; + if (up == "BOOL" || up == "BOOLEAN") return Type::Bool; + if (up == "TEXT") return Type::Text; + throw std::runtime_error("unknown column type: '" + name + "'"); +} + +std::string valueToString(const Value& v) { + if (v.is_null) return "NULL"; + switch (v.type) { + case Type::Int32: return std::to_string(v.i32); + case Type::Int64: return std::to_string(v.i64); + case Type::Bool: return v.b ? "true" : "false"; + case Type::Text: return v.text; + } + // Unreachable: the switch above is exhaustive over Type. Keeping a + // throw rather than a fallback string so a future Type addition + // surfaces as an immediate failure rather than a silent "". + throw std::runtime_error("valueToString: unknown Type"); +} + bool operator==(const Value& a, const Value& b) { if (a.type != b.type) return false; if (a.is_null != b.is_null) return false; diff --git a/src/sql/tuple.h b/src/sql/tuple.h index 01c263d..dc05d8d 100644 --- a/src/sql/tuple.h +++ b/src/sql/tuple.h @@ -64,6 +64,21 @@ inline bool operator!=(const Value& a, const Value& b) { return !(a == b); } int32_t typeToCode(Type t); Type typeFromCode(int32_t c); +// Map a SQL surface type keyword (e.g. "INT", "BIGINT", "TEXT") to a +// Type. Case-insensitive. Throws std::runtime_error on anything we +// don't recognise. 
Accepted spellings: +// INT / INTEGER -> Int32 +// BIGINT -> Int64 +// BOOL / BOOLEAN -> Bool +// TEXT -> Text +Type typeFromName(const std::string& name); + +// Render a Value to a short human-readable string. NULL shows as +// "NULL"; Bool shows as "true" / "false"; Int32/Int64 use std::to_string; +// Text is returned verbatim. Used by the demo and any future EXPLAIN / +// REPL output. Not meant to be a round-trippable SQL literal. +std::string valueToString(const Value& v); + // Stateless codec converting a row of Values to/from the byte sequence // stored in a SlottedPage's tuple area. Layout: // diff --git a/src/util/string.cpp b/src/util/string.cpp new file mode 100644 index 0000000..bde7bb2 --- /dev/null +++ b/src/util/string.cpp @@ -0,0 +1,15 @@ +#include "src/util/string.h" + +#include + +namespace util { + +std::string toUpper(const std::string& s) { + std::string out = s; + for (char& c : out) { + c = static_cast(std::toupper(static_cast(c))); + } + return out; +} + +} // namespace util diff --git a/src/util/string.h b/src/util/string.h new file mode 100644 index 0000000..893b7de --- /dev/null +++ b/src/util/string.h @@ -0,0 +1,15 @@ +#pragma once + +#include + +// Generic string helpers. Reach for these from any layer; nothing in +// here is SQL- or storage-specific. +namespace util { + +// Return an ASCII-uppercased copy of `s`. Non-ASCII bytes are passed +// through unchanged. Used wherever we need to compare a user-typed +// keyword case-insensitively (lexer keywords, type-name resolution, +// future config flags, ...). +std::string toUpper(const std::string& s); + +} // namespace util diff --git a/tests/sql/test_analyzer.cpp b/tests/sql/test_analyzer.cpp index e427fbd..feaf056 100644 --- a/tests/sql/test_analyzer.cpp +++ b/tests/sql/test_analyzer.cpp @@ -35,7 +35,7 @@ Schema postsSchema() { // Convenience: parse + analyze in one go. 
BoundSelect parseAnalyze(const Analyzer& az, const std::string& sql) { Parser p(sql); - SelectQuery q = p.parse(); + SelectQuery q = std::get(p.parse()); return az.analyze(q); } @@ -411,6 +411,297 @@ TEST_CASE("Bare column appearing in two joined tables is ambiguous") { std::runtime_error); } +// ---- CREATE TABLE ------------------------------------------------------ + +namespace { + +CreateTableStmt parseCreate(const std::string& sql) { + Parser p(sql); + return std::get(p.parse()); +} + +InsertStmt parseInsertStmt(const std::string& sql) { + Parser p(sql); + return std::get(p.parse()); +} + +} // namespace + +TEST_CASE("CREATE TABLE binds every supported type keyword") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + Analyzer az(cat); + + auto b = az.analyze(parseCreate( + "CREATE TABLE t (a INT, b INTEGER, c BIGINT, " + "d BOOL, e BOOLEAN, f TEXT)")); + + CHECK(b.name == "t"); + REQUIRE(b.schema.columns.size() == 6); + CHECK(b.schema.columns[0].type == Type::Int32); + CHECK(b.schema.columns[1].type == Type::Int32); + CHECK(b.schema.columns[2].type == Type::Int64); + CHECK(b.schema.columns[3].type == Type::Bool); + CHECK(b.schema.columns[4].type == Type::Bool); + CHECK(b.schema.columns[5].type == Type::Text); + // Default nullability is true; analyzer leaves it untouched. 
+ for (const auto& c : b.schema.columns) CHECK(c.nullable == true); +} + +TEST_CASE("CREATE TABLE carries NOT NULL through to the bound schema") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + Analyzer az(cat); + + auto b = az.analyze(parseCreate( + "CREATE TABLE users (id INT NOT NULL, name TEXT)")); + + CHECK(b.schema.columns[0].nullable == false); + CHECK(b.schema.columns[1].nullable == true); +} + +TEST_CASE("CREATE TABLE rejects a duplicate column name") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseCreate( + "CREATE TABLE t (a INT, b TEXT, a INT)")), std::runtime_error); +} + +TEST_CASE("CREATE TABLE rejects an unknown type keyword") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseCreate( + "CREATE TABLE t (a FLOAT)")), std::runtime_error); +} + +TEST_CASE("CREATE TABLE rejects a name already in the catalog") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseCreate( + "CREATE TABLE users (id INT)")), std::runtime_error); +} + +// ---- INSERT ------------------------------------------------------------ + +TEST_CASE("INSERT without a column list expects schema-order values") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + auto b = az.analyze(parseInsertStmt( + "INSERT INTO users VALUES " + "(1, 'alice', 30, 80000, 1), (2, 'bob', NULL, 60000, 0)")); + + REQUIRE(b.table != nullptr); + CHECK(b.table->name == "users"); + REQUIRE(b.rows.size() == 2); + REQUIRE(b.rows[0].size() == 5); + + // 
Row 0: every cell is non-null and matches the schema. + CHECK(b.rows[0][0].i32 == 1); + CHECK(b.rows[0][0].type == Type::Int32); + CHECK(b.rows[0][1].text == "alice"); + CHECK(b.rows[0][1].type == Type::Text); + CHECK(b.rows[0][2].i32 == 30); + CHECK(b.rows[0][3].i64 == 80000); + CHECK(b.rows[0][3].type == Type::Int64); + CHECK(b.rows[0][4].b == true); + CHECK(b.rows[0][4].type == Type::Bool); + + // Row 1: age is NULL, allowed because usersSchema()'s age is nullable. + CHECK(b.rows[1][2].is_null); + CHECK(b.rows[1][2].type == Type::Int32); + CHECK_FALSE(b.rows[1][0].is_null); + CHECK(b.rows[1][4].b == false); +} + +TEST_CASE("INSERT with a column list reorders values into schema order") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + // Listed in (name, salary, active, id) order; missing 'age' which + // is nullable and so should default to NULL. + auto b = az.analyze(parseInsertStmt( + "INSERT INTO users (name, salary, active, id) " + "VALUES ('alice', 80000, 1, 1)")); + + REQUIRE(b.rows.size() == 1); + REQUIRE(b.rows[0].size() == 5); + // Schema order: id, name, age, salary, active + CHECK(b.rows[0][0].i32 == 1); + CHECK(b.rows[0][1].text == "alice"); + CHECK(b.rows[0][2].is_null); + CHECK(b.rows[0][2].type == Type::Int32); + CHECK(b.rows[0][3].i64 == 80000); + CHECK(b.rows[0][4].b == true); +} + +TEST_CASE("INSERT with too few values throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users VALUES (1, 'alice')")), std::runtime_error); +} + +TEST_CASE("INSERT with too many values throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer 
az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users VALUES (1, 'alice', 30, 80000, 1, 'extra')")), + std::runtime_error); +} + +TEST_CASE("INSERT NULL for a non-nullable column throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + // 'id' is non-nullable in usersSchema(). + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users VALUES (NULL, 'alice', 30, 80000, 1)")), + std::runtime_error); +} + +TEST_CASE("INSERT omitting a non-nullable column throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + // 'id' is non-nullable; column list leaves it out, so the analyzer + // would fill it with NULL — which is rejected. + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users (name, age, salary, active) " + "VALUES ('alice', 30, 80000, 1)")), std::runtime_error); +} + +TEST_CASE("INSERT into an unknown table throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO ghosts VALUES (1)")), std::runtime_error); +} + +TEST_CASE("INSERT with an unknown column name throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users (nope) VALUES (1)")), std::runtime_error); +} + +TEST_CASE("INSERT with a duplicate column in the list throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( 
+ "INSERT INTO users (id, id) VALUES (1, 2)")), std::runtime_error); +} + +TEST_CASE("INSERT with a string literal into an Int32 column throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users VALUES ('not-an-int', 'alice', 30, 80000, 1)")), + std::runtime_error); +} + +TEST_CASE("INSERT with a number that overflows Int32 throws") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + CHECK_THROWS_AS(az.analyze(parseInsertStmt( + "INSERT INTO users VALUES (9999999999, 'alice', 30, 80000, 1)")), + std::runtime_error); +} + +// ---- analyzeStatement dispatch ---------------------------------------- + +TEST_CASE("analyzeStatement returns the matching bound variant") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + Catalog cat = Catalog::create(&bp); + cat.createTable("users", usersSchema()); + Analyzer az(cat); + + { + Parser p("SELECT * FROM users"); + auto b = az.analyzeStatement(p.parse()); + CHECK(std::holds_alternative(b)); + } + { + Parser p("CREATE TABLE t (a INT)"); + auto b = az.analyzeStatement(p.parse()); + CHECK(std::holds_alternative(b)); + } + { + Parser p("INSERT INTO users VALUES (1, 'a', 30, 80000, 1)"); + auto b = az.analyzeStatement(p.parse()); + CHECK(std::holds_alternative(b)); + } +} + TEST_CASE("Multiple chained JOINs all resolve") { TempFile tf; DiskManager dm(tf.path()); diff --git a/tests/sql/test_executor.cpp b/tests/sql/test_executor.cpp index 2deb9ad..c806250 100644 --- a/tests/sql/test_executor.cpp +++ b/tests/sql/test_executor.cpp @@ -82,15 +82,25 @@ void seedPosts(BufferPool& bp, const Catalog::TableInfo& info) { } // Convenience: parse + analyze + execute in one shot. 
-ExecResult run(const Catalog& cat, BufferPool& bp, const std::string& sql) { +ExecResult run(Catalog& cat, BufferPool& bp, const std::string& sql) { Parser p(sql); - SelectQuery q = p.parse(); + SelectQuery q = std::get(p.parse()); Analyzer az(cat); BoundSelect bs = az.analyze(q); - Executor ex(&bp); + Executor ex(&bp, &cat); return ex.execute(std::move(bs)); } +// Variant of `run` that drives the analyzer's variant entry point — the +// only path through which CREATE TABLE / INSERT can reach the executor. +ExecResult runStatement(Catalog& cat, BufferPool& bp, const std::string& sql) { + Parser p(sql); + Analyzer az(cat); + BoundStatement bound = az.analyzeStatement(p.parse()); + Executor ex(&bp, &cat); + return ex.execute(std::move(bound)); +} + std::vector textCol(const ExecResult& r, size_t c) { std::vector out; out.reserve(r.rows.size()); @@ -341,6 +351,141 @@ TEST_CASE("Three-table chained JOIN") { CHECK(i32Col(r, 1) == std::vector{100, 300}); } +// ---- CREATE TABLE / INSERT through the executor ------------------------ + +TEST_CASE("CREATE TABLE adds the table to the catalog and SELECT sees it") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + auto r = runStatement(cat, bp, + "CREATE TABLE t (a INT NOT NULL, b TEXT)"); + + CHECK(r.rows_affected == 0); + CHECK(r.rows.empty()); + REQUIRE(cat.hasTable("t")); + const auto* info = cat.getTable("t"); + REQUIRE(info != nullptr); + REQUIRE(info->schema.columns.size() == 2); + CHECK(info->schema.columns[0].name == "a"); + CHECK(info->schema.columns[0].type == Type::Int32); + CHECK(info->schema.columns[0].nullable == false); + CHECK(info->schema.columns[1].name == "b"); + CHECK(info->schema.columns[1].type == Type::Text); + CHECK(info->schema.columns[1].nullable == true); + + // Empty SELECT works against the freshly-created table. 
+ auto sel = run(cat, bp, "SELECT * FROM t"); + CHECK(sel.rows.empty()); + CHECK(sel.column_names == std::vector{"a", "b"}); +} + +TEST_CASE("INSERT writes rows visible to a follow-up SELECT") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + runStatement(cat, bp, + "CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, age INT)"); + + auto ins = runStatement(cat, bp, + "INSERT INTO users VALUES " + "(1, 'alice', 30), (2, 'bob', NULL), (3, 'carol', 40)"); + CHECK(ins.rows_affected == 3); + CHECK(ins.rows.empty()); + + auto sel = run(cat, bp, "SELECT id, name, age FROM users"); + REQUIRE(sel.rows.size() == 3); + CHECK(sel.rows[0][0].i32 == 1); + CHECK(sel.rows[0][1].text == "alice"); + CHECK(sel.rows[0][2].i32 == 30); + CHECK(sel.rows[1][0].i32 == 2); + CHECK(sel.rows[1][1].text == "bob"); + CHECK(sel.rows[1][2].is_null); + CHECK(sel.rows[2][0].i32 == 3); + CHECK(sel.rows[2][1].text == "carol"); +} + +TEST_CASE("INSERT with explicit column list reorders into schema order on disk") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + runStatement(cat, bp, + "CREATE TABLE t (id INT NOT NULL, name TEXT, score INT)"); + runStatement(cat, bp, + "INSERT INTO t (score, id, name) VALUES (99, 1, 'alice')"); + + auto sel = run(cat, bp, "SELECT * FROM t"); + REQUIRE(sel.rows.size() == 1); + CHECK(sel.rows[0][0].i32 == 1); // id + CHECK(sel.rows[0][1].text == "alice"); // name + CHECK(sel.rows[0][2].i32 == 99); // score +} + +TEST_CASE("INSERT followed by WHERE and JOIN through full SQL") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + runStatement(cat, bp, + "CREATE TABLE u (id INT NOT NULL, name TEXT NOT NULL)"); + runStatement(cat, bp, + "CREATE TABLE p (uid INT NOT NULL, title TEXT NOT NULL)"); + runStatement(cat, bp, + "INSERT INTO u VALUES (1, 'a'), (2, 'b'), (3, 'c')"); + 
runStatement(cat, bp, + "INSERT INTO p VALUES (1, 'x'), (1, 'y'), (3, 'z')"); + + auto r = run(cat, bp, + "SELECT u.name, p.title FROM u JOIN p ON u.id = p.uid " + "WHERE u.id != 2"); + CHECK(textCol(r, 0) == std::vector{"a", "a", "c"}); + CHECK(textCol(r, 1) == std::vector{"x", "y", "z"}); +} + +TEST_CASE("CREATE TABLE on an existing name throws at the executor") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + + runStatement(cat, bp, "CREATE TABLE t (a INT)"); + // The analyzer normally rejects this first. To exercise the + // executor's own catalog-level guard, build a BoundCreateTable by + // hand and invoke the per-statement primitive. + BoundCreateTable bct; + bct.name = "t"; + bct.schema.columns.push_back({"a", Type::Int32, true}); + Executor ex(&bp, &cat); + CHECK_THROWS_AS(ex.execute(std::move(bct)), std::runtime_error); +} + +TEST_CASE("Cold reopen sees rows inserted via SQL in a previous session") { + TempFile tf; + { + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat = Catalog::create(&bp); + runStatement(cat, bp, + "CREATE TABLE k (n INT NOT NULL, label TEXT NOT NULL)"); + runStatement(cat, bp, + "INSERT INTO k VALUES (10, 'ten'), (20, 'twenty'), (30, 'thirty')"); + bp.flushAll(); + } + + DiskManager dm(tf.path()); + BufferPool bp(8, &dm); + Catalog cat(&bp); + + auto r = run(cat, bp, "SELECT label FROM k WHERE n >= 20"); + CHECK(textCol(r, 0) == std::vector{"twenty", "thirty"}); +} + TEST_CASE("Cold reopen: data seeded in one session is visible to a fresh executor") { // Mirrors main.cpp's flow: write + flush, drop the in-memory state, // reopen the file, and run a query against a fresh Catalog and diff --git a/tests/sql/test_parser.cpp b/tests/sql/test_parser.cpp index 080cb83..87e30b9 100644 --- a/tests/sql/test_parser.cpp +++ b/tests/sql/test_parser.cpp @@ -5,10 +5,23 @@ #include #include +#include +// Most cases here exercise the SELECT path — parse the statement and 
+// pull the SELECT alternative out, asserting the variant kind matches. static SelectQuery parse(const std::string& sql) { Parser p(sql); - return p.parse(); + return std::get(p.parse()); +} + +static CreateTableStmt parseCreate(const std::string& sql) { + Parser p(sql); + return std::get(p.parse()); +} + +static InsertStmt parseInsert(const std::string& sql) { + Parser p(sql); + return std::get(p.parse()); } TEST_CASE("opToString covers every operator") { @@ -125,3 +138,147 @@ TEST_CASE("WHERE with non-literal RHS throws") { TEST_CASE("Empty input throws") { CHECK_THROWS_AS(parse(""), std::runtime_error); } + +// ---- CREATE TABLE ------------------------------------------------------- + +TEST_CASE("CREATE TABLE with mixed types and one NOT NULL constraint") { + auto s = parseCreate( + "CREATE TABLE users (id INT NOT NULL, name TEXT, age INT)"); + + CHECK(s.table == "users"); + REQUIRE(s.columns.size() == 3); + + CHECK(s.columns[0].name == "id"); + CHECK(s.columns[0].type_name == "INT"); + CHECK(s.columns[0].nullable == false); + + CHECK(s.columns[1].name == "name"); + CHECK(s.columns[1].type_name == "TEXT"); + CHECK(s.columns[1].nullable == true); + + CHECK(s.columns[2].name == "age"); + CHECK(s.columns[2].type_name == "INT"); + CHECK(s.columns[2].nullable == true); +} + +TEST_CASE("CREATE TABLE preserves the surface form of the type keyword") { + // The analyzer is responsible for mapping these; the parser just + // hands them through verbatim. Lower-case input is preserved too. 
+ auto s = parseCreate( + "create table t (a Bigint, b boolean not null, c text)"); + REQUIRE(s.columns.size() == 3); + CHECK(s.columns[0].type_name == "Bigint"); + CHECK(s.columns[1].type_name == "boolean"); + CHECK(s.columns[1].nullable == false); + CHECK(s.columns[2].type_name == "text"); +} + +TEST_CASE("CREATE TABLE with empty column list throws") { + CHECK_THROWS_AS(parseCreate("CREATE TABLE t ()"), std::runtime_error); +} + +TEST_CASE("CREATE TABLE without parentheses throws") { + CHECK_THROWS_AS( + parseCreate("CREATE TABLE t id INT"), std::runtime_error); +} + +TEST_CASE("CREATE TABLE with NOT but no NULL throws") { + CHECK_THROWS_AS( + parseCreate("CREATE TABLE t (id INT NOT)"), std::runtime_error); +} + +// ---- INSERT ------------------------------------------------------------- + +TEST_CASE("INSERT INTO with no column list and a single row") { + auto s = parseInsert("INSERT INTO users VALUES (1, 'alice', 30)"); + + CHECK(s.table == "users"); + CHECK(s.columns.empty()); + REQUIRE(s.rows.size() == 1); + REQUIRE(s.rows[0].size() == 3); + + CHECK(s.rows[0][0].text == "1"); + CHECK_FALSE(s.rows[0][0].is_string); + CHECK_FALSE(s.rows[0][0].is_null); + + CHECK(s.rows[0][1].text == "alice"); + CHECK(s.rows[0][1].is_string); + + CHECK(s.rows[0][2].text == "30"); + CHECK_FALSE(s.rows[0][2].is_string); +} + +TEST_CASE("INSERT with explicit column list and multiple rows") { + auto s = parseInsert( + "INSERT INTO users (id, name) VALUES (1, 'alice'), (2, 'bob')"); + + CHECK(s.columns == std::vector{"id", "name"}); + REQUIRE(s.rows.size() == 2); + CHECK(s.rows[0][1].text == "alice"); + CHECK(s.rows[1][1].text == "bob"); +} + +TEST_CASE("INSERT with NULL literal carries the is_null flag") { + auto s = parseInsert("INSERT INTO t VALUES (1, NULL, 'x')"); + REQUIRE(s.rows.size() == 1); + REQUIRE(s.rows[0].size() == 3); + + CHECK_FALSE(s.rows[0][0].is_null); + CHECK(s.rows[0][1].is_null); + CHECK(s.rows[0][1].text.empty()); + CHECK(s.rows[0][1].is_string == false); + 
CHECK_FALSE(s.rows[0][2].is_null); + CHECK(s.rows[0][2].is_string); +} + +TEST_CASE("INSERT keywords are case-insensitive") { + auto s = parseInsert("insert into t values (1)"); + CHECK(s.table == "t"); + REQUIRE(s.rows.size() == 1); + CHECK(s.rows[0][0].text == "1"); +} + +TEST_CASE("INSERT without VALUES throws") { + CHECK_THROWS_AS( + parseInsert("INSERT INTO t (id) (1)"), std::runtime_error); +} + +TEST_CASE("INSERT with empty VALUES row throws") { + CHECK_THROWS_AS( + parseInsert("INSERT INTO t VALUES ()"), std::runtime_error); +} + +TEST_CASE("INSERT with no rows after VALUES throws") { + CHECK_THROWS_AS( + parseInsert("INSERT INTO t VALUES"), std::runtime_error); +} + +TEST_CASE("INSERT trailing comma without another row throws") { + CHECK_THROWS_AS( + parseInsert("INSERT INTO t VALUES (1),"), std::runtime_error); +} + +// ---- top-level dispatch ------------------------------------------------- + +TEST_CASE("parse() rejects input that doesn't start with a known keyword") { + Parser p("UPDATE t SET a = 1"); + CHECK_THROWS_AS(p.parse(), std::runtime_error); +} + +TEST_CASE("parse() returns the right alternative for each statement kind") { + { + Parser p("SELECT * FROM t"); + auto s = p.parse(); + CHECK(std::holds_alternative(s)); + } + { + Parser p("CREATE TABLE t (a INT)"); + auto s = p.parse(); + CHECK(std::holds_alternative(s)); + } + { + Parser p("INSERT INTO t VALUES (1)"); + auto s = p.parse(); + CHECK(std::holds_alternative(s)); + } +}