jayphan14 · jayphan14 · May 14, 2026 · May 6, 2026 · May 6, 2026 · May 14, 2026
diff --git a/Makefile b/Makefile
@@ -4,6 +4,7 @@ CXXFLAGS = -std=c++17 -Wall -Wextra -O2 -I.
 BUILD_DIR = build
 
 DBMS_OBJS = $(BUILD_DIR)/main.o \
+            $(BUILD_DIR)/src/util/string.o \
             $(BUILD_DIR)/src/sql/parser.o \
             $(BUILD_DIR)/src/storage/disk_manager.o \
             $(BUILD_DIR)/src/storage/buffer_pool.o \
@@ -27,6 +28,7 @@ TEST_OBJS = $(BUILD_DIR)/tests/sql/test_parser.o \
             $(BUILD_DIR)/tests/sql/test_analyzer.o \
             $(BUILD_DIR)/tests/sql/test_executor.o \
             $(BUILD_DIR)/tests/sql/test_operators.o \
+            $(BUILD_DIR)/src/util/string.o \
             $(BUILD_DIR)/src/sql/parser.o \
             $(BUILD_DIR)/src/storage/disk_manager.o \
             $(BUILD_DIR)/src/storage/buffer_pool.o \

diff --git a/README.md b/README.md
@@ -199,8 +199,8 @@ twice in one query.
 
 ### What's not built yet
 
-- DML (`INSERT` / `UPDATE` / `DELETE`) at the SQL surface — rows are
-  inserted today via `HeapFile::insert` + `TupleCodec::encode`, not SQL.
+- `UPDATE` / `DELETE` at the SQL surface (`CREATE TABLE` and `INSERT`
+  are wired through Parser → Analyzer → Executor — see `grammar.md`).
 - `ORDER BY`, `LIMIT`, aggregates, expressions in the SELECT list, and
   table aliases.
 - Alternative operators (`HashJoin`, `IndexScan`, ...) and a real
@@ -233,11 +233,12 @@ Run the demo (`main.cpp`):
 
 `main.cpp` is one end-to-end flow that exercises every layer:
 
-1. open a fresh database file, create the catalog, declare a `users` and
-   a `posts` table;
-2. seed both tables (5 rows each) via `TupleCodec` + `HeapFile`, flush;
+1. open a fresh database file and bootstrap the catalog (system-table
+   pages 0/1);
+2. issue `CREATE TABLE` + multi-row `INSERT` for `users` and `posts`
+   through Parser → Analyzer → Executor; flush;
 3. *cold-reopen* the file with a brand-new `BufferPool` and `Catalog`;
-4. run a handful of SQL strings (including filters and a join)
+4. run a handful of SELECT strings (including filters and a join)
    through Parser → Analyzer → Planner → Executor and print each
    result as a padded table.
 

diff --git a/grammar.md b/grammar.md
@@ -0,0 +1,116 @@
+# Supported SQL grammar
+
+This is the surface grammar accepted by `Parser` (`src/sql/parser.{h,cpp}`).
+Notation: `<x>` is a non-terminal, `[x]` is optional, `{x}` is zero-or-more,
+`UPPERCASE` keywords are case-insensitive, single-quoted strings are literal
+text. Whitespace between tokens is insignificant.
+
+The parser is a single-statement parser: each call to `Parser::parse()`
+consumes exactly one statement and rejects trailing tokens.
+
+## Top-level
+
+```
+<statement>     ::= <select-stmt>
+                  | <create-table-stmt>
+                  | <insert-stmt>
+```
+
+`Parser::parse()` returns a `Statement` variant whose alternative reflects
+the production matched.
+
+## SELECT
+
+```
+<select-stmt>   ::= SELECT <select-list>
+                    FROM <ident>
+                    {<join-clause>}
+                    [WHERE <condition>]
+
+<select-list>   ::= '*'
+                  | <column-ref> {',' <column-ref>}
+
+<column-ref>    ::= <ident> ['.' <ident>]
+
+<join-clause>   ::= JOIN <ident> ON <column-ref> '=' <column-ref>
+
+<condition>     ::= <column-ref> <comparison-op> <literal>
+
+<comparison-op> ::= '=' | '!=' | '<' | '>' | '<=' | '>='
+
+<literal>       ::= <number> | <string>
+```
+
+Notes:
+- `JOIN` is inner join only; `ON` accepts one equality between two columns.
+- `WHERE` is a single predicate — no `AND` / `OR` yet.
+- `WHERE`'s right-hand side must be a literal; column-vs-column is rejected.
+
+## CREATE TABLE
+
+```
+<create-table-stmt>
+                ::= CREATE TABLE <ident> '(' <column-def> {',' <column-def>} ')'
+
+<column-def>    ::= <ident> <type-name> [NOT NULL]
+
+<type-name>     ::= <ident>
+```
+
+Notes:
+- The column list must contain at least one column.
+- `<type-name>` is any identifier; the analyzer is responsible for
+  mapping the surface text (e.g. `INT`, `BIGINT`, `BOOL`, `TEXT`) to a
+  concrete `Type`. The parser preserves the original casing.
+- Columns are nullable by default. `NOT NULL` is the only column
+  constraint accepted.
+
+## INSERT
+
+```
+<insert-stmt>   ::= INSERT INTO <ident>
+                    [<insert-column-list>]
+                    VALUES <values-row> {',' <values-row>}
+
+<insert-column-list>
+                ::= '(' <ident> {',' <ident>} ')'
+
+<values-row>    ::= '(' <insert-literal> {',' <insert-literal>} ')'
+
+<insert-literal>
+                ::= <number> | <string> | NULL
+```
+
+Notes:
+- The optional column list scopes the values to specific columns; when
+  omitted, the analyzer maps row values to columns in schema order.
+- At least one row is required after `VALUES`.
+- `NULL` is accepted as a literal only in this position, not in `WHERE`
+  (the analyzer rejects it against a non-nullable column). Numeric and
+  string literals carry the same flags as in `WHERE`.
+
+## Lexical rules
+
+- **Identifiers**: letter or `_`, then letters, digits, or `_`.
+- **Numbers**: one or more decimal digits. No sign, no decimal point.
+- **Strings**: single-quoted; no escape sequences (a `'` ends the string).
+- **Keywords** (case-insensitive, reserved): `SELECT`, `FROM`, `WHERE`,
+  `JOIN`, `ON`, `CREATE`, `TABLE`, `INSERT`, `INTO`, `VALUES`, `NOT`,
+  `NULL`.
+- **Punctuation**: `,` `*` `.` `(` `)`.
+- **Operators**: `=` `!=` `<` `>` `<=` `>=`.
+
+## Not yet supported
+
+- DML: `UPDATE`, `DELETE`.
+- DDL beyond `CREATE TABLE`: `DROP`, `ALTER`, indexes, constraints other
+  than `NOT NULL`.
+- `ORDER BY`, `LIMIT`, `GROUP BY`, aggregates.
+- Expressions in the SELECT list (only column references are accepted).
+- Compound `WHERE` predicates (`AND` / `OR`), parenthesised conditions,
+  `IN`, `LIKE`, `BETWEEN`, `IS NULL`.
+- Table aliases (so a table cannot appear twice in one query).
+- `INSERT ... SELECT`.
+- `TRUE` / `FALSE` literals (use `0` / `1` against `Bool` columns).
+- Numeric types beyond integer: floats, decimals, dates.
+- Comments (`--`, `/* */`).
diff --git a/main.cpp b/main.cpp
@@ -5,78 +5,26 @@
 #include "src/sql/tuple.h"
 #include "src/storage/buffer_pool.h"
 #include "src/storage/disk_manager.h"
-#include "src/storage/heap_file.h"
 
 #include <cstdint>
 #include <filesystem>
 #include <iostream>
 #include <stdexcept>
 #include <string>
-#include <tuple>
 #include <utility>
 #include <vector>
 
 // =============================================================================
-// End-to-end demo: seed a small users + posts dataset, cold-reopen the
-// database, then run a handful of SQL strings through Parser → Analyzer →
-// Executor and print the rows that come back.
+// End-to-end demo: run a CREATE TABLE, an INSERT, and a SELECT through
+// Parser → Analyzer → Executor against a fresh database to exercise the
+// full statement cycle.
 // =============================================================================
 
 namespace {
 
 const std::string kDbPath = "/tmp/dbms_demo.db";
 
-void seedUsers(BufferPool& bp, const Catalog::TableInfo& info) {
-    const std::vector<std::tuple<int32_t, std::string, int32_t>> rows = {
-        {1, "alice", 30},
-        {2, "bob",   25},
-        {3, "carol", 40},
-        {4, "dave",  19},
-        {5, "eve",   33},
-    };
-    HeapFile hf(&bp, info.root_page);
-    for (const auto& [id, name, age] : rows) {
-        const auto bytes = TupleCodec::encode(info.schema, {
-            Value::Int32(id),
-            Value::Text(name),
-            Value::Int32(age),
-        });
-        hf.insert(bytes.data(), bytes.size());
-    }
-}
-
-void seedPosts(BufferPool& bp, const Catalog::TableInfo& info) {
-    // (id, title, user_id) — user_id matches the users table above.
-    const std::vector<std::tuple<int32_t, std::string, int32_t>> rows = {
-        {100, "hello world",     1},
-        {101, "second post",     1},
-        {102, "carol's musings", 3},
-        {103, "eve at midnight", 5},
-        {104, "bob's silence",   2},
-    };
-    HeapFile hf(&bp, info.root_page);
-    for (const auto& [id, title, user_id] : rows) {
-        const auto bytes = TupleCodec::encode(info.schema, {
-            Value::Int32(id),
-            Value::Text(title),
-            Value::Int32(user_id),
-        });
-        hf.insert(bytes.data(), bytes.size());
-    }
-}
-
-std::string valueToString(const Value& v) {
-    if (v.is_null) return "NULL";
-    switch (v.type) {
-        case Type::Int32: return std::to_string(v.i32);
-        case Type::Int64: return std::to_string(v.i64);
-        case Type::Bool:  return v.b ? "true" : "false";
-        case Type::Text:  return v.text;
-    }
-    return "<?>";
-}
-
-void printResult(const ExecResult& r) {
+void printSelectResult(const ExecResult& r) {
     // Column widths: max of header length and any value length, with a
     // small floor so single-char columns aren't crammed.
     std::vector<size_t> widths(r.column_names.size());
@@ -117,76 +65,54 @@ void printResult(const ExecResult& r) {
               << (r.rows.size() == 1 ? "" : "s") << ")\n";
 }
 
-void runQuery(const Catalog& cat, BufferPool& bp, const std::string& sql) {
+// Run one SQL statement through Parser → Analyzer → Executor and print the
+// outcome: a padded table for SELECT, a Postgres-style command tag for
+// CREATE TABLE / INSERT. Exceptions are caught and printed as "error: ...".
+void runStatement(Catalog& cat, BufferPool& bp, const std::string& sql) {
     std::cout << "\nSQL: " << sql << "\n";
     try {
         Parser p(sql);
-        SelectQuery q = p.parse();
+        Statement stmt = p.parse();
         Analyzer az(cat);
-        BoundSelect bs = az.analyze(q);
-        Executor ex(&bp);
-        ExecResult r = ex.execute(std::move(bs));
-        printResult(r);
+        BoundStatement bound = az.analyzeStatement(stmt);
+        Executor ex(&bp, &cat);
+        ExecResult r = ex.execute(std::move(bound));
+
+        if (std::holds_alternative<SelectQuery>(stmt)) {
+            printSelectResult(r);
+        } else if (std::holds_alternative<CreateTableStmt>(stmt)) {
+            std::cout << "  CREATE TABLE\n";
+        } else {
+            std::cout << "  INSERT " << r.rows_affected << "\n";
+        }
     } catch (const std::exception& e) {
         std::cout << "  error: " << e.what() << "\n";
     }
 }
 
-void seedFreshDatabase() {
-    std::error_code ec;
-    std::filesystem::remove(kDbPath, ec);
-
-    DiskManager dm(kDbPath);
-    BufferPool bp(8, &dm);
-    Catalog cat = Catalog::create(&bp);
-
-    const Schema users_schema{{
-        {"id",   Type::Int32, false},
-        {"name", Type::Text,  false},
-        {"age",  Type::Int32, false},
-    }};
-    const Schema posts_schema{{
-        {"id",      Type::Int32, false},
-        {"title",   Type::Text,  false},
-        {"user_id", Type::Int32, false},
-    }};
-    cat.createTable("users", users_schema);
-    cat.createTable("posts", posts_schema);
-
-    seedUsers(bp, *cat.getTable("users"));
-    seedPosts(bp, *cat.getTable("posts"));
-    bp.flushAll();
-
-    std::cout << "[seed] wrote users + posts to " << kDbPath
-              << " (" << std::filesystem::file_size(kDbPath) << " bytes)\n";
-}
-
 }  // namespace
 
 int main() {
-    seedFreshDatabase();
+    std::error_code ec;
+    std::filesystem::remove(kDbPath, ec);
 
-    // Cold reopen — nothing is shared with the seeding phase except the file.
     DiskManager dm(kDbPath);
     BufferPool bp(8, &dm);
-    Catalog cat(&bp);
+    // Fresh file (removed above), so Catalog::create bootstraps the
+    // system tables (__tables at page 0, __columns at page 1); user
+    // tables are then created and populated via SQL below.
+    Catalog cat = Catalog::create(&bp);
 
-    std::cout << "\n[query] reopened db; tables:";
-    for (const auto& n : cat.tableNames()) std::cout << " " << n;
-    std::cout << "\n";
+    runStatement(cat, bp,
+        "CREATE TABLE users (id INT NOT NULL, "
+        "name TEXT NOT NULL, age INT NOT NULL)");
+    runStatement(cat, bp,
+        "INSERT INTO users VALUES "
+        "(1, 'alice', 30), (2, 'bob', 25), (3, 'carol', 40)");
+    runStatement(cat, bp, "SELECT * FROM users");
 
-    runQuery(cat, bp, "SELECT * FROM users");
-    runQuery(cat, bp, "SELECT name, age FROM users WHERE age > 25");
-    runQuery(cat, bp, "SELECT name FROM users WHERE name = 'alice'");
-    runQuery(cat, bp,
-        "SELECT users.name, posts.title "
-        "FROM users JOIN posts ON users.id = posts.user_id");
-    runQuery(cat, bp,
-        "SELECT users.name, posts.title "
-        "FROM users JOIN posts ON users.id = posts.user_id "
-        "WHERE users.age > 25");
+    bp.flushAll();
 
-    std::error_code ec;
     std::filesystem::remove(kDbPath, ec);
     return 0;
 }