// ============================================================================
// src/storage/disk_manager.h
// ============================================================================

#include <array>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <ios>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

constexpr std::size_t PAGE_SIZE = 4096;  // 4 KiB, matching the usual OS page size
using PageId = std::uint32_t;
// Sentinel meaning "no page". allocatePage() never returns this value.
constexpr PageId INVALID_PAGE_ID = std::numeric_limits<PageId>::max();

/*
Lowest layer of the storage stack: a thin wrapper over a single file that
reads and writes fixed-size pages by page_id. Page N lives at byte offset
N * PAGE_SIZE. The file grows only via allocatePage().

mydb.dat (one OS file):
NOTE: a single OS file holds the pages of many logical objects (catalog,
tables, indexes). This avoids the per-file overhead of many small files and
keeps the process under the OS limit on open file handles.
┌────────┬────────┬────────┬────────┬────────┬────────┬────────┐
│ page 0 │ page 1 │ page 2 │ page 3 │ page 4 │ page 5 │ page 6 │ ...
│catalog │ users  │ users  │ orders │ users  │ orders │  idx   │
└────────┴────────┴────────┴────────┴────────┴────────┴────────┘
*/
class DiskManager {
public:
    // Opens `filename` for paged read/write access, creating an empty file
    // if none exists. Throws std::runtime_error if the file cannot be
    // created or opened, if its size is not a multiple of PAGE_SIZE, or if
    // it holds more pages than PageId can address.
    explicit DiskManager(const std::string& filename);
    ~DiskManager();

    DiskManager(const DiskManager&) = delete;
    DiskManager& operator=(const DiskManager&) = delete;

    // Copies page `page_id` into `dest`, which must point at PAGE_SIZE
    // writable bytes. Throws std::runtime_error if `dest` is null, the id is
    // not an allocated page, or fewer than PAGE_SIZE bytes could be read.
    void readPage(PageId page_id, char* dest);

    // Overwrites page `page_id` with the PAGE_SIZE bytes at `src` and
    // flushes the stream. Throws std::runtime_error if `src` is null, the
    // id is not an allocated page, or the write/flush fails.
    void writePage(PageId page_id, const char* src);

    // Appends one zero-filled page to the file and returns its id. Ids are
    // handed out sequentially starting at 0. Throws std::runtime_error if
    // the id space is exhausted or the write/flush fails.
    PageId allocatePage();

    // Number of pages currently in the file (also the next id to allocate).
    PageId numPages() const;

private:
    std::string filename_;
    std::fstream file_;
    PageId next_page_id_;
};

// ============================================================================
// src/storage/disk_manager.cpp
// (in the repository this file starts by including the header above)
// ============================================================================

namespace {

// One page of zero bytes, written by allocatePage() to extend the file.
// Kept as a constant so allocation does not build a fresh buffer per call.
constexpr std::array<char, PAGE_SIZE> kZeroPage{};

// Byte offset of `page_id` within the file. Both operands are converted to
// std::streamoff before multiplying so the product is computed at the
// stream's offset width rather than PageId's 32 bits.
std::streamoff pageOffset(PageId page_id) {
    return static_cast<std::streamoff>(page_id) * static_cast<std::streamoff>(PAGE_SIZE);
}

}  // namespace

DiskManager::DiskManager(const std::string& filename)
    : filename_(filename), next_page_id_(0) {
    namespace fs = std::filesystem;

    // Ensure the file exists before opening for read+write; std::fstream with
    // in|out will not create a missing file on its own.
    if (!fs::exists(filename_)) {
        std::ofstream create(filename_, std::ios::binary);
        if (!create) {
            throw std::runtime_error("DiskManager: failed to create file '" + filename_ + "'");
        }
    }

    file_.open(filename_, std::ios::in | std::ios::out | std::ios::binary);
    if (!file_) {
        throw std::runtime_error("DiskManager: failed to open file '" + filename_ + "'");
    }

    const auto size = fs::file_size(filename_);
    if (size % PAGE_SIZE != 0) {
        throw std::runtime_error("DiskManager: file size " + std::to_string(size) +
                                 " is not a multiple of PAGE_SIZE");
    }

    // Reject files whose page count does not fit in PageId instead of
    // silently truncating it in the cast below.
    const auto page_count = size / PAGE_SIZE;
    if (page_count > INVALID_PAGE_ID) {
        throw std::runtime_error("DiskManager: file '" + filename_ +
                                 "' holds more pages than PageId can address");
    }
    next_page_id_ = static_cast<PageId>(page_count);
}

DiskManager::~DiskManager() {
    if (file_.is_open()) {
        file_.close();
    }
}

// Seek to where the page starts and read the next PAGE_SIZE bytes.
void DiskManager::readPage(PageId page_id, char* dest) {
    if (dest == nullptr) {
        throw std::runtime_error("DiskManager::readPage: dest must not be null");
    }
    if (page_id >= next_page_id_) {
        throw std::runtime_error("DiskManager::readPage: page_id " + std::to_string(page_id) +
                                 " out of range (numPages=" + std::to_string(next_page_id_) + ")");
    }
    file_.clear();  // drop any sticky error state left by an earlier operation
    file_.seekg(pageOffset(page_id), std::ios::beg);
    file_.read(dest, static_cast<std::streamsize>(PAGE_SIZE));
    // A failed seek leaves the stream in a failed state, so read() transfers
    // nothing and this check reports that case as well.
    if (file_.gcount() != static_cast<std::streamsize>(PAGE_SIZE)) {
        throw std::runtime_error("DiskManager::readPage: short read on page " + std::to_string(page_id));
    }
}

void DiskManager::writePage(PageId page_id, const char* src) {
    if (src == nullptr) {
        throw std::runtime_error("DiskManager::writePage: src must not be null");
    }
    if (page_id >= next_page_id_) {
        throw std::runtime_error("DiskManager::writePage: page_id " + std::to_string(page_id) +
                                 " out of range (numPages=" + std::to_string(next_page_id_) + ")");
    }
    file_.clear();
    file_.seekp(pageOffset(page_id), std::ios::beg);
    file_.write(src, static_cast<std::streamsize>(PAGE_SIZE));
    if (!file_) {
        throw std::runtime_error("DiskManager::writePage: write failed on page " + std::to_string(page_id));
    }
    // The bytes may still sit in the stream buffer; a failed flush means the
    // page did not reach the file, so it must be reported too.
    file_.flush();
    if (!file_) {
        throw std::runtime_error("DiskManager::writePage: flush failed on page " + std::to_string(page_id));
    }
}

// TODO: consider growing the file geometrically (e.g. doubling) to amortize
// the cost of extending it one page at a time.
PageId DiskManager::allocatePage() {
    // INVALID_PAGE_ID is reserved as a sentinel and must never be handed out.
    if (next_page_id_ == INVALID_PAGE_ID) {
        throw std::runtime_error("DiskManager::allocatePage: page id space exhausted");
    }
    const PageId new_id = next_page_id_;
    file_.clear();
    file_.seekp(pageOffset(new_id), std::ios::beg);
    file_.write(kZeroPage.data(), static_cast<std::streamsize>(PAGE_SIZE));
    if (!file_) {
        throw std::runtime_error("DiskManager::allocatePage: write failed");
    }
    file_.flush();
    if (!file_) {
        throw std::runtime_error("DiskManager::allocatePage: flush failed");
    }
    // Only count the page once it has been written successfully.
    ++next_page_id_;
    return new_id;
}

PageId DiskManager::numPages() const {
    return next_page_id_;
}
+std::vector makePattern(uint8_t seed) { + std::vector buf(PAGE_SIZE); + for (size_t i = 0; i < PAGE_SIZE; ++i) { + buf[i] = static_cast((i + seed) & 0xff); + } + return buf; +} + +} // namespace + +TEST_CASE("allocate-write-read round trip (usage demo)") { + TempFile tf; + DiskManager dm(tf.path()); + + // 1. Allocate a page — file grows by PAGE_SIZE bytes. + const PageId pid = dm.allocatePage(); + CHECK(pid == 0); + + // 2. Write a known pattern into that page. + const auto pattern = makePattern(0xAB); + dm.writePage(pid, pattern.data()); + + // 3. Read it back into a fresh buffer and verify byte-for-byte equality. + std::vector read_back(PAGE_SIZE); + dm.readPage(pid, read_back.data()); + CHECK(std::memcmp(read_back.data(), pattern.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("multiple pages are independent") { + TempFile tf; + DiskManager dm(tf.path()); + + const PageId p0 = dm.allocatePage(); + const PageId p1 = dm.allocatePage(); + const PageId p2 = dm.allocatePage(); + + const auto pat0 = makePattern(0x11); + const auto pat1 = makePattern(0x22); + const auto pat2 = makePattern(0x33); + dm.writePage(p0, pat0.data()); + dm.writePage(p1, pat1.data()); + dm.writePage(p2, pat2.data()); + + std::vector buf(PAGE_SIZE); + dm.readPage(p0, buf.data()); CHECK(std::memcmp(buf.data(), pat0.data(), PAGE_SIZE) == 0); + dm.readPage(p1, buf.data()); CHECK(std::memcmp(buf.data(), pat1.data(), PAGE_SIZE) == 0); + dm.readPage(p2, buf.data()); CHECK(std::memcmp(buf.data(), pat2.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("numPages() reflects allocations") { + TempFile tf; + DiskManager dm(tf.path()); + + CHECK(dm.numPages() == 0); + dm.allocatePage(); + CHECK(dm.numPages() == 1); + dm.allocatePage(); + dm.allocatePage(); + CHECK(dm.numPages() == 3); +} + +TEST_CASE("allocatePage returns sequential ids starting at 0") { + TempFile tf; + DiskManager dm(tf.path()); + + CHECK(dm.allocatePage() == 0); + CHECK(dm.allocatePage() == 1); + CHECK(dm.allocatePage() == 2); +} + +TEST_CASE("data 
persists across DiskManager instances") { + TempFile tf; + + const auto pattern = makePattern(0x5A); + PageId pid; + { + DiskManager dm(tf.path()); + pid = dm.allocatePage(); + dm.writePage(pid, pattern.data()); + } // dm destroyed, file closed + + DiskManager dm2(tf.path()); + CHECK(dm2.numPages() == 1); + std::vector buf(PAGE_SIZE); + dm2.readPage(pid, buf.data()); + CHECK(std::memcmp(buf.data(), pattern.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("readPage on out-of-range page id throws") { + TempFile tf; + DiskManager dm(tf.path()); + dm.allocatePage(); + + std::vector buf(PAGE_SIZE); + CHECK_THROWS_AS(dm.readPage(5, buf.data()), std::runtime_error); +} + +TEST_CASE("writePage on out-of-range page id throws") { + TempFile tf; + DiskManager dm(tf.path()); + dm.allocatePage(); + + const auto pattern = makePattern(0x01); + CHECK_THROWS_AS(dm.writePage(5, pattern.data()), std::runtime_error); +} + +TEST_CASE("opening a file whose size is not a multiple of PAGE_SIZE throws") { + TempFile tf; + { + std::ofstream out(tf.path(), std::ios::binary); + const std::string garbage(PAGE_SIZE + 17, '\0'); // deliberately misaligned + out.write(garbage.data(), static_cast(garbage.size())); + } + + CHECK_THROWS_AS(DiskManager dm(tf.path()), std::runtime_error); +} From f80e5388aa2849de89b55999c6997c8950f01fbb Mon Sep 17 00:00:00 2001 From: Jay Phan Date: Sun, 26 Apr 2026 12:57:36 -0400 Subject: [PATCH 2/5] buffer pool layer --- Makefile | 4 +- src/storage/buffer_pool.cpp | 176 +++++++++++++ src/storage/buffer_pool.h | 151 +++++++++++ tests/storage/test_buffer_pool.cpp | 381 ++++++++++++++++++++++++++++ tests/storage/test_disk_manager.cpp | 44 +--- tests/test_util.h | 47 ++++ 6 files changed, 760 insertions(+), 43 deletions(-) create mode 100644 src/storage/buffer_pool.cpp create mode 100644 src/storage/buffer_pool.h create mode 100644 tests/storage/test_buffer_pool.cpp create mode 100644 tests/test_util.h diff --git a/Makefile b/Makefile index b863d66..3052e4a 100644 --- 
a/Makefile +++ b/Makefile @@ -6,8 +6,10 @@ BUILD_DIR = build DBMS_OBJS = $(BUILD_DIR)/main.o $(BUILD_DIR)/src/parser.o TEST_OBJS = $(BUILD_DIR)/tests/test_parser.o \ $(BUILD_DIR)/tests/storage/test_disk_manager.o \ + $(BUILD_DIR)/tests/storage/test_buffer_pool.o \ $(BUILD_DIR)/src/parser.o \ - $(BUILD_DIR)/src/storage/disk_manager.o + $(BUILD_DIR)/src/storage/disk_manager.o \ + $(BUILD_DIR)/src/storage/buffer_pool.o dbms: $(DBMS_OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ diff --git a/src/storage/buffer_pool.cpp b/src/storage/buffer_pool.cpp new file mode 100644 index 0000000..f89ac39 --- /dev/null +++ b/src/storage/buffer_pool.cpp @@ -0,0 +1,176 @@ +#include "src/storage/buffer_pool.h" + +#include +#include +#include + +BufferPool::BufferPool(size_t num_frames, DiskManager* disk) + : disk_(disk), frames_(num_frames) { + if (num_frames == 0) { + throw std::runtime_error("BufferPool: num_frames must be > 0"); + } + if (disk_ == nullptr) { + throw std::runtime_error("BufferPool: disk must not be null"); + } + for (size_t i = 0; i < num_frames; ++i) { + Frame& f = frames_[i]; + f.page_id = INVALID_PAGE_ID; + f.is_dirty = false; + f.pin_count = 0; + std::memset(f.data, 0, PAGE_SIZE); + addToLRU(i); + } +} + +void BufferPool::removeFromLRU(size_t frame_idx) { + auto it = lru_pos_.find(frame_idx); + if (it == lru_pos_.end()) return; + lru_.erase(it->second); + lru_pos_.erase(it); +} + +void BufferPool::addToLRU(size_t frame_idx) { + lru_.push_back(frame_idx); + lru_pos_[frame_idx] = std::prev(lru_.end()); +} + +size_t BufferPool::pickVictim() { + if (lru_.empty()) { + throw std::runtime_error("BufferPool: all frames pinned, no victim available"); + } + return lru_.front(); +} + +void BufferPool::evict(size_t frame_idx) { + Frame& f = frames_[frame_idx]; + if (f.page_id == INVALID_PAGE_ID) return; // empty slot, nothing to evict + if (f.is_dirty) { + disk_->writePage(f.page_id, f.data); + f.is_dirty = false; + } + page_table_.erase(f.page_id); + f.page_id = INVALID_PAGE_ID; +} + 
+Frame* BufferPool::fetchPage(PageId page_id) { + auto it = page_table_.find(page_id); + if (it != page_table_.end()) { + // Cache hit. If currently unpinned, pull it out of the LRU list so + // it can't be evicted while pinned. + size_t idx = it->second; + Frame& f = frames_[idx]; + if (f.pin_count == 0) { + removeFromLRU(idx); + } + ++f.pin_count; + return &f; + } + + // Miss: pick a victim and load the page into its slot. + size_t idx = pickVictim(); + removeFromLRU(idx); + evict(idx); + + Frame& f = frames_[idx]; + disk_->readPage(page_id, f.data); + f.page_id = page_id; + f.is_dirty = false; + f.pin_count = 1; + page_table_[page_id] = idx; + return &f; +} + +void BufferPool::unpinPage(PageId page_id, bool was_modified) { + auto it = page_table_.find(page_id); + if (it == page_table_.end()) { + throw std::runtime_error("BufferPool::unpinPage: page " + + std::to_string(page_id) + " not in pool"); + } + size_t idx = it->second; + Frame& f = frames_[idx]; + if (f.pin_count <= 0) { + throw std::runtime_error("BufferPool::unpinPage: page " + + std::to_string(page_id) + " not pinned"); + } + if (was_modified) f.is_dirty = true; + --f.pin_count; + if (f.pin_count == 0) { + addToLRU(idx); + } +} + +Frame* BufferPool::newPage(PageId* out_page_id) { + PageId new_id = disk_->allocatePage(); + if (out_page_id) *out_page_id = new_id; + + size_t idx = pickVictim(); + removeFromLRU(idx); + evict(idx); + + Frame& f = frames_[idx]; + std::memset(f.data, 0, PAGE_SIZE); + f.page_id = new_id; + f.is_dirty = false; + f.pin_count = 1; + page_table_[new_id] = idx; + return &f; +} + +void BufferPool::flushPage(PageId page_id) { + auto it = page_table_.find(page_id); + if (it == page_table_.end()) return; + Frame& f = frames_[it->second]; + if (f.is_dirty) { + disk_->writePage(f.page_id, f.data); + f.is_dirty = false; + } +} + +void BufferPool::flushAll() { + for (auto& f : frames_) { + if (f.page_id != INVALID_PAGE_ID && f.is_dirty) { + disk_->writePage(f.page_id, f.data); + 
f.is_dirty = false; + } + } +} + +PageGuard BufferPool::pin(PageId page_id) { + Frame* f = fetchPage(page_id); + return PageGuard(this, f); +} + +PageGuard BufferPool::pinNew() { + PageId pid = INVALID_PAGE_ID; + Frame* f = newPage(&pid); + return PageGuard(this, f); +} + +PageGuard::~PageGuard() { + if (frame_ && bp_) { + bp_->unpinPage(frame_->page_id, dirty_); + } +} + +PageGuard::PageGuard(PageGuard&& other) noexcept + : bp_(other.bp_), frame_(other.frame_), dirty_(other.dirty_) { + other.bp_ = nullptr; + other.frame_ = nullptr; + other.dirty_ = false; +} + +PageGuard& PageGuard::operator=(PageGuard&& other) noexcept { + if (this != &other) { + // Unpin our current page (if any) before taking over `other`'s state. + if (frame_ && bp_) { + bp_->unpinPage(frame_->page_id, dirty_); + } + bp_ = other.bp_; + frame_ = other.frame_; + dirty_ = other.dirty_; + other.bp_ = nullptr; + other.frame_ = nullptr; + other.dirty_ = false; + } + return *this; +} diff --git a/src/storage/buffer_pool.h b/src/storage/buffer_pool.h new file mode 100644 index 0000000..ac11838 --- /dev/null +++ b/src/storage/buffer_pool.h @@ -0,0 +1,151 @@ +#pragma once + +#include "src/storage/disk_manager.h" + +#include +#include +#include +#include + +// Layer 2 of the storage stack: a fixed-size array of `Frame`s that caches +// pages from a DiskManager. Eviction is plain LRU among unpinned frames. +// +// ---------------------------------------------------------------------------- +// Recommended usage: PageGuard (RAII) +// ---------------------------------------------------------------------------- +// +// `fetchPage`/`unpinPage`/`newPage` are the low-level primitives. Mixing pin +// and unpin manually is error-prone: +// - forgetting unpinPage permanently leaks a frame slot; +// - dereferencing the returned Frame* after unpinPage is undefined. 
+// +// Prefer `pin()` and `pinNew()`, which return a `PageGuard` that calls +// `unpinPage` for you when it goes out of scope: +// +// { +// PageGuard g = bp.pin(page_id); // pinned for the scope +// std::memcpy(g->data, src, PAGE_SIZE); +// g.markDirty(); // <-- if you mutated the bytes +// } // unpin happens here +// +// Always call `markDirty()` if you wrote into `g->data`; otherwise the edits +// are silently dropped on eviction. PageGuard is move-only — pass it across +// function boundaries with std::move, never copy. + +class BufferPool; // forward declaration for PageGuard + +// One slot in the buffer pool. Holds the bytes of a cached page plus the +// bookkeeping needed to evict it safely. +// +// `pin_count` is the buffer pool's most important invariant: while a frame +// has pin_count > 0, callers may hold raw pointers into `data` and the frame +// must not be evicted. fetchPage pins; unpinPage unpins. +struct Frame { + char data[PAGE_SIZE]; + PageId page_id; // INVALID_PAGE_ID if the slot is empty + bool is_dirty; // set when caller unpins with was_modified=true + int pin_count; // number of outstanding fetches +}; + +// RAII handle for a pinned page. Constructed only by BufferPool::pin / +// BufferPool::pinNew. On destruction, calls unpinPage on the buffer pool, +// passing whatever dirty flag has been set via markDirty(). +// +// Move-only: copying would double-unpin on destruction. +class PageGuard { +public: + PageGuard() = default; // empty/disarmed (e.g. moved-from) + ~PageGuard(); + + PageGuard(const PageGuard&) = delete; + PageGuard& operator=(const PageGuard&) = delete; + PageGuard(PageGuard&& other) noexcept; + PageGuard& operator=(PageGuard&& other) noexcept; + + // True if this guard currently owns a pin. Default-constructed and + // moved-from guards are not valid. + bool valid() const { return frame_ != nullptr; } + + // Direct access to the underlying Frame. Undefined if !valid(). 
+ Frame& operator*() const { return *frame_; } + Frame* operator->() const { return frame_; } + Frame* get() const { return frame_; } + + // Mark the page as modified so the buffer pool writes it back before + // evicting (or on flush). Call this whenever you mutate `data`. Idempotent. + void markDirty() { dirty_ = true; } + +private: + friend class BufferPool; + PageGuard(BufferPool* bp, Frame* frame) noexcept + : bp_(bp), frame_(frame), dirty_(false) {} + + BufferPool* bp_ = nullptr; + Frame* frame_ = nullptr; + bool dirty_ = false; +}; + +class BufferPool { +public: + BufferPool(size_t num_frames, DiskManager* disk); + + BufferPool(const BufferPool&) = delete; + BufferPool& operator=(const BufferPool&) = delete; + + // ------------------------------------------------------------------------ + // Recommended high-level API + // ------------------------------------------------------------------------ + + // Pin `page_id` and return an RAII guard. The guard unpins on destruction. + // Throws if every frame is currently pinned. + PageGuard pin(PageId page_id); + + // Allocate a new page on disk and return a guard that owns its pin. The + // page bytes are zeroed. Recover the new page's id via `g->page_id`. + PageGuard pinNew(); + + // ------------------------------------------------------------------------ + // Low-level primitives + // ------------------------------------------------------------------------ + // Use these only if you need explicit control over the pin lifetime + // (e.g. across non-RAII boundaries). Prefer pin()/pinNew() otherwise. + + Frame* fetchPage(PageId page_id); + void unpinPage(PageId page_id, bool was_modified); + Frame* newPage(PageId* out_page_id); + + // ------------------------------------------------------------------------ + // Flushing + // ------------------------------------------------------------------------ + + // Flush a single cached page to disk (no-op if not cached or not dirty). 
+ void flushPage(PageId page_id); + + // Flush every dirty cached page to disk. + void flushAll(); + + size_t numFrames() const { return frames_.size(); } + +private: + DiskManager* disk_; + std::vector frames_; + std::unordered_map page_table_; // page_id -> frame index + + // LRU list of frame indices that are eviction candidates (pin_count == 0). + // Front = least recently used (next victim). Empty frames live here too. + std::list lru_; + std::unordered_map::iterator> lru_pos_; + + // Pick a frame to evict (front of LRU). Throws if no frame is unpinned. + size_t pickVictim(); + + // Remove `frame_idx` from the LRU list. Caller asserts it is currently in. + void removeFromLRU(size_t frame_idx); + + // Push `frame_idx` to the back of the LRU list (most recently used). + void addToLRU(size_t frame_idx); + + // If the frame at `frame_idx` holds a page, write it back if dirty and + // remove it from page_table_. Used before reusing the slot. + void evict(size_t frame_idx); +}; diff --git a/tests/storage/test_buffer_pool.cpp b/tests/storage/test_buffer_pool.cpp new file mode 100644 index 0000000..04b5452 --- /dev/null +++ b/tests/storage/test_buffer_pool.cpp @@ -0,0 +1,381 @@ +#include "tests/vendor/doctest.h" + +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "tests/test_util.h" + +#include +#include +#include + +namespace { + +// Seed `n` pages on disk via DiskManager directly. Page i gets makePattern(i). +// Returns the DiskManager so the caller can hand it to a BufferPool. 
+void seedPages(DiskManager& dm, size_t n) { + for (size_t i = 0; i < n; ++i) { + const PageId pid = dm.allocatePage(); + const auto pattern = makePattern(static_cast(0x10 + i)); + dm.writePage(pid, pattern.data()); + } +} + +} // namespace + +TEST_CASE("fetchPage loads page bytes from disk") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(3, &dm); + Frame* f = bp.fetchPage(0); + REQUIRE(f != nullptr); + CHECK(f->page_id == 0); + CHECK(f->pin_count == 1); + CHECK_FALSE(f->is_dirty); + + const auto expected = makePattern(0x10); + CHECK(std::memcmp(f->data, expected.data(), PAGE_SIZE) == 0); + + bp.unpinPage(0, false); +} + +TEST_CASE("cache hit returns the same frame and bumps pin count") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(3, &dm); + Frame* f1 = bp.fetchPage(0); + Frame* f2 = bp.fetchPage(0); + CHECK(f1 == f2); + CHECK(f1->pin_count == 2); + + bp.unpinPage(0, false); + CHECK(f1->pin_count == 1); + bp.unpinPage(0, false); + CHECK(f1->pin_count == 0); +} + +TEST_CASE("LRU eviction reclaims the least recently used unpinned frame") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + BufferPool bp(3, &dm); + + // Fetch + immediately unpin pages 0,1,2 in order. After this, pool holds + // {0,1,2} all unpinned with LRU order [0, 1, 2] (front = 0 = next victim). + bp.fetchPage(0); bp.unpinPage(0, false); + bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + + // Fetch page 3 — must evict page 0. + Frame* f3 = bp.fetchPage(3); + const auto expected3 = makePattern(0x13); + CHECK(std::memcmp(f3->data, expected3.data(), PAGE_SIZE) == 0); + bp.unpinPage(3, false); + + // Re-fetching page 0 should still work (re-read from disk). 
+ Frame* f0 = bp.fetchPage(0); + const auto expected0 = makePattern(0x10); + CHECK(std::memcmp(f0->data, expected0.data(), PAGE_SIZE) == 0); + bp.unpinPage(0, false); +} + +TEST_CASE("dirty page is written back when evicted") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + { + BufferPool bp(3, &dm); + + // Modify page 0 through the pool. + Frame* f0 = bp.fetchPage(0); + const auto modified = makePattern(0xEE); + std::memcpy(f0->data, modified.data(), PAGE_SIZE); + bp.unpinPage(0, /*was_modified=*/true); + + // Force eviction of page 0 by filling the pool with other pages. + bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + bp.fetchPage(3); bp.unpinPage(3, false); // evicts page 0 + } + + // After the BufferPool destructs (no flushAll called explicitly), the + // eviction itself should have written page 0 back to disk. + DiskManager dm2(tf.path()); + std::vector buf(PAGE_SIZE); + dm2.readPage(0, buf.data()); + const auto expected = makePattern(0xEE); + CHECK(std::memcmp(buf.data(), expected.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("clean unpin discards in-memory edits on eviction") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + { + BufferPool bp(3, &dm); + Frame* f0 = bp.fetchPage(0); + std::memcpy(f0->data, makePattern(0xEE).data(), PAGE_SIZE); + bp.unpinPage(0, /*was_modified=*/false); // discard the edit + + // Evict page 0 by filling pool. 
+ bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + bp.fetchPage(3); bp.unpinPage(3, false); + } + + DiskManager dm2(tf.path()); + std::vector buf(PAGE_SIZE); + dm2.readPage(0, buf.data()); + const auto original = makePattern(0x10); + CHECK(std::memcmp(buf.data(), original.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("fetchPage throws when every frame is pinned") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 3); + + BufferPool bp(2, &dm); + bp.fetchPage(0); + bp.fetchPage(1); + CHECK_THROWS_AS(bp.fetchPage(2), std::runtime_error); + + // Cleanup so the implicit destructor doesn't try to flush a still-pinned + // page (it doesn't, but be tidy regardless). + bp.unpinPage(0, false); + bp.unpinPage(1, false); +} + +TEST_CASE("a doubly-pinned page survives eviction pressure until fully unpinned") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 3); + + BufferPool bp(2, &dm); + bp.fetchPage(0); // pin_count = 1 + bp.fetchPage(0); // pin_count = 2 + + // Pool has 2 frames; page 0 occupies one and is doubly pinned. Fetch + // page 1 — fits in the other frame. + bp.fetchPage(1); + + // Now both frames are pinned; fetching page 2 must throw. + CHECK_THROWS_AS(bp.fetchPage(2), std::runtime_error); + + // Unpin page 0 once — still pinned (count = 1). + bp.unpinPage(0, false); + CHECK_THROWS_AS(bp.fetchPage(2), std::runtime_error); + + // Unpin again — now evictable. 
+ bp.unpinPage(0, false); + Frame* f2 = bp.fetchPage(2); + REQUIRE(f2 != nullptr); + CHECK(f2->page_id == 2); + + bp.unpinPage(1, false); + bp.unpinPage(2, false); +} + +TEST_CASE("newPage allocates a fresh zeroed page") { + TempFile tf; + DiskManager dm(tf.path()); + + BufferPool bp(3, &dm); + PageId new_pid = INVALID_PAGE_ID; + Frame* f = bp.newPage(&new_pid); + REQUIRE(f != nullptr); + CHECK(new_pid == 0); + CHECK(f->page_id == 0); + CHECK(f->pin_count == 1); + CHECK_FALSE(f->is_dirty); + + std::vector zeros(PAGE_SIZE, 0); + CHECK(std::memcmp(f->data, zeros.data(), PAGE_SIZE) == 0); + + bp.unpinPage(new_pid, false); +} + +TEST_CASE("flushPage writes a dirty page through to disk") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(3, &dm); + Frame* f = bp.fetchPage(0); + std::memcpy(f->data, makePattern(0x99).data(), PAGE_SIZE); + bp.unpinPage(0, /*was_modified=*/true); + + bp.flushPage(0); + + // Read directly via DiskManager (no shared state with bp's cache). 
+ std::vector buf(PAGE_SIZE); + dm.readPage(0, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0x99).data(), PAGE_SIZE) == 0); +} + +TEST_CASE("flushAll writes every dirty page") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 2); + + BufferPool bp(3, &dm); + Frame* f0 = bp.fetchPage(0); + std::memcpy(f0->data, makePattern(0xA1).data(), PAGE_SIZE); + bp.unpinPage(0, true); + + Frame* f1 = bp.fetchPage(1); + std::memcpy(f1->data, makePattern(0xA2).data(), PAGE_SIZE); + bp.unpinPage(1, true); + + bp.flushAll(); + + std::vector buf(PAGE_SIZE); + dm.readPage(0, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0xA1).data(), PAGE_SIZE) == 0); + dm.readPage(1, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0xA2).data(), PAGE_SIZE) == 0); +} + +TEST_CASE("unpinPage on a non-cached page throws") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(3, &dm); + CHECK_THROWS_AS(bp.unpinPage(0, false), std::runtime_error); +} + +TEST_CASE("unpinPage on an already fully unpinned page throws") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(3, &dm); + bp.fetchPage(0); + bp.unpinPage(0, false); + CHECK_THROWS_AS(bp.unpinPage(0, false), std::runtime_error); +} + +TEST_CASE("PageGuard pins on construction and unpins on scope exit (usage demo)") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + BufferPool bp(2, &dm); + + { + PageGuard g = bp.pin(0); + CHECK(g.valid()); + CHECK(g->page_id == 0); + CHECK(g->pin_count == 1); + + // Read access — no markDirty needed. + const auto expected = makePattern(0x10); + CHECK(std::memcmp(g->data, expected.data(), PAGE_SIZE) == 0); + } + // Guard dropped: page 0 is now unpinned and evictable. + + // Sanity check by exhausting eviction. With 2 frames, fetching 1+2+3 in a + // row would have failed if page 0 were still pinned. 
+ bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + bp.fetchPage(3); bp.unpinPage(3, false); +} + +TEST_CASE("PageGuard::markDirty causes write-back on eviction") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + { + BufferPool bp(3, &dm); + { + PageGuard g = bp.pin(0); + std::memcpy(g->data, makePattern(0xC1).data(), PAGE_SIZE); + g.markDirty(); + } + // Force eviction of page 0. + bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + bp.fetchPage(3); bp.unpinPage(3, false); + } + + DiskManager dm2(tf.path()); + std::vector buf(PAGE_SIZE); + dm2.readPage(0, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0xC1).data(), PAGE_SIZE) == 0); +} + +TEST_CASE("PageGuard without markDirty discards in-memory edits") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + { + BufferPool bp(3, &dm); + { + PageGuard g = bp.pin(0); + std::memcpy(g->data, makePattern(0xC1).data(), PAGE_SIZE); + // Note: forgot to call g.markDirty() — edits will be lost. + } + bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + bp.fetchPage(3); bp.unpinPage(3, false); + } + + DiskManager dm2(tf.path()); + std::vector buf(PAGE_SIZE); + dm2.readPage(0, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0x10).data(), PAGE_SIZE) == 0); +} + +TEST_CASE("PageGuard is move-only and the moved-from guard is empty") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(2, &dm); + + PageGuard a = bp.pin(0); + CHECK(a.valid()); + + PageGuard b = std::move(a); + CHECK_FALSE(a.valid()); + CHECK(b.valid()); + CHECK(b->page_id == 0); + CHECK(b->pin_count == 1); + // When `b` goes out of scope, page 0 unpins exactly once. 
+} + +TEST_CASE("pinNew returns a guard for a freshly allocated zeroed page") { + TempFile tf; + DiskManager dm(tf.path()); + + BufferPool bp(3, &dm); + + PageId pid; + { + PageGuard g = bp.pinNew(); + REQUIRE(g.valid()); + pid = g->page_id; + CHECK(pid == 0); + CHECK(g->pin_count == 1); + + std::vector zeros(PAGE_SIZE, 0); + CHECK(std::memcmp(g->data, zeros.data(), PAGE_SIZE) == 0); + + std::memcpy(g->data, makePattern(0x77).data(), PAGE_SIZE); + g.markDirty(); + } + + bp.flushPage(pid); + std::vector buf(PAGE_SIZE); + dm.readPage(pid, buf.data()); + CHECK(std::memcmp(buf.data(), makePattern(0x77).data(), PAGE_SIZE) == 0); +} diff --git a/tests/storage/test_disk_manager.cpp b/tests/storage/test_disk_manager.cpp index 966f0c8..bc56643 100644 --- a/tests/storage/test_disk_manager.cpp +++ b/tests/storage/test_disk_manager.cpp @@ -1,54 +1,14 @@ #include "tests/vendor/doctest.h" #include "src/storage/disk_manager.h" +#include "tests/test_util.h" -#include -#include #include -#include +#include #include #include #include -namespace { - -// RAII handle for a unique temp file path. Removes the file on destruction -// so each test case is isolated and the temp dir stays clean. -class TempFile { -public: - TempFile() { - static std::atomic counter{0}; - const auto stamp = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - const auto seq = counter.fetch_add(1); - path_ = std::filesystem::temp_directory_path() / - ("dbms_test_" + std::to_string(stamp) + "_" + std::to_string(seq) + ".db"); - std::filesystem::remove(path_); - } - ~TempFile() { - std::error_code ec; - std::filesystem::remove(path_, ec); - } - TempFile(const TempFile&) = delete; - TempFile& operator=(const TempFile&) = delete; - - const std::string path() const { return path_.string(); } - -private: - std::filesystem::path path_; -}; - -// Fill a PAGE_SIZE buffer with a deterministic byte pattern keyed on `seed`, -// so every page in a test gets a distinguishable payload. 
-std::vector makePattern(uint8_t seed) { - std::vector buf(PAGE_SIZE); - for (size_t i = 0; i < PAGE_SIZE; ++i) { - buf[i] = static_cast((i + seed) & 0xff); - } - return buf; -} - -} // namespace - TEST_CASE("allocate-write-read round trip (usage demo)") { TempFile tf; DiskManager dm(tf.path()); diff --git a/tests/test_util.h b/tests/test_util.h new file mode 100644 index 0000000..33bfb3c --- /dev/null +++ b/tests/test_util.h @@ -0,0 +1,47 @@ +#pragma once + +#include "src/storage/disk_manager.h" + +#include +#include +#include +#include +#include +#include +#include + +// RAII handle for a unique temp file path. Creates a unique path under the +// system temp dir on construction (with the file removed if anything stale +// existed) and removes it on destruction so test cases stay isolated. +class TempFile { +public: + TempFile() { + static std::atomic counter{0}; + const auto stamp = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + const auto seq = counter.fetch_add(1); + path_ = std::filesystem::temp_directory_path() / + ("dbms_test_" + std::to_string(stamp) + "_" + std::to_string(seq) + ".db"); + std::filesystem::remove(path_); + } + ~TempFile() { + std::error_code ec; + std::filesystem::remove(path_, ec); + } + TempFile(const TempFile&) = delete; + TempFile& operator=(const TempFile&) = delete; + + std::string path() const { return path_.string(); } + +private: + std::filesystem::path path_; +}; + +// Fill a PAGE_SIZE buffer with a deterministic byte pattern keyed on `seed`, +// so distinct pages get visually distinguishable payloads. 
+inline std::vector makePattern(uint8_t seed) { + std::vector buf(PAGE_SIZE); + for (size_t i = 0; i < PAGE_SIZE; ++i) { + buf[i] = static_cast((i + seed) & 0xff); + } + return buf; +} From 5c35af627ea1e8341ef7ca1687843bc8ffd54295 Mon Sep 17 00:00:00 2001 From: Jay Phan Date: Sun, 26 Apr 2026 13:17:44 -0400 Subject: [PATCH 3/5] slot layer --- Makefile | 4 +- src/storage/buffer_pool.cpp | 12 +- src/storage/buffer_pool.h | 9 +- src/storage/slotted_page.cpp | 173 +++++++++ src/storage/slotted_page.h | 111 ++++++ tests/storage/test_buffer_pool.cpp | 154 ++++++++ tests/storage/test_disk_manager.cpp | 49 +++ tests/storage/test_slotted_page.cpp | 544 ++++++++++++++++++++++++++++ 8 files changed, 1044 insertions(+), 12 deletions(-) create mode 100644 src/storage/slotted_page.cpp create mode 100644 src/storage/slotted_page.h create mode 100644 tests/storage/test_slotted_page.cpp diff --git a/Makefile b/Makefile index 3052e4a..b0d3392 100644 --- a/Makefile +++ b/Makefile @@ -7,9 +7,11 @@ DBMS_OBJS = $(BUILD_DIR)/main.o $(BUILD_DIR)/src/parser.o TEST_OBJS = $(BUILD_DIR)/tests/test_parser.o \ $(BUILD_DIR)/tests/storage/test_disk_manager.o \ $(BUILD_DIR)/tests/storage/test_buffer_pool.o \ + $(BUILD_DIR)/tests/storage/test_slotted_page.o \ $(BUILD_DIR)/src/parser.o \ $(BUILD_DIR)/src/storage/disk_manager.o \ - $(BUILD_DIR)/src/storage/buffer_pool.o + $(BUILD_DIR)/src/storage/buffer_pool.o \ + $(BUILD_DIR)/src/storage/slotted_page.o dbms: $(DBMS_OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ diff --git a/src/storage/buffer_pool.cpp b/src/storage/buffer_pool.cpp index f89ac39..8578275 100644 --- a/src/storage/buffer_pool.cpp +++ b/src/storage/buffer_pool.cpp @@ -148,29 +148,27 @@ PageGuard BufferPool::pinNew() { PageGuard::~PageGuard() { if (frame_ && bp_) { - bp_->unpinPage(frame_->page_id, dirty_); + // Frame::is_dirty was already set by markDirty() (if at all), so the + // unpin call doesn't need to OR in any additional bit. 
+ bp_->unpinPage(frame_->page_id, false); } } PageGuard::PageGuard(PageGuard&& other) noexcept - : bp_(other.bp_), frame_(other.frame_), dirty_(other.dirty_) { + : bp_(other.bp_), frame_(other.frame_) { other.bp_ = nullptr; other.frame_ = nullptr; - other.dirty_ = false; } PageGuard& PageGuard::operator=(PageGuard&& other) noexcept { if (this != &other) { - // Unpin our current page (if any) before taking over `other`'s state. if (frame_ && bp_) { - bp_->unpinPage(frame_->page_id, dirty_); + bp_->unpinPage(frame_->page_id, false); } bp_ = other.bp_; frame_ = other.frame_; - dirty_ = other.dirty_; other.bp_ = nullptr; other.frame_ = nullptr; - other.dirty_ = false; } return *this; } diff --git a/src/storage/buffer_pool.h b/src/storage/buffer_pool.h index ac11838..adb410d 100644 --- a/src/storage/buffer_pool.h +++ b/src/storage/buffer_pool.h @@ -72,17 +72,18 @@ class PageGuard { Frame* get() const { return frame_; } // Mark the page as modified so the buffer pool writes it back before - // evicting (or on flush). Call this whenever you mutate `data`. Idempotent. - void markDirty() { dirty_ = true; } + // evicting (or on flush). Call this whenever you mutate `data`. + // Sets Frame::is_dirty immediately, so a subsequent flushAll() observes + // the page as dirty even before this guard goes out of scope. Idempotent. 
+ void markDirty() { if (frame_) frame_->is_dirty = true; } private: friend class BufferPool; PageGuard(BufferPool* bp, Frame* frame) noexcept - : bp_(bp), frame_(frame), dirty_(false) {} + : bp_(bp), frame_(frame) {} BufferPool* bp_ = nullptr; Frame* frame_ = nullptr; - bool dirty_ = false; }; class BufferPool { diff --git a/src/storage/slotted_page.cpp b/src/storage/slotted_page.cpp new file mode 100644 index 0000000..e501ea5 --- /dev/null +++ b/src/storage/slotted_page.cpp @@ -0,0 +1,173 @@ +#include "src/storage/slotted_page.h" + +#include +#include + +namespace { + +uint16_t loadU16(const char* p) { + uint16_t v; + std::memcpy(&v, p, sizeof(v)); + return v; +} + +void storeU16(char* p, uint16_t v) { + std::memcpy(p, &v, sizeof(v)); +} + +void storeU32(char* p, uint32_t v) { + std::memcpy(p, &v, sizeof(v)); +} + +} // namespace + +uint16_t SlottedPage::numSlotsRaw() const { return loadU16(data_ + 4); } +void SlottedPage::setNumSlots(uint16_t v) { storeU16(data_ + 4, v); } +uint16_t SlottedPage::freeSpaceOffset() const { return loadU16(data_ + 6); } +void SlottedPage::setFreeSpaceOffset(uint16_t v) { storeU16(data_ + 6, v); } + +SlottedPage::SlotEntry SlottedPage::readSlot(SlotId i) const { + const char* p = data_ + HEADER_SIZE + static_cast(i) * SLOT_SIZE; + return {loadU16(p), loadU16(p + 2)}; +} + +void SlottedPage::writeSlot(SlotId i, SlotEntry e) { + char* p = data_ + HEADER_SIZE + static_cast(i) * SLOT_SIZE; + storeU16(p, e.offset); + storeU16(p + 2, e.length); +} + +size_t SlottedPage::endOfSlotArray() const { + return HEADER_SIZE + static_cast(numSlotsRaw()) * SLOT_SIZE; +} + +size_t SlottedPage::contiguousFree() const { + return freeSpaceOffset() - endOfSlotArray(); +} + +size_t SlottedPage::deadTupleBytes() const { + // Bytes inside the tuple area (between freeSpaceOffset and PAGE_SIZE) + // that are not pointed to by any live slot. Reclaimed by compact(). 
+ size_t live = 0; + const uint16_t n = numSlotsRaw(); + for (SlotId i = 0; i < n; ++i) { + SlotEntry s = readSlot(i); + if (isLive(s)) live += s.length; + } + const size_t tuple_area = PAGE_SIZE - freeSpaceOffset(); + return tuple_area - live; +} + +void SlottedPage::init() { + storeU32(data_ + 0, 0); // lsn + setNumSlots(0); + setFreeSpaceOffset(static_cast(PAGE_SIZE)); + // Zero the rest of the page so reads from uninitialized regions are + // deterministic. Cheap and helpful when debugging hex dumps. + std::memset(data_ + HEADER_SIZE, 0, PAGE_SIZE - HEADER_SIZE); +} + +size_t SlottedPage::numSlots() const { + return numSlotsRaw(); +} + +size_t SlottedPage::freeSpace() const { + return contiguousFree() + deadTupleBytes(); +} + +std::optional SlottedPage::findDeadSlot() const { + const uint16_t n = numSlotsRaw(); + for (SlotId i = 0; i < n; ++i) { + if (!isLive(readSlot(i))) return i; + } + return std::nullopt; +} + +void SlottedPage::compact() { + const uint16_t n = numSlotsRaw(); + std::vector scratch(PAGE_SIZE); + uint16_t new_offset = static_cast(PAGE_SIZE); + for (SlotId i = 0; i < n; ++i) { + SlotEntry s = readSlot(i); + if (!isLive(s)) continue; + new_offset -= s.length; + std::memcpy(scratch.data() + new_offset, data_ + s.offset, s.length); + writeSlot(i, {new_offset, s.length}); + } + // Copy the repacked tuple area back. Only bytes [new_offset, PAGE_SIZE) are overwritten; lower offsets are left as-is. + std::memcpy(data_ + new_offset, + scratch.data() + new_offset, + PAGE_SIZE - new_offset); + setFreeSpaceOffset(new_offset); +} + +std::optional SlottedPage::insert(const char* tuple, size_t len) { + if (len == 0 || len > MAX_TUPLE_SIZE) return std::nullopt; + + const auto reuse = findDeadSlot(); + const size_t needed_contig = len + (reuse.has_value() ? 
0 : SLOT_SIZE); + const size_t needed_total = needed_contig; // same after compaction + + if (contiguousFree() < needed_contig) { + if (freeSpace() < needed_total) return std::nullopt; + compact(); + if (contiguousFree() < needed_contig) return std::nullopt; // defensive + } + + const uint16_t new_offset = static_cast(freeSpaceOffset() - len); + std::memcpy(data_ + new_offset, tuple, len); + setFreeSpaceOffset(new_offset); + + SlotId slot_id; + if (reuse) { + slot_id = *reuse; + } else { + slot_id = static_cast(numSlotsRaw()); + setNumSlots(static_cast(slot_id + 1)); + } + writeSlot(slot_id, {new_offset, static_cast(len)}); + return slot_id; +} + +std::pair SlottedPage::get(SlotId i) const { + if (i >= numSlotsRaw()) return {nullptr, 0}; + SlotEntry s = readSlot(i); + if (!isLive(s)) return {nullptr, 0}; + return {data_ + s.offset, s.length}; +} + +bool SlottedPage::remove(SlotId i) { + if (i >= numSlotsRaw()) return false; + SlotEntry s = readSlot(i); + if (!isLive(s)) return false; + writeSlot(i, {0, 0}); + return true; +} + +bool SlottedPage::update(SlotId i, const char* tuple, size_t len) { + if (i >= numSlotsRaw()) return false; + SlotEntry s = readSlot(i); + if (!isLive(s)) return false; + if (len == 0 || len > MAX_TUPLE_SIZE) return false; + + if (len <= s.length) { + // In-place. Wastes (s.length - len) bytes until the next compaction. + std::memcpy(data_ + s.offset, tuple, len); + writeSlot(i, {s.offset, static_cast(len)}); + return true; + } + + // New length is larger. Tombstoning the old bytes adds s.length to the + // free pool; if even that is not enough, the update cannot fit. 
+ const size_t available_after_tombstone = freeSpace() + s.length; + if (available_after_tombstone < len) return false; + + writeSlot(i, {0, 0}); // tombstone — old bytes now count as dead + if (contiguousFree() < len) compact(); + + const uint16_t new_offset = static_cast(freeSpaceOffset() - len); + std::memcpy(data_ + new_offset, tuple, len); + setFreeSpaceOffset(new_offset); + writeSlot(i, {new_offset, static_cast(len)}); + return true; +} diff --git a/src/storage/slotted_page.h b/src/storage/slotted_page.h new file mode 100644 index 0000000..6a21c52 --- /dev/null +++ b/src/storage/slotted_page.h @@ -0,0 +1,111 @@ +#pragma once + +#include "src/storage/disk_manager.h" // for PAGE_SIZE + +#include +#include +#include +#include + +using SlotId = uint16_t; + +// Layer 3 of the storage stack: the byte-level layout of a page. +// +// SlottedPage wraps a `char*` of exactly PAGE_SIZE bytes (typically the +// `data` field of a buffer pool Frame) and treats it as: +// +// ┌────────────────────────────────────────────────────────┐ +// │ header (HEADER_SIZE bytes) │ +// ├────────────────────────────────────────────────────────┤ +// │ slot[0], slot[1], ... slot[num_slots-1] ← grows down │ +// │ │ +// │ ← free space → │ +// │ │ +// │ ...tuple bytes packed at the high end ← grows up │ +// └────────────────────────────────────────────────────────┘ +// +// A tuple is identified by its SlotId. Slot IDs are *stable*: once a tuple +// is inserted, its slot id does not change for the lifetime of the slot, +// even across compaction (which moves tuple bytes around). remove() turns +// the slot into a tombstone (length = 0). A subsequent insert may reuse +// the tombstoned slot id; otherwise it appends a new one. +// +// SlottedPage stores opaque byte sequences; it knows nothing about column +// types. Higher layers decide what's inside a tuple. +class SlottedPage { +public: + // Number of bytes the header occupies at the start of the page. 
+ static constexpr size_t HEADER_SIZE = 8; + + // Number of bytes per slot entry in the slot array. + static constexpr size_t SLOT_SIZE = 4; + + // Largest tuple that could ever fit on a fresh empty page. + static constexpr size_t MAX_TUPLE_SIZE = PAGE_SIZE - HEADER_SIZE - SLOT_SIZE; + + // Wrap an existing block of PAGE_SIZE bytes. The bytes are NOT touched + // until you call init() (for a fresh page) or any mutating method. + explicit SlottedPage(char* data) : data_(data) {} + + // Initialize the bytes as an empty slotted page. Call this exactly once + // when the page is first allocated; do NOT call it on a page that + // already holds data, or you will lose every tuple. + void init(); + + // Insert `tuple` (`len` bytes, must be > 0). Returns the slot id on + // success, std::nullopt if the page does not have room. Will compact + // automatically if there is enough total free space but it is fragmented. + std::optional insert(const char* tuple, size_t len); + + // Read the tuple at `slot`. Returns {nullptr, 0} if the slot id is out + // of range or has been tombstoned. The returned pointer aliases the + // page bytes — do not retain it past the next mutation of this page. + std::pair get(SlotId slot) const; + + // Tombstone `slot`. Returns true if the slot was live, false if the + // slot was already dead or out of range. The slot id remains reserved + // (numSlots() does not decrease) but may be reused by a future insert. + bool remove(SlotId slot); + + // Replace the tuple at `slot` with new bytes. Returns true on success, + // false if `slot` is dead/out-of-range or the new tuple does not fit. + // Slot id is preserved across the update even if the tuple bytes move. + bool update(SlotId slot, const char* tuple, size_t len); + + // Total free bytes available for new tuple data after a compaction. + // (Slot array growth is not counted; budget SLOT_SIZE separately if you + // intend to append a new slot rather than reuse a tombstone.) 
+ size_t freeSpace() const; + + // Number of slot ids currently in use. Includes tombstones — a fresh + // insert without a free tombstone would receive slot id `numSlots()`. + size_t numSlots() const; + +private: + char* data_; + + // Header field accessors. The header lives at offset 0 with layout: + // [0..4) lsn (uint32_t, currently always 0; reserved for recovery) + // [4..6) num_slots (uint16_t) + // [6..8) free_space_offset (uint16_t) — low edge of the tuple area; the next tuple ends here + uint16_t numSlotsRaw() const; + void setNumSlots(uint16_t v); + uint16_t freeSpaceOffset() const; + void setFreeSpaceOffset(uint16_t v); + + struct SlotEntry { uint16_t offset; uint16_t length; }; + SlotEntry readSlot(SlotId i) const; + void writeSlot(SlotId i, SlotEntry e); + static bool isLive(SlotEntry s) { return s.length > 0; } + + size_t endOfSlotArray() const; + size_t contiguousFree() const; + size_t deadTupleBytes() const; + + // Walk live tuples and pack them at the high end of the page, updating + // slot offsets. Slot ids are preserved. + void compact(); + + // First slot id whose entry is a tombstone, or std::nullopt if none. 
+ std::optional findDeadSlot() const; +}; diff --git a/tests/storage/test_buffer_pool.cpp b/tests/storage/test_buffer_pool.cpp index 04b5452..c07d069 100644 --- a/tests/storage/test_buffer_pool.cpp +++ b/tests/storage/test_buffer_pool.cpp @@ -2,10 +2,14 @@ #include "src/storage/buffer_pool.h" #include "src/storage/disk_manager.h" +#include "src/storage/slotted_page.h" #include "tests/test_util.h" #include +#include +#include #include +#include #include namespace { @@ -379,3 +383,153 @@ TEST_CASE("pinNew returns a guard for a freshly allocated zeroed page") { dm.readPage(pid, buf.data()); CHECK(std::memcmp(buf.data(), makePattern(0x77).data(), PAGE_SIZE) == 0); } + +TEST_CASE("constructor rejects num_frames == 0") { + TempFile tf; + DiskManager dm(tf.path()); + CHECK_THROWS_AS(BufferPool(0, &dm), std::runtime_error); +} + +TEST_CASE("constructor rejects null DiskManager") { + CHECK_THROWS_AS(BufferPool(4, nullptr), std::runtime_error); +} + +TEST_CASE("a 1-frame pool still functions") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 3); + + BufferPool bp(1, &dm); + Frame* f0 = bp.fetchPage(0); + CHECK(std::memcmp(f0->data, makePattern(0x10).data(), PAGE_SIZE) == 0); + bp.unpinPage(0, false); + + // Each subsequent fetch evicts the only frame. + Frame* f1 = bp.fetchPage(1); + CHECK(std::memcmp(f1->data, makePattern(0x11).data(), PAGE_SIZE) == 0); + bp.unpinPage(1, false); + + Frame* f2 = bp.fetchPage(2); + CHECK(std::memcmp(f2->data, makePattern(0x12).data(), PAGE_SIZE) == 0); + bp.unpinPage(2, false); +} + +TEST_CASE("eviction order tracks the LRU recency of unpins") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 4); + + BufferPool bp(3, &dm); + + // Touch order: 0, 1, 2 (LRU = 0 after this round of unpins). + bp.fetchPage(0); bp.unpinPage(0, false); + bp.fetchPage(1); bp.unpinPage(1, false); + bp.fetchPage(2); bp.unpinPage(2, false); + + // Re-touch 0 — now LRU should be 1, not 0. 
+ bp.fetchPage(0); bp.unpinPage(0, false); + + // Fetching 3 should evict 1 (the LRU). 0 and 2 must still be cache hits. + bp.fetchPage(3); bp.unpinPage(3, false); + + Frame* f0 = bp.fetchPage(0); + CHECK(std::memcmp(f0->data, makePattern(0x10).data(), PAGE_SIZE) == 0); + bp.unpinPage(0, false); + + Frame* f2 = bp.fetchPage(2); + CHECK(std::memcmp(f2->data, makePattern(0x12).data(), PAGE_SIZE) == 0); + bp.unpinPage(2, false); +} + +TEST_CASE("newPage evicts an unpinned frame when the pool is full") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 2); + + BufferPool bp(2, &dm); + bp.fetchPage(0); bp.unpinPage(0, false); + bp.fetchPage(1); bp.unpinPage(1, false); + + PageId new_pid = INVALID_PAGE_ID; + Frame* f = bp.newPage(&new_pid); + REQUIRE(f != nullptr); + CHECK(new_pid == 2); + CHECK(f->page_id == 2); + CHECK(f->pin_count == 1); + bp.unpinPage(new_pid, false); +} + +TEST_CASE("newPage throws when every frame is pinned") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 2); + + BufferPool bp(2, &dm); + bp.fetchPage(0); + bp.fetchPage(1); + + PageId out; + CHECK_THROWS_AS(bp.newPage(&out), std::runtime_error); + + bp.unpinPage(0, false); + bp.unpinPage(1, false); +} + +TEST_CASE("flushPage on a page not in the pool is a silent no-op") { + TempFile tf; + DiskManager dm(tf.path()); + seedPages(dm, 1); + + BufferPool bp(2, &dm); + // Not cached at all — must not throw. + bp.flushPage(0); + bp.flushPage(99); +} + +TEST_CASE("integration: many tuples across many pages with heavy eviction") { + TempFile tf; + DiskManager dm(tf.path()); + + constexpr int kPages = 6; + constexpr int kPerPage = 8; + std::map, std::vector> model; + std::vector page_ids; + + { + // Tiny pool relative to working set forces lots of eviction traffic. 
+ BufferPool bp(2, &dm); + std::mt19937 rng(1234); + std::uniform_int_distribution len_dist(1, 80); + std::uniform_int_distribution byte_dist(0, 255); + + for (int p = 0; p < kPages; ++p) { + PageGuard g = bp.pinNew(); + page_ids.push_back(g->page_id); + + SlottedPage sp(g->data); + sp.init(); + + for (int t = 0; t < kPerPage; ++t) { + std::vector data(len_dist(rng)); + for (auto& c : data) c = static_cast(byte_dist(rng)); + auto sid = sp.insert(data.data(), data.size()); + REQUIRE(sid); + model[{g->page_id, *sid}] = std::move(data); + } + g.markDirty(); + } + bp.flushAll(); + } + + // Reopen everything fresh and verify each tuple is exactly recoverable. + DiskManager dm2(tf.path()); + BufferPool bp(2, &dm2); + for (const auto& [key, expected] : model) { + PageGuard g = bp.pin(key.first); + SlottedPage sp(g->data); + auto [p, len] = sp.get(key.second); + REQUIRE(p != nullptr); + REQUIRE(len == expected.size()); + REQUIRE(std::memcmp(p, expected.data(), len) == 0); + } +} diff --git a/tests/storage/test_disk_manager.cpp b/tests/storage/test_disk_manager.cpp index bc56643..9802b50 100644 --- a/tests/storage/test_disk_manager.cpp +++ b/tests/storage/test_disk_manager.cpp @@ -4,6 +4,7 @@ #include "tests/test_util.h" #include +#include #include #include #include @@ -115,3 +116,51 @@ TEST_CASE("opening a file whose size is not a multiple of PAGE_SIZE throws") { CHECK_THROWS_AS(DiskManager dm(tf.path()), std::runtime_error); } + +TEST_CASE("file size on disk grows by exactly PAGE_SIZE per allocatePage") { + TempFile tf; + DiskManager dm(tf.path()); + + CHECK(std::filesystem::file_size(tf.path()) == 0); + dm.allocatePage(); + CHECK(std::filesystem::file_size(tf.path()) == PAGE_SIZE); + dm.allocatePage(); + dm.allocatePage(); + CHECK(std::filesystem::file_size(tf.path()) == 3 * PAGE_SIZE); +} + +TEST_CASE("read/write round trips on the highest valid page id") { + TempFile tf; + DiskManager dm(tf.path()); + + for (int i = 0; i < 5; ++i) dm.allocatePage(); + 
CHECK(dm.numPages() == 5); + + const PageId top = dm.numPages() - 1; + std::vector buf(PAGE_SIZE, 'Z'); + dm.writePage(top, buf.data()); + + std::vector got(PAGE_SIZE); + dm.readPage(top, got.data()); + CHECK(std::memcmp(got.data(), buf.data(), PAGE_SIZE) == 0); +} + +TEST_CASE("stress: 50 pages with distinct patterns round-trip correctly") { + TempFile tf; + DiskManager dm(tf.path()); + + constexpr int N = 50; + for (int i = 0; i < N; ++i) { + const PageId pid = dm.allocatePage(); + const auto pat = makePattern(static_cast(i)); + dm.writePage(pid, pat.data()); + } + CHECK(dm.numPages() == N); + + std::vector buf(PAGE_SIZE); + for (int i = 0; i < N; ++i) { + dm.readPage(static_cast(i), buf.data()); + const auto pat = makePattern(static_cast(i)); + REQUIRE(std::memcmp(buf.data(), pat.data(), PAGE_SIZE) == 0); + } +} diff --git a/tests/storage/test_slotted_page.cpp b/tests/storage/test_slotted_page.cpp new file mode 100644 index 0000000..73f715e --- /dev/null +++ b/tests/storage/test_slotted_page.cpp @@ -0,0 +1,544 @@ +#include "tests/vendor/doctest.h" + +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/slotted_page.h" +#include "tests/test_util.h" + +#include +#include +#include +#include +#include +#include + +namespace { + +// Build a tuple buffer from a string for ergonomic test code. +std::vector tup(const std::string& s) { + return std::vector(s.begin(), s.end()); +} + +// Comparison helper: returns true if SlottedPage::get(slot) yields exactly +// the bytes of `expected`. 
+bool slotEquals(const SlottedPage& sp, SlotId slot, const std::string& expected) { + auto [p, len] = sp.get(slot); + if (p == nullptr) return false; + if (len != expected.size()) return false; + return std::memcmp(p, expected.data(), len) == 0; +} + +} // namespace + +TEST_CASE("init produces an empty page") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + CHECK(sp.numSlots() == 0); + CHECK(sp.freeSpace() == PAGE_SIZE - SlottedPage::HEADER_SIZE); +} + +TEST_CASE("insert / get round trip a single tuple") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + const auto t = tup("hello"); + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid.has_value()); + CHECK(*sid == 0); + CHECK(sp.numSlots() == 1); + CHECK(slotEquals(sp, *sid, "hello")); +} + +TEST_CASE("multiple inserts get distinct sequential slot ids and round trip") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto a = tup("alpha"); + auto b = tup("beta-much-longer-than-alpha"); + auto c = tup("gamma"); + + auto sa = sp.insert(a.data(), a.size()); REQUIRE(sa); + auto sb = sp.insert(b.data(), b.size()); REQUIRE(sb); + auto sc = sp.insert(c.data(), c.size()); REQUIRE(sc); + CHECK(*sa == 0); + CHECK(*sb == 1); + CHECK(*sc == 2); + CHECK(sp.numSlots() == 3); + + CHECK(slotEquals(sp, *sa, "alpha")); + CHECK(slotEquals(sp, *sb, "beta-much-longer-than-alpha")); + CHECK(slotEquals(sp, *sc, "gamma")); +} + +TEST_CASE("freeSpace decreases by len + SLOT_SIZE on each fresh insert") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + const size_t free0 = sp.freeSpace(); + auto t = tup("twenty-byte-tuple.."); // 19 bytes + REQUIRE(sp.insert(t.data(), t.size()).has_value()); + const size_t free1 = sp.freeSpace(); + CHECK(free0 - free1 == t.size() + SlottedPage::SLOT_SIZE); +} + +TEST_CASE("remove returns false for out-of-range or already-dead slots") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto t = 
tup("x"); + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + + CHECK(sp.remove(*sid)); + CHECK_FALSE(sp.remove(*sid)); // already dead + CHECK_FALSE(sp.remove(99)); // out of range +} + +TEST_CASE("get on a tombstoned or out-of-range slot returns {nullptr, 0}") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto t = tup("doomed"); + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + REQUIRE(sp.remove(*sid)); + + auto [p1, l1] = sp.get(*sid); + CHECK(p1 == nullptr); + CHECK(l1 == 0); + + auto [p2, l2] = sp.get(99); + CHECK(p2 == nullptr); + CHECK(l2 == 0); +} + +TEST_CASE("insert reuses the first tombstoned slot id") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto a = tup("a"); + auto b = tup("b"); + auto c = tup("c"); + auto sa = sp.insert(a.data(), a.size()); REQUIRE(sa); + auto sb = sp.insert(b.data(), b.size()); REQUIRE(sb); + auto sc = sp.insert(c.data(), c.size()); REQUIRE(sc); + + REQUIRE(sp.remove(*sb)); + CHECK(sp.numSlots() == 3); // tombstone keeps slot id reserved + + auto d = tup("d"); + auto sd = sp.insert(d.data(), d.size()); + REQUIRE(sd); + CHECK(*sd == *sb); // slot id reused + CHECK(sp.numSlots() == 3); // no new slot appended + + CHECK(slotEquals(sp, *sa, "a")); + CHECK(slotEquals(sp, *sd, "d")); + CHECK(slotEquals(sp, *sc, "c")); +} + +TEST_CASE("update with shorter or equal length is in-place; slot id stable") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto t = tup("abcdefghij"); // 10 bytes + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + + const auto same = tup("0123456789"); // same length + CHECK(sp.update(*sid, same.data(), same.size())); + CHECK(slotEquals(sp, *sid, "0123456789")); + + const auto shorter = tup("xy"); // shorter + CHECK(sp.update(*sid, shorter.data(), shorter.size())); + CHECK(slotEquals(sp, *sid, "xy")); +} + +TEST_CASE("update with a longer tuple relocates but preserves slot id") { + std::array page{}; + 
SlottedPage sp(page.data()); + sp.init(); + + auto t = tup("short"); + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + + auto longer = tup("a-much-longer-replacement-tuple"); + CHECK(sp.update(*sid, longer.data(), longer.size())); + CHECK(*sid == 0); + CHECK(slotEquals(sp, *sid, "a-much-longer-replacement-tuple")); +} + +TEST_CASE("update returns false for out-of-range, tombstoned, or oversized") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto t = tup("hello"); + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + + auto repl = tup("world"); + CHECK_FALSE(sp.update(99, repl.data(), repl.size())); // out of range + REQUIRE(sp.remove(*sid)); + CHECK_FALSE(sp.update(*sid, repl.data(), repl.size())); // tombstoned + + // Re-add and try a too-large tuple. + auto sid2 = sp.insert(t.data(), t.size()); REQUIRE(sid2); + std::vector huge(SlottedPage::MAX_TUPLE_SIZE + 1, 'x'); + CHECK_FALSE(sp.update(*sid2, huge.data(), huge.size())); +} + +TEST_CASE("insert returns nullopt when the page cannot fit even one more byte") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + // Fill the page with a single near-maximum tuple, then try to insert again. + std::vector big(SlottedPage::MAX_TUPLE_SIZE, 'A'); + auto sid = sp.insert(big.data(), big.size()); + REQUIRE(sid); + + auto small = tup("nope"); + CHECK_FALSE(sp.insert(small.data(), small.size()).has_value()); + + // Length 0 and oversized are also rejected. + CHECK_FALSE(sp.insert(nullptr, 0).has_value()); + std::vector oversize(SlottedPage::MAX_TUPLE_SIZE + 1, 'x'); + CHECK_FALSE(sp.insert(oversize.data(), oversize.size()).has_value()); +} + +TEST_CASE("compaction is triggered when free space is fragmented") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + // Fill the page with several large tuples, then delete the middle ones + // so the surviving tuples leave a gap that requires compaction to use. 
+ const size_t per = 800; // 5 * 800 = 4000 bytes; leaves headroom + std::vector a(per, 'A'); auto sa = sp.insert(a.data(), per); REQUIRE(sa); + std::vector b(per, 'B'); auto sb = sp.insert(b.data(), per); REQUIRE(sb); + std::vector c(per, 'C'); auto sc = sp.insert(c.data(), per); REQUIRE(sc); + std::vector d(per, 'D'); auto sd = sp.insert(d.data(), per); REQUIRE(sd); + std::vector e(per, 'E'); auto se = sp.insert(e.data(), per); REQUIRE(se); + + // Tombstone two interior tuples — leaves dead bytes scattered. + REQUIRE(sp.remove(*sb)); + REQUIRE(sp.remove(*sd)); + + // After tombstoning, contiguous free is small but total free is large. + // A new tuple of size ~1500 should still fit because compaction kicks in. + std::vector big(1500, 'X'); + auto sx = sp.insert(big.data(), big.size()); + REQUIRE(sx); + + // Survivors and the new insert are all readable. + { + auto [p, n] = sp.get(*sa); + REQUIRE(n == per); + CHECK(std::memcmp(p, a.data(), per) == 0); + } + { + auto [p, n] = sp.get(*sc); + REQUIRE(n == per); + CHECK(std::memcmp(p, c.data(), per) == 0); + } + { + auto [p, n] = sp.get(*se); + REQUIRE(n == per); + CHECK(std::memcmp(p, e.data(), per) == 0); + } + { + auto [p, n] = sp.get(*sx); + REQUIRE(n == big.size()); + CHECK(std::memcmp(p, big.data(), big.size()) == 0); + } +} + +TEST_CASE("slot ids are stable across compaction") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + const std::vector a(100, 'A'); + const std::vector b(100, 'B'); + const std::vector c(100, 'C'); + + auto sa = sp.insert(a.data(), a.size()); REQUIRE(sa); CHECK(*sa == 0); + auto sb = sp.insert(b.data(), b.size()); REQUIRE(sb); CHECK(*sb == 1); + auto sc = sp.insert(c.data(), c.size()); REQUIRE(sc); CHECK(*sc == 2); + + REQUIRE(sp.remove(*sa)); + + // contigFree at this point = PAGE_SIZE - HEADER - 3*SLOT - 300. + // a contributed 100 dead bytes that compaction can reclaim. 
Pick a tuple + // size that exceeds contigFree but fits after a's bytes are recovered — + // compaction is then required to satisfy the insert. + const size_t contig = PAGE_SIZE + - SlottedPage::HEADER_SIZE + - 3 * SlottedPage::SLOT_SIZE + - 300; + std::vector filler(contig + 50, 'F'); + auto sf = sp.insert(filler.data(), filler.size()); + REQUIRE(sf); + CHECK(*sf == 0); // reused a's tombstoned slot id + + // Slot ids 1 and 2 (b and c) must still resolve to the original bytes + // even though compaction relocated them. + CHECK(slotEquals(sp, *sb, std::string(b.begin(), b.end()))); + CHECK(slotEquals(sp, *sc, std::string(c.begin(), c.end()))); +} + +TEST_CASE("SlottedPage round-trips through BufferPool and DiskManager") { + TempFile tf; + DiskManager dm(tf.path()); + + // Insert a few tuples, modify, evict the page, then reload via a fresh + // BufferPool and a fresh DiskManager and verify everything is readable. + PageId pid; + { + BufferPool bp(2, &dm); + PageGuard g = bp.pinNew(); + pid = g->page_id; + + SlottedPage sp(g->data); + sp.init(); + auto a = tup("hello"); + auto b = tup("world"); + REQUIRE(sp.insert(a.data(), a.size())); + REQUIRE(sp.insert(b.data(), b.size())); + g.markDirty(); + + bp.flushAll(); + } + + { + DiskManager dm2(tf.path()); + BufferPool bp(2, &dm2); + PageGuard g = bp.pin(pid); + + SlottedPage sp(g->data); + CHECK(sp.numSlots() == 2); + CHECK(slotEquals(sp, 0, "hello")); + CHECK(slotEquals(sp, 1, "world")); + } +} + +TEST_CASE("a tuple of exactly MAX_TUPLE_SIZE fits on a fresh page") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + std::vector big(SlottedPage::MAX_TUPLE_SIZE, 'M'); + auto sid = sp.insert(big.data(), big.size()); + REQUIRE(sid); + CHECK(*sid == 0); + + auto [p, len] = sp.get(*sid); + REQUIRE(p != nullptr); + REQUIRE(len == big.size()); + CHECK(std::memcmp(p, big.data(), big.size()) == 0); +} + +TEST_CASE("many one-byte tuples fill the page; all are independently readable") { + std::array page{}; + 
SlottedPage sp(page.data()); + sp.init(); + + std::vector ids; + for (int i = 0; i < 200; ++i) { + const char b = static_cast(i & 0xff); + auto sid = sp.insert(&b, 1); + if (!sid) break; + ids.push_back(*sid); + } + REQUIRE(ids.size() >= 100); // sanity floor; the page should fit many + + for (size_t i = 0; i < ids.size(); ++i) { + auto [p, len] = sp.get(ids[i]); + REQUIRE(p != nullptr); + REQUIRE(len == 1); + CHECK(*p == static_cast(i & 0xff)); + } +} + +TEST_CASE("update shrink-then-grow preserves slot id and final content") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto t = tup("MMMMMMMMMM"); // 10 bytes + auto sid = sp.insert(t.data(), t.size()); + REQUIRE(sid); + + auto small = tup("xy"); + REQUIRE(sp.update(*sid, small.data(), small.size())); + CHECK(slotEquals(sp, *sid, "xy")); + + auto larger = tup("now-this-tuple-is-quite-a-bit-larger"); + REQUIRE(sp.update(*sid, larger.data(), larger.size())); + CHECK(slotEquals(sp, *sid, "now-this-tuple-is-quite-a-bit-larger")); +} + +TEST_CASE("removing every tuple makes all tuple bytes reclaimable") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + constexpr int N = 6; + constexpr size_t kTupleLen = 200; + + std::vector ids; + for (int i = 0; i < N; ++i) { + std::vector data(kTupleLen, static_cast('a' + i)); + auto sid = sp.insert(data.data(), data.size()); + REQUIRE(sid); + ids.push_back(*sid); + } + for (auto sid : ids) REQUIRE(sp.remove(sid)); + + // Slot ids stay reserved after remove (tombstones), so freeSpace is the + // page minus header minus the tombstoned slot array. + const size_t expected = PAGE_SIZE - SlottedPage::HEADER_SIZE + - static_cast(N) * SlottedPage::SLOT_SIZE; + CHECK(sp.freeSpace() == expected); + + // We can insert a tuple of size `expected`: slot 0 is the lowest + // tombstone and gets reused (so no extra slot space is needed), and + // compaction reclaims the dead bytes in the tuple area. 
+ std::vector filler(expected, 'Z'); + auto sf = sp.insert(filler.data(), filler.size()); + REQUIRE(sf); + CHECK(*sf == 0); + + auto [p, len] = sp.get(*sf); + REQUIRE(p != nullptr); + REQUIRE(len == expected); + CHECK(std::memcmp(p, filler.data(), expected) == 0); +} + +TEST_CASE("insert reuses the LOWEST tombstoned slot id, not an arbitrary one") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + auto a = tup("a"); + auto b = tup("b"); + auto c = tup("c"); + auto d = tup("d"); + auto sa = sp.insert(a.data(), 1); REQUIRE(sa); + auto sb = sp.insert(b.data(), 1); REQUIRE(sb); + auto sc = sp.insert(c.data(), 1); REQUIRE(sc); + auto sd = sp.insert(d.data(), 1); REQUIRE(sd); + + REQUIRE(sp.remove(*sb)); // tombstone slot 1 + REQUIRE(sp.remove(*sc)); // tombstone slot 2 + + auto repl = tup("R"); + auto sr = sp.insert(repl.data(), 1); + REQUIRE(sr); + CHECK(*sr == *sb); // slot 1, not slot 2 — the lowest tombstone wins +} + +TEST_CASE("fill-empty-fill cycle reuses bytes correctly") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + // First fill: insert until the page is full. + std::vector ids; + for (int i = 0; ; ++i) { + std::vector data(50, static_cast('A' + (i % 26))); + auto sid = sp.insert(data.data(), data.size()); + if (!sid) break; + ids.push_back(*sid); + } + REQUIRE(ids.size() > 10); + + // Empty: tombstone everything. + for (auto sid : ids) REQUIRE(sp.remove(sid)); + + // Refill: every fresh insert should now succeed (compaction reclaims + // dead bytes). Verify each new tuple round-trips. 
+ std::vector new_ids; + for (int i = 0; i < static_cast(ids.size()); ++i) { + std::vector data(50, static_cast('z' - (i % 26))); + auto sid = sp.insert(data.data(), data.size()); + REQUIRE(sid); + new_ids.push_back(*sid); + } + for (size_t i = 0; i < new_ids.size(); ++i) { + auto [p, len] = sp.get(new_ids[i]); + REQUIRE(p != nullptr); + REQUIRE(len == 50); + for (size_t j = 0; j < 50; ++j) { + REQUIRE(p[j] == static_cast('z' - (static_cast(i) % 26))); + } + } +} + +TEST_CASE("randomized insert/remove/update sequence stays consistent with a model") { + std::array page{}; + SlottedPage sp(page.data()); + sp.init(); + + std::mt19937 rng(0xC0FFEEu); + std::uniform_int_distribution len_dist(1, 120); + std::uniform_int_distribution byte_dist(0, 255); + + // Source of truth: what we *expect* each live slot to currently hold. + std::map> model; + + for (int op = 0; op < 2000; ++op) { + const int choice = rng() % 4; + if (model.empty() || choice == 0) { + // INSERT + std::vector data(len_dist(rng)); + for (auto& c : data) c = static_cast(byte_dist(rng)); + auto sid = sp.insert(data.data(), data.size()); + if (sid) model[*sid] = std::move(data); + // else page genuinely full; future removes will free space. + } else if (choice == 1) { + // REMOVE + auto it = std::next(model.begin(), + rng() % static_cast(model.size())); + REQUIRE(sp.remove(it->first)); + model.erase(it); + } else if (choice == 2) { + // UPDATE + auto it = std::next(model.begin(), + rng() % static_cast(model.size())); + std::vector data(len_dist(rng)); + for (auto& c : data) c = static_cast(byte_dist(rng)); + if (sp.update(it->first, data.data(), data.size())) { + it->second = std::move(data); + } + // else update didn't fit; model unchanged. 
+ } else { + // VERIFY a random live slot + auto it = std::next(model.begin(), + rng() % static_cast(model.size())); + auto [p, len] = sp.get(it->first); + REQUIRE(p != nullptr); + REQUIRE(len == it->second.size()); + REQUIRE(std::memcmp(p, it->second.data(), len) == 0); + } + } + + // Final sweep: every remaining slot in the model must round-trip. + for (const auto& [sid, expected] : model) { + auto [p, len] = sp.get(sid); + REQUIRE(p != nullptr); + REQUIRE(len == expected.size()); + REQUIRE(std::memcmp(p, expected.data(), len) == 0); + } +} From d781cb1e8bc05444873915163d5e3a3f06bf7b97 Mon Sep 17 00:00:00 2001 From: Jay Phan Date: Sun, 26 Apr 2026 13:52:45 -0400 Subject: [PATCH 4/5] heap file layer --- Makefile | 5 +- src/storage/heap_file.cpp | 114 +++++++++++ src/storage/heap_file.h | 107 ++++++++++ src/storage/slotted_page.cpp | 18 +- src/storage/slotted_page.h | 18 +- tests/storage/test_heap_file.cpp | 310 +++++++++++++++++++++++++++++ tests/storage/test_integration.cpp | 72 +++++++ 7 files changed, 634 insertions(+), 10 deletions(-) create mode 100644 src/storage/heap_file.cpp create mode 100644 src/storage/heap_file.h create mode 100644 tests/storage/test_heap_file.cpp create mode 100644 tests/storage/test_integration.cpp diff --git a/Makefile b/Makefile index b0d3392..c77ad2a 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,13 @@ TEST_OBJS = $(BUILD_DIR)/tests/test_parser.o \ $(BUILD_DIR)/tests/storage/test_disk_manager.o \ $(BUILD_DIR)/tests/storage/test_buffer_pool.o \ $(BUILD_DIR)/tests/storage/test_slotted_page.o \ + $(BUILD_DIR)/tests/storage/test_heap_file.o \ + $(BUILD_DIR)/tests/storage/test_integration.o \ $(BUILD_DIR)/src/parser.o \ $(BUILD_DIR)/src/storage/disk_manager.o \ $(BUILD_DIR)/src/storage/buffer_pool.o \ - $(BUILD_DIR)/src/storage/slotted_page.o + $(BUILD_DIR)/src/storage/slotted_page.o \ + $(BUILD_DIR)/src/storage/heap_file.o dbms: $(DBMS_OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ diff --git a/src/storage/heap_file.cpp 
b/src/storage/heap_file.cpp new file mode 100644 index 0000000..00855d7 --- /dev/null +++ b/src/storage/heap_file.cpp @@ -0,0 +1,114 @@ +#include "src/storage/heap_file.h" + +#include +#include + +HeapFile::HeapFile(BufferPool* bp, PageId first_page_id) + : bp_(bp), first_page_(first_page_id) {} + +HeapFile HeapFile::create(BufferPool* bp) { + PageGuard g = bp->pinNew(); + SlottedPage sp(g->data); + sp.init(); + g.markDirty(); + return HeapFile(bp, g->page_id); +} + +RID HeapFile::insert(const char* tuple, size_t len) { + if (len == 0 || len > SlottedPage::MAX_TUPLE_SIZE) { + throw std::runtime_error("HeapFile::insert: invalid tuple length " + + std::to_string(len)); + } + + // Walk the chain looking for a page that can hold this tuple. Track the + // tail in case we need to allocate a new page and link to it. + PageId current = first_page_; + PageId tail = first_page_; + while (current != INVALID_PAGE_ID) { + PageGuard g = bp_->pin(current); + SlottedPage sp(g->data); + auto sid = sp.insert(tuple, len); + if (sid) { + g.markDirty(); + return RID{current, *sid}; + } + tail = current; + current = sp.nextPageId(); + } + + // No page in the chain has room. Allocate a new page, insert into it, + // then re-pin the tail to update its next pointer. Doing the work in + // two stages avoids pinning two pages simultaneously, so this stays + // correct even on a single-frame buffer pool. + PageId new_pid; + SlotId new_sid; + { + PageGuard g = bp_->pinNew(); + new_pid = g->page_id; + SlottedPage sp(g->data); + sp.init(); + auto sid = sp.insert(tuple, len); + if (!sid) { + // MAX_TUPLE_SIZE check above means this is a bug, not user error. 
+ throw std::runtime_error("HeapFile::insert: tuple did not fit on a fresh page"); + } + new_sid = *sid; + g.markDirty(); + } + { + PageGuard g = bp_->pin(tail); + SlottedPage sp(g->data); + sp.setNextPageId(new_pid); + g.markDirty(); + } + return RID{new_pid, new_sid}; +} + +bool HeapFile::get(RID rid, std::string* out) { + PageGuard g = bp_->pin(rid.page_id); + SlottedPage sp(g->data); + auto [p, len] = sp.get(rid.slot_id); + if (p == nullptr) return false; + out->assign(p, len); + return true; +} + +bool HeapFile::remove(RID rid) { + PageGuard g = bp_->pin(rid.page_id); + SlottedPage sp(g->data); + bool ok = sp.remove(rid.slot_id); + if (ok) g.markDirty(); + return ok; +} + +HeapFile::Iterator::Iterator(BufferPool* bp, PageId start) + : bp_(bp), cur_page_(start), cur_slot_(0) { + advanceToLive(); +} + +HeapFile::Iterator& HeapFile::Iterator::operator++() { + ++cur_slot_; + advanceToLive(); + return *this; +} + +void HeapFile::Iterator::advanceToLive() { + while (cur_page_ != INVALID_PAGE_ID) { + PageGuard g = bp_->pin(cur_page_); + SlottedPage sp(g->data); + const size_t n = sp.numSlots(); + while (cur_slot_ < n) { + auto [p, len] = sp.get(cur_slot_); + if (p != nullptr) { + entry_.first = RID{cur_page_, cur_slot_}; + entry_.second.assign(p, len); + return; + } + ++cur_slot_; + } + cur_page_ = sp.nextPageId(); + cur_slot_ = 0; + } + // Exhausted: cur_page_ == INVALID_PAGE_ID, cur_slot_ == 0 — matches + // the default-constructed end iterator. +} diff --git a/src/storage/heap_file.h b/src/storage/heap_file.h new file mode 100644 index 0000000..3844b0e --- /dev/null +++ b/src/storage/heap_file.h @@ -0,0 +1,107 @@ +#pragma once + +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/slotted_page.h" + +#include +#include +#include + +// Record identifier — locates a single tuple in a heap file. 
+// Slot ids are stable for the lifetime of the slot, so an RID handed back +// from insert remains valid until the tuple is removed. +struct RID { + PageId page_id; + SlotId slot_id; + + bool operator==(const RID& o) const { + return page_id == o.page_id && slot_id == o.slot_id; + } + bool operator!=(const RID& o) const { return !(*this == o); } + bool operator<(const RID& o) const { + return page_id != o.page_id ? page_id < o.page_id : slot_id < o.slot_id; + } +}; + +// Layer 4 of the storage stack: a heap file is an unordered collection of +// tuples spread across one or more slotted pages chained together via the +// `next_page_id` field of each page's header. +// +// Usage: +// +// // First time: +// HeapFile hf = HeapFile::create(&bp); +// PageId root = hf.firstPageId(); // stash this somewhere durable +// RID r = hf.insert(data, len); +// +// // Later, possibly in another process, after reopening the file: +// HeapFile hf2(&bp, root); +// for (auto it = hf2.begin(); it != hf2.end(); ++it) { +// const auto& [rid, bytes] = *it; +// ... +// } +// +// Thread-safety: not thread-safe. Concurrency belongs to a higher layer. +class HeapFile { +public: + // Open an existing heap file. The page at `first_page_id` must already + // have been initialized as a slotted page (typically by create()). + HeapFile(BufferPool* bp, PageId first_page_id); + + // Create a brand new heap file by allocating a fresh first page from + // `bp`'s disk manager. Persist the returned firstPageId() to find this + // file again later. + static HeapFile create(BufferPool* bp); + + // Insert a tuple. Allocates and links a new page if no existing page + // has room. Throws if `len == 0` or `len > SlottedPage::MAX_TUPLE_SIZE`. + RID insert(const char* tuple, size_t len); + + // Read a tuple by RID. Returns false (and leaves *out untouched) if the + // slot is tombstoned. The bytes are copied into *out. + bool get(RID rid, std::string* out); + + // Tombstone the tuple at `rid`. 
Returns false if it was already dead. + bool remove(RID rid); + + PageId firstPageId() const { return first_page_; } + + // Forward iterator yielding every live tuple in the file in + // (page, slot) order. The iterator does not hold a pin between calls, + // so you can hold many of them and freely interleave with other + // HeapFile operations. + class Iterator { + public: + using Entry = std::pair; + + Iterator() = default; // end iterator + Iterator(BufferPool* bp, PageId start); + + const Entry& operator*() const { return entry_; } + const Entry* operator->() const { return &entry_; } + Iterator& operator++(); + bool operator==(const Iterator& o) const { + return cur_page_ == o.cur_page_ && cur_slot_ == o.cur_slot_; + } + bool operator!=(const Iterator& o) const { return !(*this == o); } + + private: + BufferPool* bp_ = nullptr; + PageId cur_page_ = INVALID_PAGE_ID; + SlotId cur_slot_ = 0; + Entry entry_; + + // Walk forward from (cur_page_, cur_slot_) to the next live tuple, + // crossing page boundaries as needed. Sets the state to end if no + // live tuple remains. 
+ void advanceToLive(); + }; + + Iterator begin() { return Iterator(bp_, first_page_); } + Iterator end() { return Iterator(); } + +private: + BufferPool* bp_; + PageId first_page_; +}; diff --git a/src/storage/slotted_page.cpp b/src/storage/slotted_page.cpp index e501ea5..2ca9457 100644 --- a/src/storage/slotted_page.cpp +++ b/src/storage/slotted_page.cpp @@ -11,6 +11,12 @@ uint16_t loadU16(const char* p) { return v; } +uint32_t loadU32(const char* p) { + uint32_t v; + std::memcpy(&v, p, sizeof(v)); + return v; +} + void storeU16(char* p, uint16_t v) { std::memcpy(p, &v, sizeof(v)); } @@ -21,10 +27,13 @@ void storeU32(char* p, uint32_t v) { } // namespace -uint16_t SlottedPage::numSlotsRaw() const { return loadU16(data_ + 4); } -void SlottedPage::setNumSlots(uint16_t v) { storeU16(data_ + 4, v); } -uint16_t SlottedPage::freeSpaceOffset() const { return loadU16(data_ + 6); } -void SlottedPage::setFreeSpaceOffset(uint16_t v) { storeU16(data_ + 6, v); } +uint16_t SlottedPage::numSlotsRaw() const { return loadU16(data_ + 8); } +void SlottedPage::setNumSlots(uint16_t v) { storeU16(data_ + 8, v); } +uint16_t SlottedPage::freeSpaceOffset() const { return loadU16(data_ + 10); } +void SlottedPage::setFreeSpaceOffset(uint16_t v) { storeU16(data_ + 10, v); } + +PageId SlottedPage::nextPageId() const { return loadU32(data_ + 4); } +void SlottedPage::setNextPageId(PageId next) { storeU32(data_ + 4, next); } SlottedPage::SlotEntry SlottedPage::readSlot(SlotId i) const { const char* p = data_ + HEADER_SIZE + static_cast(i) * SLOT_SIZE; @@ -60,6 +69,7 @@ size_t SlottedPage::deadTupleBytes() const { void SlottedPage::init() { storeU32(data_ + 0, 0); // lsn + storeU32(data_ + 4, INVALID_PAGE_ID); // next_page_id setNumSlots(0); setFreeSpaceOffset(static_cast(PAGE_SIZE)); // Zero the rest of the page so reads from uninitialized regions are diff --git a/src/storage/slotted_page.h b/src/storage/slotted_page.h index 6a21c52..690a66a 100644 --- a/src/storage/slotted_page.h +++ 
b/src/storage/slotted_page.h @@ -35,7 +35,12 @@ using SlotId = uint16_t; class SlottedPage { public: // Number of bytes the header occupies at the start of the page. - static constexpr size_t HEADER_SIZE = 8; + // Layout: + // [0..4) lsn (uint32, reserved for recovery) + // [4..8) next_page_id (PageId, INVALID_PAGE_ID for the chain tail) + // [8..10) num_slots (uint16) + // [10..12) free_space_offset (uint16) + static constexpr size_t HEADER_SIZE = 12; // Number of bytes per slot entry in the slot array. static constexpr size_t SLOT_SIZE = 4; @@ -81,13 +86,16 @@ class SlottedPage { // insert without a free tombstone would receive slot id `numSlots()`. size_t numSlots() const; + // Page-chain pointer used by higher layers (HeapFile) to link pages. + // Lives in the page header so it persists with the page bytes. + // INVALID_PAGE_ID marks the tail of the chain. + PageId nextPageId() const; + void setNextPageId(PageId next); + private: char* data_; - // Header field accessors. The header lives at offset 0 with layout: - // [0..4) lsn (uint32_t, currently always 0; reserved for recovery) - // [4..6) num_slots (uint16_t) - // [6..8) free_space_offset (uint16_t) — where the next tuple goes + // Header field accessors. See HEADER_SIZE comment for layout. uint16_t numSlotsRaw() const; void setNumSlots(uint16_t v); uint16_t freeSpaceOffset() const; diff --git a/tests/storage/test_heap_file.cpp b/tests/storage/test_heap_file.cpp new file mode 100644 index 0000000..ce0b4b9 --- /dev/null +++ b/tests/storage/test_heap_file.cpp @@ -0,0 +1,310 @@ +#include "tests/vendor/doctest.h" + +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/heap_file.h" +#include "src/storage/slotted_page.h" +#include "tests/test_util.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +// Collect every (rid, tuple) pair yielded by a heap file's iterator. 
+std::vector scanAll(HeapFile& hf) { + std::vector out; + for (auto it = hf.begin(); it != hf.end(); ++it) { + out.push_back(*it); + } + return out; +} + +// Insert helper that takes a string and returns the RID. +RID insertStr(HeapFile& hf, const std::string& s) { + return hf.insert(s.data(), s.size()); +} + +} // namespace + +TEST_CASE("HeapFile::create + insert + get round trip (usage demo)") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + + HeapFile hf = HeapFile::create(&bp); + RID r = insertStr(hf, "hello world"); + + std::string out; + REQUIRE(hf.get(r, &out)); + CHECK(out == "hello world"); + + // The freshly created file's first page is page 0. + CHECK(hf.firstPageId() == 0); + CHECK(r.page_id == 0); +} + +TEST_CASE("iterator on an empty file yields nothing") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + CHECK(hf.begin() == hf.end()); + CHECK(scanAll(hf).empty()); +} + +TEST_CASE("iterator yields every inserted tuple, in (page, slot) order") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + const std::vector tuples = {"alpha", "beta", "gamma", "delta", "epsilon"}; + std::vector rids; + for (const auto& s : tuples) rids.push_back(insertStr(hf, s)); + + auto entries = scanAll(hf); + REQUIRE(entries.size() == tuples.size()); + + // Iteration order matches the order of slot ids on each page; on a + // single page that's the order tuples were inserted. + for (size_t i = 0; i < tuples.size(); ++i) { + CHECK(entries[i].first == rids[i]); + CHECK(entries[i].second == tuples[i]); + } +} + +TEST_CASE("inserting beyond a single page allocates and links new pages") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + // Each tuple is roughly half a page, so two fit per page comfortably. + // Inserting many of them must spill into multiple chained pages. 
+ std::vector blob(2000, 'q'); + std::vector rids; + for (int i = 0; i < 10; ++i) { + blob[0] = static_cast('A' + i); // distinguishes the tuples + rids.push_back(hf.insert(blob.data(), blob.size())); + } + + // The chain definitely grew beyond the first page. + std::set pages_used; + for (auto r : rids) pages_used.insert(r.page_id); + CHECK(pages_used.size() > 1); + + // Every tuple must still round trip. + std::string out; + for (size_t i = 0; i < rids.size(); ++i) { + REQUIRE(hf.get(rids[i], &out)); + REQUIRE(out.size() == blob.size()); + CHECK(out[0] == static_cast('A' + i)); + } +} + +TEST_CASE("remove makes get return false and the iterator skip the slot") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + RID a = insertStr(hf, "a"); + RID b = insertStr(hf, "b"); + RID c = insertStr(hf, "c"); + + REQUIRE(hf.remove(b)); + CHECK_FALSE(hf.remove(b)); // double-remove + + std::string out; + CHECK(hf.get(a, &out)); + CHECK(out == "a"); + CHECK_FALSE(hf.get(b, &out)); + CHECK(hf.get(c, &out)); + CHECK(out == "c"); + + auto entries = scanAll(hf); + REQUIRE(entries.size() == 2); + CHECK(entries[0].first == a); + CHECK(entries[0].second == "a"); + CHECK(entries[1].first == c); + CHECK(entries[1].second == "c"); +} + +TEST_CASE("iterator advances across an entirely-tombstoned page") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + // Fill the first page with chunky tuples so the next insert spills + // onto a fresh page. + std::vector filler(1500, 'x'); + std::vector first_page_rids; + for (int i = 0; i < 3; ++i) { + first_page_rids.push_back(hf.insert(filler.data(), filler.size())); + } + + // Insert a small tuple. Either it fits on the first page or rolls onto + // a new page; either way, removing every tuple on the first page leaves + // the iterator with a fully-tombstoned page to scan past. 
+ RID after = insertStr(hf, "afterwards"); + + for (RID r : first_page_rids) REQUIRE(hf.remove(r)); + + auto entries = scanAll(hf); + REQUIRE(entries.size() == 1); + CHECK(entries[0].first == after); + CHECK(entries[0].second == "afterwards"); +} + +TEST_CASE("RID comparisons behave the way tests expect") { + RID a{0, 0}, b{0, 1}, c{1, 0}; + CHECK(a == a); + CHECK(a != b); + CHECK(a < b); + CHECK(b < c); + CHECK_FALSE(c < a); +} + +TEST_CASE("insert rejects len == 0 and len > MAX_TUPLE_SIZE") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + CHECK_THROWS_AS(hf.insert(nullptr, 0), std::runtime_error); + + std::vector oversize(SlottedPage::MAX_TUPLE_SIZE + 1, 'x'); + CHECK_THROWS_AS(hf.insert(oversize.data(), oversize.size()), std::runtime_error); +} + +TEST_CASE("data persists across BufferPool and DiskManager lifetimes") { + TempFile tf; + + PageId root_pid; + std::map model; + { + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + root_pid = hf.firstPageId(); + + // Sized to span a few pages. + std::vector blob(1500, '.'); + for (int i = 0; i < 8; ++i) { + blob[0] = static_cast('A' + i); + RID r = hf.insert(blob.data(), blob.size()); + model[r] = std::string(blob.begin(), blob.end()); + } + + bp.flushAll(); + } + + // Reopen and verify everything is recoverable from `root_pid` alone. + DiskManager dm2(tf.path()); + BufferPool bp(4, &dm2); + HeapFile hf(&bp, root_pid); + + auto entries = scanAll(hf); + REQUIRE(entries.size() == model.size()); + + std::map got; + for (const auto& [rid, bytes] : entries) got[rid] = bytes; + CHECK(got == model); +} + +TEST_CASE("free-space reuse: removing a tuple lets a same-sized one fit on the same page") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + // Fill the first page. 
+ std::vector rids; + std::vector blob(1000, 'X'); + for (int i = 0; i < 4; ++i) rids.push_back(hf.insert(blob.data(), blob.size())); + + // The next insert at this size would normally spill — remove one and + // verify the freed space is reused on the same page. + REQUIRE(hf.remove(rids[1])); + + RID r = hf.insert(blob.data(), blob.size()); + CHECK(r.page_id == rids[1].page_id); // same page + // The lowest tombstoned slot should be reused. + CHECK(r.slot_id == rids[1].slot_id); +} + +TEST_CASE("range-based for iteration works (STL conformance)") { + TempFile tf; + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + + insertStr(hf, "one"); + insertStr(hf, "two"); + insertStr(hf, "three"); + + std::vector seen; + for (auto& entry : hf) { + seen.push_back(entry.second); + } + REQUIRE(seen.size() == 3); + CHECK(std::count(seen.begin(), seen.end(), std::string("one")) == 1); + CHECK(std::count(seen.begin(), seen.end(), std::string("two")) == 1); + CHECK(std::count(seen.begin(), seen.end(), std::string("three")) == 1); +} + +TEST_CASE("stress: many tuples across many pages with eviction-heavy buffer pool") { + TempFile tf; + DiskManager dm(tf.path()); + // Tiny pool relative to working set forces lots of eviction during the + // workload; the heap file API must keep things consistent throughout. + BufferPool bp(2, &dm); + HeapFile hf = HeapFile::create(&bp); + + constexpr int N = 600; + std::mt19937 rng(0xCAFEBABEu); + std::uniform_int_distribution len_dist(1, 80); + std::uniform_int_distribution byte_dist(0, 255); + + std::map model; + for (int i = 0; i < N; ++i) { + std::string data(len_dist(rng), '\0'); + for (auto& c : data) c = static_cast(byte_dist(rng)); + RID r = hf.insert(data.data(), data.size()); + REQUIRE(model.emplace(r, std::move(data)).second); // RIDs unique + } + + // Tombstone roughly a third of them and verify get / iterator agree. 
+ std::vector all_rids; + for (const auto& [rid, _] : model) all_rids.push_back(rid); + std::shuffle(all_rids.begin(), all_rids.end(), rng); + for (size_t i = 0; i < all_rids.size() / 3; ++i) { + REQUIRE(hf.remove(all_rids[i])); + model.erase(all_rids[i]); + } + + // get() agrees with the model on every surviving RID. + std::string out; + for (const auto& [rid, expected] : model) { + REQUIRE(hf.get(rid, &out)); + REQUIRE(out == expected); + } + // get() refuses every removed RID. + for (size_t i = 0; i < all_rids.size() / 3; ++i) { + CHECK_FALSE(hf.get(all_rids[i], &out)); + } + + // Iterator yields exactly the surviving tuples. + std::map seen; + for (const auto& [rid, bytes] : hf) seen[rid] = bytes; + CHECK(seen == model); +} diff --git a/tests/storage/test_integration.cpp b/tests/storage/test_integration.cpp new file mode 100644 index 0000000..b6b7f10 --- /dev/null +++ b/tests/storage/test_integration.cpp @@ -0,0 +1,72 @@ +#include "tests/vendor/doctest.h" + +#include "src/storage/buffer_pool.h" +#include "src/storage/disk_manager.h" +#include "src/storage/heap_file.h" +#include "tests/test_util.h" + +#include +#include +#include +#include + +// End-to-end persistence test: load 1000 rows into a heap file, simulate a +// program restart by destroying every storage object and reopening on the +// same file, then scan the rows back. Every row must round-trip exactly, +// and the iterator must yield the same set as the in-memory model. +TEST_CASE("integration: load 1000 rows, restart, scan them back") { + TempFile tf; + constexpr int N = 1000; + + PageId root_pid; + std::map expected; + + // ----- Phase 1: load 1000 rows and shut down ----- + { + DiskManager dm(tf.path()); + // Small pool relative to the working set forces lots of eviction + // during the load, exercising the dirty-write-back path. 
+ BufferPool bp(4, &dm); + HeapFile hf = HeapFile::create(&bp); + root_pid = hf.firstPageId(); + + std::mt19937 rng(0xDEC1DEDu); + std::uniform_int_distribution len_dist(1, 200); + std::uniform_int_distribution byte_dist(0, 255); + + for (int i = 0; i < N; ++i) { + std::string row(len_dist(rng), '\0'); + for (auto& c : row) c = static_cast(byte_dist(rng)); + RID r = hf.insert(row.data(), row.size()); + REQUIRE(expected.emplace(r, std::move(row)).second); // RIDs unique + } + REQUIRE(expected.size() == static_cast(N)); + + // Flush every dirty frame before shutdown — without this, writes still + // sitting in dirty, not-yet-written-back frames would be lost. + bp.flushAll(); + } + // BufferPool and DiskManager destructors run here, closing the file. + // Anything still in memory is gone — we have nothing but `tf.path()` and + // `root_pid` to find the data again. + + // ----- Phase 2: cold reopen and verify ----- + DiskManager dm(tf.path()); + BufferPool bp(4, &dm); + HeapFile hf(&bp, root_pid); + + // Iterator yields exactly the rows we inserted, with matching RIDs. + std::map seen; + for (const auto& [rid, bytes] : hf) { + REQUIRE(seen.emplace(rid, bytes).second); + } + REQUIRE(seen.size() == expected.size()); + CHECK(seen == expected); + + // Random-access get() resolves every original RID to the exact bytes. + std::string out; + for (const auto& [rid, bytes] : expected) { + REQUIRE(hf.get(rid, &out)); + REQUIRE(out == bytes); + } +} From 60b990f013672a98b5c13c50cb2f394900de26d6 Mon Sep 17 00:00:00 2001 From: Jay Phan Date: Sun, 26 Apr 2026 13:55:23 -0400 Subject: [PATCH 5/5] add readme to storage --- src/storage/README.md | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/storage/README.md diff --git a/src/storage/README.md b/src/storage/README.md new file mode 100644 index 0000000..5a1bd08 --- /dev/null +++ b/src/storage/README.md @@ -0,0 +1,64 @@ +# Storage + +Four-layer stack. 
Each layer exposes a small surface to the one above and +hides everything below. + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ heap_file.h HeapFile table abstraction; insert / get / scan │ +├──────────────────────────────────────────────────────────────────┤ +│ slotted_page.h SlottedPage byte layout inside one page │ +├──────────────────────────────────────────────────────────────────┤ +│ buffer_pool.h BufferPool page cache with LRU eviction + pinning │ +├──────────────────────────────────────────────────────────────────┤ +│ disk_manager.h DiskManager read/write fixed-size pages on a file │ +└──────────────────────────────────────────────────────────────────┘ +``` + +## What each layer owns + +- **DiskManager** — one file, page id ↔ byte offset. Has no opinion on what's + in a page. +- **BufferPool** — fixed array of `Frame`s caching pages from disk. Hands out + `Frame*` via `pin()`/`pinNew()`, returning a `PageGuard` (RAII; calls + `unpinPage` on scope exit). Evicts the LRU unpinned frame on a miss; writes + dirty frames back when evicted or on `flushAll()`. +- **SlottedPage** — wraps a `char*` of `PAGE_SIZE` bytes. Header + slot array + growing forward, tuple bytes packed at the high end. Slot ids are stable + across compaction; `remove` tombstones; insert auto-compacts when needed. + The 12-byte header includes a `next_page_id` field used by HeapFile to + chain pages. +- **HeapFile** — unordered collection of tuples spread across a chain of + slotted pages. Allocates and links new pages as the chain fills. RIDs + (`{page_id, slot_id}`) are stable for the life of a tuple. Provides a + forward iterator that yields every live tuple in (page, slot) order. 
+ +## Insert flow + +``` +HeapFile::insert(bytes, len) + └─> walk page chain via SlottedPage::nextPageId() + └─> BufferPool::pin(page) → Frame + └─> SlottedPage::insert(bytes, len) → SlotId + ↑ if no page has room: BufferPool::pinNew() + and link via setNextPageId() +``` + +A miss inside `pin()` triggers `DiskManager::readPage()`; an eviction of a +dirty frame triggers `DiskManager::writePage()`. + +## Read flow + +``` +HeapFile::get(rid) HeapFile::Iterator::operator++ + BufferPool::pin(rid.page) BufferPool::pin(cur_page) + SlottedPage::get(rid.slot) SlottedPage::get(cur_slot) + → if exhausted: cur_page = nextPageId() +``` + +## Tests + +Each layer has its own test file under `tests/storage/`. The end-to-end +persistence test lives in `tests/storage/test_integration.cpp` (loads 1000 +rows, simulates a program restart by destroying every storage object, then +scans the rows back).