diff --git a/README.md b/README.md
index 4aded2a..2ce6fae 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # top4grep
-A grep tool for the top 4 security conferences
+A grep tool for the top 4 security conferences, with cached raw fetches for repeatable literature review.
 
 ## Installation
 ```
@@ -15,20 +15,99 @@ If you want to update the papers stored in `papers.db`, you can recreate it with
 top4grep --build-db
 ```
 
-Which will build the db wherever you run it.
+The database and all cached raw fetches live under `top4grep/data/` inside the package directory, not in the current working directory.
+
+Useful build filters:
+```bash
+top4grep --build-db --conference "IEEE S&P" --conference CCS --year-from 2021 --year-to 2026
+top4grep --build-db --abstract --year-from 2023 --year-to 2024
+```
+
+### Cache Bundle Bootstrap
+You can package the local database plus all cached raw fetches into one zip file:
+
+```bash
+top4grep --export-cache-bundle /tmp/top4grep-cache-bundle.zip
+```
+
+Current published bundle cutoff:
+
+- snapshot date: `2026-03-12`
+- download: `https://drive.google.com/file/d/1vAeDHNnDDKyXWPQTGLnlWnQchoRt9-_s/view?usp=sharing`
+- expected usage: install the bundle first, then run incremental refreshes for anything added or corrected after `2026-03-12`
+
+The bundle contains:
+
+- `papers.db`
+- `raw/dblp/...`
+- `raw/publisher_html/...`
+- `raw/openalex/...`
+- `raw/semantic_scholar/...`
+- `raw/pdf/...`
+
+To install a downloaded bundle into the correct local package data directory:
+
+```bash
+top4grep --install-cache-bundle ~/Downloads/top4grep-cache-bundle.zip
+```
+
+To remove the existing `papers.db` and raw cache before installing, so that the downloaded snapshot fully replaces your local data:
+
+```bash
+top4grep --install-cache-bundle ~/Downloads/top4grep-cache-bundle.zip --replace-data
+```
+
+After installing a bundle, you only need incremental refreshes for newer years instead of cold-crawling the full history again. For example:
+
+```bash
+top4grep --build-db --abstract --year-from 2024
+```
+
+Notes:
+
+- `--abstract` fills in missing abstracts on existing rows in place; a paper whose abstract is still unavailable after all configured fallbacks is kept with an empty abstract
+- rebuilds also remove stale rows that no longer appear in the latest source data
+- the first abstract build for a new slice can take noticeably longer because publisher and OpenAlex data are fetched live
+- rerunning the same abstract build is much faster because raw responses are cached under `top4grep/data/raw/`
+- stored titles and authors are normalized automatically, so escaped HTML entities such as `&quot;` do not create duplicate rows
+- abstract recovery currently tries publisher HTML, OpenAlex, Semantic Scholar, and PDF extraction from open conference paper links
 
 ### Query
 ```bash
-top4grep -k <kerywords>
+top4grep -k <keywords>
 ```
 
-For example, `python top4grep.py -k linux,kernel`
-Currently, the query is just a case-insensitive match (just like grep). The returned results must contains all the input keywords (papers containing keyword1 AND keyword2 AND ...). Support for `OR` operation (papers containing keyword1 OR keyword2) is missing, but will be added in the future.
+For example:
+
+```bash
+top4grep -k linux,kernel
+top4grep -k "linux,kernel|driver" --field all --year-from 2021
+top4grep -k "supply,chain|dependency" --abstract --conference CCS
+```
+
+Query semantics:
+
+- `,` means `AND`
+- `|` means `OR`
+- matching is case-insensitive and stemmed, so `exploiting` matches `exploit`
+- `--field` supports `title`, `abstract`, `authors`, `conference`, and `all`
+- `--abstract` is kept for compatibility: when `--field` is not given, it makes `abstract` the default search field (otherwise the default is `title`)
+
+Raw source material is cached under:
+
+- `top4grep/data/raw/dblp/html/`
+- `top4grep/data/raw/dblp/api/`
+- `top4grep/data/raw/publisher_html/`
+- `top4grep/data/raw/openalex/`
+- `top4grep/data/raw/semantic_scholar/`
+- `top4grep/data/raw/pdf/`
+
+The bundled cache installs into the same `top4grep/data/` location, so the normal build pipeline automatically reuses it.
 
 ## Screenshot
 ![screenshot](https://raw.githubusercontent.com/Kyle-Kyle/top4grep/master/img/screenshot.png)
 
-## TODO
-- [ ] grep in abstract
-- [ ] fuzzy match
-- [ ] complex search logic (`OR` operation)
+## Status
+- [x] grep in abstract
+- [x] fuzzy match
+- [x] complex search logic (`OR` operation)
diff --git a/setup.cfg b/setup.cfg
index ca2d8c1..e663f3d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,7 +18,6 @@ install_requires =
     colorlog
     beautifulsoup4
     nltk
-    selenium
     requests
 python_requires = >= 3.8
 packages = find:
diff --git a/tests/test_abstract.py b/tests/test_abstract.py
new file mode 100644
index 0000000..738d0b5
--- /dev/null
+++ b/tests/test_abstract.py
@@ -0,0 +1,194 @@
+import unittest
+from requests import RequestException
+from unittest.mock import patch
+
+from bs4 import BeautifulSoup
+from top4grep.abstract import (
+    AbstractCCS,
+    extract_abstract_from_pdf_text,
+    extract_doi,
+    find_pdf_url_in_soup,
+    get_openalex_abstract_for_title,
+    get_openalex_abstract,
+    get_semantic_scholar_abstract_for_title,
+    normalize_abstract,
+    normalize_source_url,
+    normalize_title_key,
+    reconstruct_abstract,
+)
+
+
+class AbstractTests(unittest.TestCase):
+    def test_extract_doi_from_url(self):
+        doi = extract_doi("https://doi.org/10.1145/3576915.3616615")
+        self.assertEqual(doi, "10.1145/3576915.3616615")
+
+    def test_reconstruct_openalex_abstract(self):
+        text = reconstruct_abstract(
+            {
+                "hello": [0],
+                "world": [1],
+                "again": [2],
+            }
+        )
+        self.assertEqual(text, "hello world again")
+
+    def test_normalize_abstract_collapses_whitespace(self):
+        text = normalize_abstract("First line\n\n  Second   line  ")
+        self.assertEqual(text, "First line\nSecond line")
+
+    def test_normalize_abstract_unescapes_html_entities(self):
+        text = normalize_abstract("&quot;Quoted&quot;  text")
+        self.assertEqual(text, '"Quoted" text')
+
+    def test_normalize_title_key_ignores_entities_and_punctuation(self):
+        key = normalize_title_key('&quot;AttackGNN&quot;: Red-Teaming GNNs!')
+        self.assertEqual(key, "attackgnnredteaminggnns")
+
+    def test_normalize_title_key_strips_html_markup_and_diacritics(self):
+        key = normalize_title_key("<i>Avara:</i> Voge\u0308le and \u03bcCFI")
+        self.assertEqual(key, "avaravogeleandcfi")
+
+    def test_extract_abstract_from_pdf_text(self):
+        text = extract_abstract_from_pdf_text(
+            "Paper Title\nAuthors\nAbstract\nThis is the abstract.\n1 Introduction\nBody"
+        )
+        self.assertEqual(text, "This is the abstract.")
+
+    def test_find_pdf_url_in_soup_prefers_pdf_links(self):
+        soup = BeautifulSoup(
+            '<html><head><meta name="citation_pdf_url" content="/paper.pdf"></head><body></body></html>',
+            "html.parser",
+        )
+
+        self.assertEqual(
+            find_pdf_url_in_soup("https://example.com/paper", soup),
+            "https://example.com/paper.pdf",
+        )
+
+    def test_normalize_source_url_rewrites_legacy_ndss_pdf_host(self):
+        self.assertEqual(
+            normalize_source_url(
+                "http://wp.internetsociety.org/ndss/wp-content/uploads/sites/25/2017/09/paper.pdf"
+            ),
+            "https://www.ndss-symposium.org/wp-content/uploads/2017/09/paper.pdf",
+        )
+
+    def test_ccs_falls_back_to_non_boilerplate_meta_description(self):
+        soup = BeautifulSoup(
+            '<html><head><meta property="og:description" content="A real page summary."></head></html>',
+            "html.parser",
+        )
+        with patch("top4grep.abstract.resolve_doi", return_value=""), \
+             patch("top4grep.abstract.load_publisher_soup", return_value=soup):
+            text = AbstractCCS().get_abstract_from_publisher("https://example.com/paper", [])
+
+        self.assertEqual(text, "A real page summary.")
+
+    @patch("top4grep.abstract.cached_get_json", side_effect=RequestException("dns failed"))
+    def test_openalex_request_errors_return_empty_abstract(self, _cached_get_json):
+        self.assertEqual(get_openalex_abstract("10.1145/3658644.3670278"), "")
+
+    @patch("top4grep.abstract.cached_get_json")
+    def test_openalex_title_search_returns_exact_title_match(self, cached_get_json):
+        cached_get_json.return_value = {
+            "results": [
+                {
+                    "display_name": "AttackGNN: Red-Teaming GNNs in Hardware Security Using Reinforcement Learning",
+                    "abstract_inverted_index": {
+                        "hello": [0],
+                        "world": [1],
+                    },
+                }
+            ]
+        }
+
+        text = get_openalex_abstract_for_title(
+            "AttackGNN: Red-Teaming GNNs in Hardware Security Using Reinforcement Learning"
+        )
+
+        self.assertEqual(text, "hello world")
+
+    @patch("top4grep.abstract.cached_get_json")
+    def test_semantic_scholar_title_search_returns_exact_title_match(self, cached_get_json):
+        cached_get_json.return_value = {
+            "data": [
+                {
+                    "title": "Avara: A Uniform Evaluation System for Perceptibility Analysis Against Adversarial Object Evasion Attacks",
+                    "authors": [{"name": "Xinyao Ma"}],
+                    "abstract": "Semantic Scholar abstract",
+                }
+            ]
+        }
+
+        text = get_semantic_scholar_abstract_for_title(
+            "Avara: A Uniform Evaluation System for Perceptibility Analysis Against Adversarial Object Evasion Attacks.",
+            ["Xinyao Ma"],
+        )
+
+        self.assertEqual(text, "Semantic Scholar abstract")
+
+    @patch("top4grep.abstract.load_publisher_soup")
+    @patch("top4grep.abstract.get_openalex_abstract")
+    def test_ccs_doi_urls_use_openalex_before_publisher_html(self, get_openalex_abstract, load_publisher_soup):
+        get_openalex_abstract.return_value = "abstract from openalex"
+
+        text = AbstractCCS().get_abstract_from_publisher("https://doi.org/10.1145/3658644.3670278", [])
+
+        self.assertEqual(text, "abstract from openalex")
+        load_publisher_soup.assert_not_called()
+        get_openalex_abstract.assert_called_once_with("10.1145/3658644.3670278")
+
+    @patch("top4grep.abstract.load_publisher_soup")
+    @patch("top4grep.abstract.get_openalex_abstract")
+    def test_ccs_falls_back_to_publisher_html_when_openalex_is_empty(self, get_openalex_abstract, load_publisher_soup):
+        get_openalex_abstract.return_value = ""
+        load_publisher_soup.return_value.find.return_value.get_text.return_value = "publisher abstract"
+
+        text = AbstractCCS().get_abstract_from_publisher("https://doi.org/10.1145/3658644.3670278", [])
+
+        self.assertEqual(text, "publisher abstract")
+        load_publisher_soup.assert_called_once()
+
+    @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="")
+    @patch("top4grep.abstract.get_semantic_scholar_abstract", return_value="semantic scholar abstract")
+    @patch("top4grep.abstract.resolve_doi", side_effect=AssertionError("resolve_doi should not be retried"))
+    @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="")
+    def test_direct_doi_urls_use_semantic_scholar_before_title_search(self, _title_fallback, _resolve_doi, _semantic_doi_fallback, _get_abstract_from_publisher):
+        text = AbstractCCS().get_abstract_from_url(
+            "https://doi.org/10.1145/3658644.3670278",
+            "example title",
+            [],
+        )
+
+        self.assertEqual(text, "semantic scholar abstract")
+
+    @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="")
+    @patch("top4grep.abstract.resolve_doi", side_effect=RuntimeError("publisher fetch failed"))
+    @patch("top4grep.abstract.get_semantic_scholar_abstract_for_title", return_value="semantic scholar title fallback")
+    @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="title fallback abstract")
+    def test_publisher_resolution_failures_fall_back_to_title_search(self, _title_fallback, _semantic_title_fallback, _resolve_doi, _get_abstract_from_publisher):
+        text = AbstractCCS().get_abstract_from_url(
+            "https://example.com/paper",
+            "example title",
+            [],
+        )
+
+        self.assertEqual(text, "title fallback abstract")
+
+    @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="")
+    @patch("top4grep.abstract.resolve_doi", side_effect=RuntimeError("publisher fetch failed"))
+    @patch("top4grep.abstract.get_semantic_scholar_abstract_for_title", return_value="semantic scholar title fallback")
+    @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="")
+    def test_title_search_falls_back_to_semantic_scholar_when_openalex_is_empty(self, _title_fallback, _semantic_title_fallback, _resolve_doi, _get_abstract_from_publisher):
+        text = AbstractCCS().get_abstract_from_url(
+            "https://example.com/paper",
+            "example title",
+            [],
+        )
+
+        self.assertEqual(text, "semantic scholar title fallback")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_build_db.py b/tests/test_build_db.py
new file mode 100644
index 0000000..d22a769
--- /dev/null
+++ b/tests/test_build_db.py
@@ -0,0 +1,217 @@
+import unittest
+from unittest.mock import patch
+from types import SimpleNamespace
+
+import sqlalchemy
+from sqlalchemy.orm import sessionmaker
+from requests import RequestException
+
+import top4grep.build_db as build_db
+from top4grep.db import Base, Paper
+
+
+class BuildDbTests(unittest.TestCase):
+    def test_normalize_toc_api_records_unescapes_titles_and_authors(self):
+        hits = [
+            {
+                "info": {
+                    "title": "&quot;Do Anything Now&quot;: Testing &amp; Evaluation.",
+                    "authors": {
+                        "author": [
+                            {"text": "Alice &amp; Bob"},
+                            {"text": "Carol"},
+                        ]
+                    },
+                    "ee": "https://doi.org/10.1145/3658644.3670278",
+                }
+            }
+        ]
+
+        records = build_db.normalize_toc_api_records(hits)
+
+        self.assertEqual(records[0]["title"], '"Do Anything Now": Testing & Evaluation.')
+        self.assertEqual(records[0]["authors"], ["Alice & Bob", "Carol"])
+
+    def test_normalize_toc_api_records_skips_front_matter_entries(self):
+        hits = [
+            {"info": {"title": "Conference Organizers.", "authors": {"author": []}, "ee": ""}},
+            {"info": {"title": "Poster: A Short Result.", "authors": {"author": [{"text": "Alice"}]}, "ee": ""}},
+            {"info": {"title": "Real Paper.", "authors": {"author": [{"text": "Bob"}]}, "ee": ""}},
+        ]
+
+        records = build_db.normalize_toc_api_records(hits)
+
+        self.assertEqual([record["title"] for record in records], ["Real Paper."])
+
+    def test_should_skip_record_detects_slides_and_keynotes(self):
+        self.assertTrue(build_db.should_skip_record("Keynote Address: Back to the Future.", ""))
+        self.assertTrue(build_db.should_skip_record("Interesting Topic.", "https://www.usenix.org/slides/talk.pdf"))
+        self.assertFalse(build_db.should_skip_record("Interesting Topic.", "https://example.com/paper.pdf"))
+
+    def test_migrate_database_normalizes_and_deduplicates_titles(self):
+        engine = sqlalchemy.create_engine("sqlite:///:memory:")
+        Base.metadata.create_all(engine)
+        test_session = sessionmaker(bind=engine)
+
+        with test_session.begin() as session:
+            session.add_all(
+                [
+                    Paper(
+                        conference="CCS",
+                        year=2024,
+                        title='&quot;Do Anything Now&quot;: Characterizing Jailbreak Prompts.',
+                        authors="Alice",
+                        abstract="",
+                    ),
+                    Paper(
+                        conference="CCS",
+                        year=2024,
+                        title='"Do Anything Now": Characterizing Jailbreak Prompts.',
+                        authors="Alice",
+                        abstract="Useful abstract",
+                    ),
+                ]
+            )
+
+        with patch.object(build_db, "Session", test_session):
+            normalized_rows, duplicate_rows = build_db.migrate_database()
+
+        self.assertGreaterEqual(normalized_rows, 1)
+        self.assertEqual(duplicate_rows, 1)
+
+        with test_session() as session:
+            rows = session.query(Paper).all()
+
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0].title, '"Do Anything Now": Characterizing Jailbreak Prompts.')
+        self.assertEqual(rows[0].abstract, "Useful abstract")
+
+    def test_prune_stale_papers_removes_rows_missing_from_latest_source(self):
+        engine = sqlalchemy.create_engine("sqlite:///:memory:")
+        Base.metadata.create_all(engine)
+        test_session = sessionmaker(bind=engine)
+
+        with test_session.begin() as session:
+            session.add_all(
+                [
+                    Paper(conference="CCS", year=2024, title="Current Title", authors="Alice", abstract=""),
+                    Paper(conference="CCS", year=2024, title="Stale Title", authors="Bob", abstract=""),
+                ]
+            )
+
+        with patch.object(build_db, "Session", test_session):
+            removed = build_db.prune_stale_papers("CCS", 2024, {"Current Title"})
+
+        self.assertEqual(removed, 1)
+        with test_session() as session:
+            rows = session.query(Paper).order_by(Paper.title).all()
+
+        self.assertEqual([row.title for row in rows], ["Current Title"])
+
+    @patch.object(build_db, "TOC_API_HOSTS", ["https://one.example", "https://two.example"])
+    @patch("top4grep.build_db.cached_get_json")
+    def test_fetch_toc_api_records_retries_other_hosts_for_current_year(self, cached_get_json):
+        cached_get_json.side_effect = [
+            RequestException("first host timeout"),
+            {"result": {"hits": {"hit": [{"info": {"title": "Paper"}}]}}},
+        ]
+
+        hits = build_db.fetch_toc_api_records("ccs", 2026)
+
+        self.assertEqual(len(hits), 1)
+        self.assertEqual(cached_get_json.call_count, 2)
+
+    @patch.object(build_db, "TOC_API_HOSTS", ["https://one.example", "https://two.example"])
+    @patch("top4grep.build_db.cached_get_json", side_effect=RequestException("network down"))
+    def test_fetch_toc_api_records_raises_when_all_hosts_fail(self, _cached_get_json):
+        with self.assertRaises(RequestException):
+            build_db.fetch_toc_api_records("ccs", 2026)
+
+    def test_get_papers_continues_when_one_record_extraction_fails(self):
+        engine = sqlalchemy.create_engine("sqlite:///:memory:")
+        Base.metadata.create_all(engine)
+        test_session = sessionmaker(bind=engine)
+
+        with test_session.begin() as session:
+            session.add_all(
+                [
+                    Paper(conference="USENIX", year=2024, title="Broken Paper", authors="Alice", abstract=""),
+                    Paper(conference="USENIX", year=2024, title="Recoverable Paper", authors="Bob", abstract=""),
+                ]
+            )
+
+        records = [
+            {"title": "Broken Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="broken")},
+            {"title": "Recoverable Paper", "authors": ["Bob"], "paper_html": SimpleNamespace(marker="ok")},
+        ]
+
+        def fake_get_abstract(paper_html, title, authors):
+            if paper_html.marker == "broken":
+                raise RuntimeError("boom")
+            return "Recovered abstract"
+
+        with patch.object(build_db, "Session", test_session), \
+             patch.object(build_db, "load_records", return_value=records), \
+             patch.object(build_db, "prune_stale_papers", return_value=0), \
+             patch.dict(build_db.Abstracts, {"USENIX": SimpleNamespace(get_abstract=fake_get_abstract)}):
+            processed, changed = build_db.get_papers("USENIX", 2024, True)
+
+        self.assertEqual(processed, 2)
+        self.assertEqual(changed, 1)
+        with test_session() as session:
+            recoverable = session.query(Paper).filter(Paper.title == "Recoverable Paper").one()
+            broken = session.query(Paper).filter(Paper.title == "Broken Paper").one()
+
+        self.assertEqual(recoverable.abstract, "Recovered abstract")
+        self.assertEqual(broken.abstract, "")
+
+    def test_ndss_legacy_years_still_attempt_abstract_fallbacks(self):
+        engine = sqlalchemy.create_engine("sqlite:///:memory:")
+        Base.metadata.create_all(engine)
+        test_session = sessionmaker(bind=engine)
+
+        records = [
+            {"title": "Legacy NDSS Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="legacy")},
+        ]
+
+        with patch.object(build_db, "Session", test_session), \
+             patch.object(build_db, "load_records", return_value=records), \
+             patch.object(build_db, "prune_stale_papers", return_value=0), \
+             patch.dict(build_db.Abstracts, {"NDSS": SimpleNamespace(get_abstract=lambda *_args: "Recovered abstract")}):
+            processed, changed = build_db.get_papers("NDSS", 2016, True)
+
+        self.assertEqual(processed, 1)
+        self.assertEqual(changed, 1)
+        with test_session() as session:
+            row = session.query(Paper).filter(Paper.title == "Legacy NDSS Paper").one()
+
+        self.assertEqual(row.abstract, "Recovered abstract")
+
+    def test_abstract_build_keeps_rows_without_recoverable_abstracts(self):
+        engine = sqlalchemy.create_engine("sqlite:///:memory:")
+        Base.metadata.create_all(engine)
+        test_session = sessionmaker(bind=engine)
+
+        with test_session.begin() as session:
+            session.add(Paper(conference="CCS", year=2024, title="Unrecoverable Paper", authors="Alice", abstract=""))
+
+        records = [
+            {"title": "Unrecoverable Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="missing")},
+        ]
+
+        with patch.object(build_db, "Session", test_session), \
+             patch.object(build_db, "load_records", return_value=records), \
+             patch.dict(build_db.Abstracts, {"CCS": SimpleNamespace(get_abstract=lambda *_args: "")}):
+            processed, changed = build_db.get_papers("CCS", 2024, True)
+
+        self.assertEqual(processed, 1)
+        self.assertEqual(changed, 0)
+        with test_session() as session:
+            row = session.query(Paper).one()
+
+        self.assertEqual(row.title, "Unrecoverable Paper")
+        self.assertEqual(row.abstract, "")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..7d13222
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,84 @@
+import tempfile
+import unittest
+import zipfile
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+from top4grep.bundle import create_cache_bundle, install_cache_bundle
+from top4grep.cache import cached_get_json, cached_get_text
+
+
+class CacheTests(unittest.TestCase):
+    def test_cached_get_json_negative_caches_404(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_path = Path(tmpdir) / "missing.json"
+            response = Mock(status_code=404)
+
+            with patch("top4grep.cache.HTTP.get", return_value=response) as mock_get:
+                self.assertIsNone(cached_get_json("https://example.com/missing.json", cache_path))
+                self.assertIsNone(cached_get_json("https://example.com/missing.json", cache_path))
+
+            mock_get.assert_called_once()
+
+    def test_cached_get_text_negative_caches_404(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            cache_path = Path(tmpdir) / "missing.html"
+            response = Mock(status_code=404)
+
+            with patch("top4grep.cache.HTTP.get", return_value=response) as mock_get:
+                self.assertIsNone(cached_get_text("https://example.com/missing.html", cache_path))
+                self.assertIsNone(cached_get_text("https://example.com/missing.html", cache_path))
+
+            mock_get.assert_called_once()
+
+    def test_create_cache_bundle_includes_db_and_raw_cache(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            data_dir = Path(tmpdir) / "data"
+            raw_dir = data_dir / "raw" / "openalex"
+            raw_dir.mkdir(parents=True)
+            (data_dir / "papers.db").write_text("db", encoding="utf-8")
+            (raw_dir / "paper.json").write_text("{}", encoding="utf-8")
+            bundle_path = Path(tmpdir) / "bundle.zip"
+
+            result = create_cache_bundle(bundle_path, data_dir=data_dir)
+
+            self.assertEqual(result["file_count"], 2)
+            with zipfile.ZipFile(bundle_path, "r") as bundle:
+                names = set(bundle.namelist())
+
+            self.assertIn("manifest.json", names)
+            self.assertIn("papers.db", names)
+            self.assertIn("raw/openalex/paper.json", names)
+
+    def test_install_cache_bundle_restores_files(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            source_dir = Path(tmpdir) / "source"
+            source_raw = source_dir / "raw" / "publisher_html"
+            source_raw.mkdir(parents=True)
+            (source_dir / "papers.db").write_text("db", encoding="utf-8")
+            (source_raw / "page.html").write_text("<html></html>", encoding="utf-8")
+            bundle_path = Path(tmpdir) / "bundle.zip"
+            create_cache_bundle(bundle_path, data_dir=source_dir)
+
+            install_dir = Path(tmpdir) / "install"
+            result = install_cache_bundle(bundle_path, data_dir=install_dir)
+
+            self.assertEqual(result["extracted_files"], 2)
+            self.assertEqual((install_dir / "papers.db").read_text(encoding="utf-8"), "db")
+            self.assertEqual(
+                (install_dir / "raw" / "publisher_html" / "page.html").read_text(encoding="utf-8"),
+                "<html></html>",
+            )
+
+    def test_install_cache_bundle_rejects_path_traversal(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            bundle_path = Path(tmpdir) / "bundle.zip"
+            with zipfile.ZipFile(bundle_path, "w") as bundle:
+                bundle.writestr("../escape.txt", "bad")
+
+            with self.assertRaises(ValueError):
+                install_cache_bundle(bundle_path, data_dir=Path(tmpdir) / "install")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_search.py b/tests/test_search.py
new file mode 100644
index 0000000..292a3f0
--- /dev/null
+++ b/tests/test_search.py
@@ -0,0 +1,56 @@
+import unittest
+from types import SimpleNamespace
+
+from top4grep.search import paper_matches, paper_sort_key, parse_query
+
+
+def make_paper(**kwargs):
+    defaults = {
+        "title": "Exploiting Linux Kernel Drivers",
+        "abstract": "This paper studies exploit chains in kernel subsystems.",
+        "authors": "Alice Example, Bob Example",
+        "conference": "CCS",
+        "year": 2024,
+    }
+    defaults.update(kwargs)
+    return SimpleNamespace(**defaults)
+
+
+class SearchTests(unittest.TestCase):
+    def test_and_query_matches_title(self):
+        paper = make_paper()
+        self.assertTrue(paper_matches(paper, parse_query("linux,exploit"), "title"))
+
+    def test_or_query_matches_title(self):
+        paper = make_paper()
+        self.assertTrue(paper_matches(paper, parse_query("kernel|driver"), "title"))
+
+    def test_stemming_matches_variants(self):
+        paper = make_paper(title="Exploitability Signals for Linux Drivers")
+        self.assertTrue(paper_matches(paper, parse_query("exploiting"), "title"))
+
+    def test_abstract_field_can_match_without_title_hit(self):
+        paper = make_paper(title="Artifact-Centric Analysis", abstract="Dependency risk in package supply chains.")
+        self.assertTrue(paper_matches(paper, parse_query("dependency,supply"), "abstract"))
+
+    def test_all_field_can_match_across_non_title_content(self):
+        paper = make_paper(title="Artifact-Centric Analysis", abstract="Dependency risk in package supply chains.")
+        self.assertTrue(paper_matches(paper, parse_query("dependency,supply"), "all"))
+
+    def test_missing_term_rejects_match(self):
+        paper = make_paper()
+        self.assertFalse(paper_matches(paper, parse_query("windows"), "title"))
+
+    def test_sort_order_matches_original_cli_behavior(self):
+        papers = [
+            make_paper(conference="NDSS", title="ndss"),
+            make_paper(conference="IEEE S&P", title="sp"),
+            make_paper(conference="USENIX", title="usenix"),
+            make_paper(conference="CCS", title="ccs"),
+        ]
+        ordered = sorted(papers, key=paper_sort_key)
+        self.assertEqual([paper.conference for paper in ordered], ["CCS", "USENIX", "IEEE S&P", "NDSS"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/top4grep/__main__.py b/top4grep/__main__.py
index 9a97ceb..6a6f82f 100644
--- a/top4grep/__main__.py
+++ b/top4grep/__main__.py
@@ -1,66 +1,40 @@
+import argparse
+from datetime import datetime
+from pathlib import Path
 
 import sqlalchemy
 from sqlalchemy.orm import sessionmaker
-from nltk import download, word_tokenize
-from nltk.data import find
-from nltk.stem import PorterStemmer
 
+from .bundle import create_cache_bundle, install_cache_bundle
 from .db import Base, Paper
-from .build_db import build_db, DB_PATH
+from .build_db import DB_PATH, build_db, migrate_database
+from .search import SEARCH_FIELDS, paper_matches, paper_sort_key, parse_query
 from .utils import new_logger
-import argparse
-
-
 
 engine = sqlalchemy.create_engine(f'sqlite:///{str(DB_PATH)}')
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
 logger = new_logger("Top4Grep")
-stemmer = PorterStemmer()
 
 CONFERENCES = ["NDSS", "IEEE S&P", "USENIX", "CCS"]
 
-# Function to check and download 'punkt' if not already available
-def check_and_download_punkt():
-    try:
-        # Check if 'punkt' is available, this will raise a LookupError if not found
-        find('tokenizers/punkt')
-        #print("'punkt' tokenizer models are already installed.")
-    except LookupError:
-        print("'punkt' tokenizer models not found. Downloading...")
-        # Download 'punkt' tokenizer models
-        download('punkt')
-        
-# trim word tokens from tokenizer to stem i.e. exploiting to exploit
-def fuzzy_match(title):
-    tokens = word_tokenize(title)
-    return [stemmer.stem(token) for token in tokens]
-
-def existed_in_tokens(tokens, keywords):
-    return all(map(lambda k: stemmer.stem(k.lower()) in tokens, keywords))
-
-def grep(keywords, abstract):
-    # TODO: currently we only grep either from title or from abstract, also grep from other fields in the future maybe?
-    if abstract:
-        constraints = [Paper.abstract.contains(x) for x in keywords]
-        with Session() as session:
-            papers = session.query(Paper).filter(*constraints).all()
-        filter_paper = filter(lambda p: existed_in_tokens(fuzzy_match(p.abstract.lower()), keywords), papers)
-    else:
-        constraints = [Paper.title.contains(x) for x in keywords]
-        with Session() as session:
-            papers = session.query(Paper).filter(*constraints).all()
-        #check whether whether nltk tokenizer data is downloaded
-        check_and_download_punkt()
-        #tokenize the title and filter out the substring matches
-        filter_paper = []
-        for paper in papers:
-            if all([stemmer.stem(x.lower()) in fuzzy_match(paper.title.lower()) for x in keywords]):
-                filter_paper.append(paper)
-    # perform customized sorthing
-    papers = sorted(filter_paper, key=lambda paper: paper.year + CONFERENCES.index(paper.conference)/10, reverse=True)
-    return papers
+
+def grep(query, field, conferences=None, year_from=2000, year_to=None):
+    query_groups = parse_query(query)
+
+    with Session() as session:
+        paper_query = session.query(Paper)
+        if conferences:
+            paper_query = paper_query.filter(Paper.conference.in_(conferences))
+        if year_from is not None:
+            paper_query = paper_query.filter(Paper.year >= year_from)
+        if year_to is not None:
+            paper_query = paper_query.filter(Paper.year <= year_to)
+        papers = paper_query.all()
+
+    filtered = [paper for paper in papers if paper_matches(paper, query_groups, field)]
+    return sorted(filtered, key=paper_sort_key)
 
 
 def show_papers(papers):
@@ -71,26 +45,84 @@ def show_papers(papers):
 def main():
     parser = argparse.ArgumentParser(description='Scripts to query the paper database',
                                      usage="%(prog)s [options] -k <keywords>")
-    parser.add_argument('-k', type=str, help="keywords to grep, separated by ','. For example, 'linux,kernel,exploit'", default='')
+    parser.add_argument('-k', type=str, help="keywords to grep. Use ',' for AND and '|' for OR, for example 'linux,kernel|driver'", default='')
     parser.add_argument('--build-db', action="store_true", help="Builds the database of conference papers")
-    parser.add_argument('--abstract', action="store_true", help="Involve abstract into the database's building or query (Need Chrome for building)")
+    parser.add_argument('--abstract', action="store_true", help="During builds, fetch abstracts. During queries, search abstracts unless --field is provided.")
+    parser.add_argument('--field', choices=SEARCH_FIELDS, help="Field to search for queries. Default is 'title' unless --abstract is set.")
+    parser.add_argument('--conference', action='append', choices=CONFERENCES, help="Restrict builds or queries to one or more conferences. May be repeated.")
+    parser.add_argument('--year-from', type=int, default=2000, help="Inclusive lower year bound for builds or queries.")
+    parser.add_argument('--year-to', type=int, default=datetime.now().year, help="Inclusive upper year bound for builds or queries.")
+    parser.add_argument('--export-cache-bundle', type=Path, help="Create a single zip bundle containing papers.db and all cached raw fetches.")
+    parser.add_argument('--install-cache-bundle', type=Path, help="Install a previously exported cache bundle into the package data directory.")
+    parser.add_argument('--replace-data', action='store_true', help="When installing a cache bundle, replace existing papers.db and raw cache first.")
     args = parser.parse_args()
 
+    if args.year_from > args.year_to:
+        parser.error("--year-from must be less than or equal to --year-to")
+    if args.replace_data and not args.install_cache_bundle:
+        parser.error("--replace-data can only be used with --install-cache-bundle")
+    # Bundle export/install are standalone: they exclude each other and cannot be combined with a query or --build-db.
+    special_actions = [bool(args.export_cache_bundle), bool(args.install_cache_bundle)]
+    if sum(special_actions) > 1:
+        parser.error("--export-cache-bundle and --install-cache-bundle are mutually exclusive")
+    if any(special_actions) and (args.k or args.build_db):
+        parser.error("cache bundle operations cannot be combined with queries or --build-db")
+    # Bundle actions run and return before any query or build handling.
+    if args.export_cache_bundle:
+        result = create_cache_bundle(args.export_cache_bundle)
+        print(
+            f"Created cache bundle at {result['output_path']} "
+            f"({result['archive_bytes']} bytes, {result['file_count']} files)."
+        )
+        return
+    if args.install_cache_bundle:
+        result = install_cache_bundle(args.install_cache_bundle, replace=args.replace_data)
+        normalized_rows, duplicate_rows = migrate_database()
+        print(
+            f"Installed cache bundle from {result['bundle_path']} into {result['data_dir']} "
+            f"({result['extracted_files']} files)."
+        )
+        if normalized_rows or duplicate_rows:
+            print(f"Normalized {normalized_rows} rows and removed {duplicate_rows} duplicates after install.")
+        return
+    # A non-empty -k takes precedence over --build-db (if/elif below).
     if args.k:
-        assert DB_PATH.exists(), f"need to build a paper database first to perform wanted queries"
-        keywords = [x.strip() for x in args.k.split(',')]
-        if keywords:
-            logger.info("Grep based on the following keywords: %s", ', '.join(keywords))
+        if not DB_PATH.exists():
+            parser.error("need to build a paper database first to perform queries")
+        normalized_rows, duplicate_rows = migrate_database()
+        if normalized_rows or duplicate_rows:
+            logger.info(
+                "Normalized %s records and removed %s duplicates.",
+                normalized_rows,
+                duplicate_rows,
+            )
+        field = args.field or ('abstract' if args.abstract else 'title')
+        query_groups = parse_query(args.k)
+        if query_groups:
+            logger.info("Query: %s", args.k)
+            logger.info("Field: %s", field)
         else:
             logger.warning("No keyword is provided. Return all the papers.")
 
-        papers = grep(keywords, args.abstract)
+        papers = grep(
+            args.k,
+            field=field,
+            conferences=args.conference,
+            year_from=args.year_from,
+            year_to=args.year_to,
+        )
         logger.debug(f"Found {len(papers)} papers")
 
         show_papers(papers)
     elif args.build_db:
         print("Building db...")
-        build_db(args.abstract)
+        processed, added = build_db(
+            args.abstract,
+            conferences=args.conference,
+            start_year=args.year_from,
+            end_year=args.year_to,
+        )
+        print(f"Build complete. Processed {processed} papers and changed {added} records.")
 
 
 if __name__ == "__main__":
diff --git a/top4grep/abstract.py b/top4grep/abstract.py
index 70321a6..b51117c 100644
--- a/top4grep/abstract.py
+++ b/top4grep/abstract.py
@@ -1,37 +1,495 @@
-"""
-Test: python3 -m top4grep.abstract
-"""
+import os
 import re
-import requests
+import shutil
+import subprocess
+import tempfile
+import unicodedata
 from abc import ABC, abstractmethod
 from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException
-from urllib.parse import urlparse, urlunparse
+from html import unescape
+from urllib.parse import unquote, urljoin, urlparse
+
+import requests
 
+from .cache import DATA_DIR, cache_key_path, cached_get_bytes, cached_get_json, cached_get_text
 from .utils import new_logger
 
 logger = new_logger('PaperAbstract')
 logger.setLevel('WARNING')
 
+RAW_PUBLISHER_DIR = DATA_DIR / "raw" / "publisher_html"
+RAW_PDF_DIR = DATA_DIR / "raw" / "pdf"
+RAW_OPENALEX_DIR = DATA_DIR / "raw" / "openalex"
+RAW_OPENALEX_SEARCH_DIR = RAW_OPENALEX_DIR / "title_search"
+RAW_SEMANTIC_SCHOLAR_DIR = DATA_DIR / "raw" / "semantic_scholar"
+RAW_SEMANTIC_SCHOLAR_SEARCH_DIR = RAW_SEMANTIC_SCHOLAR_DIR / "title_search"
+DOI_PATTERN = re.compile(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+")
+
+
+def normalize_abstract(text):
+    # Unescape HTML entities, collapse whitespace runs per line, and drop lines left empty.
+    source_lines = text.splitlines() if text else []
+    cleaned = (re.sub(r"\s+", " ", unescape(raw)).strip() for raw in source_lines)
+    return "\n".join(entry for entry in cleaned if entry)
+
+
+def extract_doi(value):
+    # Percent-decode first so DOIs embedded in encoded URLs are still found.
+    found = DOI_PATTERN.search(unquote(value)) if value else None
+    if found is None:
+        return ""
+    # Trailing ")", ".", ",", ";" are stripped from the match before it is returned.
+    return found.group(0).rstrip(").,;")
+
+
+def reconstruct_abstract(inverted_index):
+    # Rebuild running text from a {word: [positions]} mapping; "" for an empty or missing mapping.
+    if not inverted_index:
+        return ""
+    placed = [
+        (slot, term) for term, slots in inverted_index.items() for slot in slots
+    ]
+    placed.sort()
+    return " ".join(term for _, term in placed)
+
+
+def normalize_title_key(title):
+    # Comparison key: unescape, strip tags and accents, casefold, keep only [a-z0-9].
+    if not title:
+        return ""
+    untagged = re.sub(r"<[^>]+>", " ", unescape(title))
+    base_chars = [c for c in unicodedata.normalize("NFKD", untagged) if not unicodedata.combining(c)]
+    return re.sub(r"[^a-z0-9]+", "", "".join(base_chars).casefold())
+
+
+def publisher_cache_path(url):
+    # The URL's host (or "publisher" when it has none) is passed to cache_key_path as the hint.
+    host_hint = urlparse(url).netloc or "publisher"
+    return cache_key_path(RAW_PUBLISHER_DIR, url, ".html", host_hint)
+
+
+def pdf_cache_path(url):
+    # The URL's host (or "paper" when it has none) is passed to cache_key_path as the hint.
+    host_hint = urlparse(url).netloc or "paper"
+    return cache_key_path(RAW_PDF_DIR, url, ".pdf", host_hint)
+
+
+def is_pdf_url(url):
+    # True only when the URL's path component ends in ".pdf" (case-insensitive); query strings are ignored.
+    path = urlparse(url).path if url else ""
+    return path.lower().endswith(".pdf")
+
+
+def normalize_source_url(url):
+    # Trim the URL and map legacy wp.internetsociety.org NDSS upload links (http or https) onto ndss-symposium.org.
+    if not url:
+        return ""
+    normalized = url.strip()
+    legacy_prefixes = (
+        "http://wp.internetsociety.org/ndss/wp-content/uploads/sites/25/",
+        "https://wp.internetsociety.org/ndss/wp-content/uploads/sites/25/",
+    )
+    for legacy_prefix in legacy_prefixes:
+        if normalized.startswith(legacy_prefix):
+            return normalized.replace(
+                legacy_prefix,
+                "https://www.ndss-symposium.org/wp-content/uploads/",
+                1,
+            )
+    return normalized
+
+
+def fetch_publisher_html(url):  # Page text for `url` via cached_get_text; None when url is empty.
+    if not url:
+        return None
+    url = normalize_source_url(url)  # the normalized URL is used for both the request and the cache path
+    return cached_get_text(url, publisher_cache_path(url))
+
+
+def fetch_pdf_bytes(url):  # Raw PDF bytes for `url` via cached_get_bytes; None when url is empty.
+    if not url:
+        return None
+    url = normalize_source_url(url)  # the normalized URL is used for both the request and the cache path
+    return cached_get_bytes(url, pdf_cache_path(url))
+
+
+def load_publisher_soup(url):  # BeautifulSoup tree of the page at `url`, or None when no HTML was returned.
+    html = fetch_publisher_html(url)
+    if html is None:
+        return None
+    return BeautifulSoup(html, 'html.parser')
+
+
+def openalex_cache_path(doi):  # Cache path for an OpenAlex work lookup, keyed by DOI; "/" becomes "-" in the hint.
+    return cache_key_path(RAW_OPENALEX_DIR, doi, ".json", doi.replace("/", "-"))
+
+
+def openalex_title_search_cache_path(title):  # Cache path for an OpenAlex title search; the title is both key and hint.
+    return cache_key_path(RAW_OPENALEX_SEARCH_DIR, title, ".json", title)
+
+
+def semantic_scholar_cache_path(identifier):  # Cache path for a Semantic Scholar paper lookup; "/" becomes "-" in the hint.
+    return cache_key_path(RAW_SEMANTIC_SCHOLAR_DIR, identifier, ".json", identifier.replace("/", "-"))
+
+
+def semantic_scholar_title_search_cache_path(title):  # Cache path for a Semantic Scholar title search; the title is both key and hint.
+    return cache_key_path(RAW_SEMANTIC_SCHOLAR_SEARCH_DIR, title, ".json", title)
+
+
+def resolve_doi_from_html(url):  # First DOI found in the page's metadata or links; "" when the page has none.
+    soup = load_publisher_soup(url)
+    if soup is None:
+        return ""
+    # Metadata tags are collected first, so they outrank DOIs found in ordinary links.
+    candidates = []
+    for attrs in (
+        {"name": "citation_doi"},
+        {"name": "dc.Identifier"},
+        {"name": "dc.identifier"},
+        {"property": "og:url"},
+    ):
+        meta = soup.find("meta", attrs=attrs)
+        if meta and meta.get("content"):
+            candidates.append(meta["content"])
+    # Then every link target on the page, in the order find_all returns them.
+    for anchor in soup.find_all("a", href=True):
+        candidates.append(anchor["href"])
+    # The first candidate that contains a DOI wins.
+    for candidate in candidates:
+        doi = extract_doi(candidate)
+        if doi:
+            return doi
+    return ""
+
+
+def resolve_doi(url):  # DOI embedded in the URL itself if there is one, otherwise one scraped from the page.
+    url = normalize_source_url(url)
+    doi = extract_doi(url)
+    if doi:
+        return doi
+    return resolve_doi_from_html(url)  # loads the publisher page through load_publisher_soup
+
+
+def get_openalex_abstract(doi):  # Abstract text for a DOI from OpenAlex, or "" when unavailable.
+    if not doi:
+        return ""
+    # The work is addressed by its DOI URL; the cache path is derived from RAW_OPENALEX_DIR and the DOI.
+    url = f"https://api.openalex.org/works/https://doi.org/{doi}"
+    try:
+        data = cached_get_json(url, openalex_cache_path(doi))
+    except (requests.RequestException, ValueError) as e:
+        logger.debug(f"Failed to fetch OpenAlex abstract for DOI {doi}: {e}")
+        return ""
+    if not data:
+        return ""
+    return normalize_abstract(reconstruct_abstract(data.get("abstract_inverted_index", {})))  # word -> positions mapping
+
+
+def get_openalex_abstract_for_title(title, authors=None):  # Abstract of the OpenAlex work whose title matches exactly, or "".
+    if not title:
+        return ""
+    # NOTE(review): `authors` is accepted but not used here; only the title is compared.
+    url = requests.Request(
+        "GET",
+        "https://api.openalex.org/works",
+        params={"search": title, "per-page": 10},
+    ).prepare().url
+    try:
+        data = cached_get_json(url, openalex_title_search_cache_path(title))
+    except (requests.RequestException, ValueError) as e:
+        logger.debug(f"Failed to search OpenAlex by title for {title}: {e}")
+        return ""
+    if not data:
+        return ""
+    # Accept only results whose normalized title equals the requested one; take the first that has an abstract.
+    target_key = normalize_title_key(title)
+    for result in data.get("results", []):
+        if normalize_title_key(result.get("display_name")) != target_key:
+            continue
+        abstract = reconstruct_abstract(result.get("abstract_inverted_index", {}))
+        if abstract:
+            return normalize_abstract(abstract)
+    return ""
+
+
+def normalize_pdf_text(text):
+    # Turn form feeds into newlines, squeeze whitespace per line, drop empty lines.
+    if not text:
+        return ""
+    squeezed = (re.sub(r"\s+", " ", row).strip() for row in text.replace("\f", "\n").splitlines())
+    return "\n".join(row for row in squeezed if row)
+
+
+def extract_meta_description(soup):
+    # First usable description-like <meta> content on the page; "" when there is none.
+    if soup is None:
+        return ""
+    # Prefixes (compared casefolded) whose descriptions are skipped.
+    boilerplate_prefixes = (
+        "author(s):",
+        "download:",
+        "usenix is a nonprofit organization",
+    )
+    lookups = (
+        {"name": "citation_abstract"},
+        {"name": "description"},
+        {"property": "og:description"},
+    )
+    for lookup in lookups:
+        tag = soup.find("meta", attrs=lookup)
+        if not tag or not tag.get("content"):
+            continue
+        description = normalize_abstract(tag["content"])
+        if not description or description.casefold().startswith(boilerplate_prefixes):
+            continue
+        return description
+    return ""
+
+
+def extract_abstract_from_pdf_text(text):
+    # Pull the abstract out of extracted PDF text: everything after the first "abstract" line, up to the next heading.
+    normalized = normalize_pdf_text(text)
+    if not normalized:
+        return ""
+    lines = [line.strip() for line in normalized.splitlines() if line.strip()]
+    collecting = False
+    collected = []
+    # heading_key strips everything but lowercase letters/digits, e.g. "1. INTRODUCTION" -> "1introduction".
+    for line in lines:
+        heading_key = re.sub(r"[^a-z0-9]+", "", line.casefold())
+        if not collecting:
+            if "abstract" not in heading_key:
+                continue
+            collecting = True
+            content = re.sub(r"^\s*abstract\b[\s:.\-–—]*", "", line, flags=re.IGNORECASE).strip()
+            if content:
+                collected.append(content)
+            continue
+        # Headings that end the abstract; "ccsconcepts" covers the "CCS CONCEPTS" block that follows it in ACM papers.
+        if heading_key in {
+            "1introduction",
+            "iintroduction",
+            "introduction",
+            "keywords",
+            "indexterms",
+            "categoriesandsubjectdescriptors",
+            "ccsconcepts",
+        }:
+            break
+        if heading_key in {"1", "i"}:
+            continue
+        collected.append(line)
+    if collected:
+        return normalize_abstract(" ".join(collected))
+    return ""
+
+
+def extract_abstract_from_pdf_url(url):
+    # Fetch the PDF at `url` through the cache, run pdftotext on pages 1-2, and return the abstract or "".
+    if not url:
+        return ""
+    url = normalize_source_url(url)
+    # Optional dependency: without a pdftotext binary on PATH this source is skipped.
+    pdftotext = shutil.which("pdftotext")
+    if not pdftotext:
+        return ""
+    pdf_bytes = fetch_pdf_bytes(url)
+    if not pdf_bytes:
+        return ""
+    temp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
+            handle.write(pdf_bytes)
+            temp_path = handle.name
+        # Request UTF-8 output and decode it as UTF-8 so the text does not depend on the locale or pdftotext build.
+        result = subprocess.run(
+            [pdftotext, "-enc", "UTF-8", "-f", "1", "-l", "2", temp_path, "-"],
+            check=False,
+            capture_output=True,
+            encoding="utf-8",
+            errors="replace",
+            timeout=30,
+        )
+        if result.returncode != 0:
+            return ""
+        return extract_abstract_from_pdf_text(result.stdout)
+    except (OSError, subprocess.SubprocessError):
+        return ""
+    finally:
+        if temp_path:
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+
+
+def find_pdf_url_in_soup(base_url, soup):  # Absolute URL of the first PDF the page points to, or "".
+    if soup is None:
+        return ""
+    # Candidates in priority order: PDF-specific <meta> tags first, then links that look like PDFs.
+    candidates = []
+    for attrs in (
+        {"name": "citation_pdf_url"},
+        {"name": "wkhealth_pdf_url"},
+        {"property": "og:pdf"},
+    ):
+        meta = soup.find("meta", attrs=attrs)
+        if meta and meta.get("content"):
+            candidates.append(meta["content"])
+    # A link qualifies by its ".pdf" suffix or by its visible label.
+    for anchor in soup.find_all("a", href=True):
+        href = anchor["href"]
+        label = anchor.get_text(" ", strip=True).casefold()
+        if href.lower().endswith(".pdf") or label in {"pdf", "download pdf", "paper"}:
+            candidates.append(href)
+    # Resolve against base_url; only candidates whose final path ends in ".pdf" are accepted.
+    for candidate in candidates:
+        absolute = normalize_source_url(urljoin(base_url, candidate))
+        if is_pdf_url(absolute):
+            return absolute
+    return ""
+
+
+def extract_semantic_scholar_abstract(data):
+    # Normalized "abstract" field of a paper record; "" when the record or the field is missing or empty.
+    if not data:
+        return ""
+    raw_abstract = data.get("abstract")
+    # A null or empty field short-circuits before normalization.
+    return normalize_abstract(raw_abstract) if raw_abstract else ""
+
+
+def semantic_scholar_authors_match(result, authors):  # True when `result` shares at least one author with `authors`.
+    if not authors:
+        return True
+    # Names are compared through normalize_title_key rather than as raw strings.
+    expected = {normalize_title_key(author) for author in authors if author}
+    if not expected:
+        return True
+    # Only dict entries with a non-empty "name" contribute; one shared name is enough.
+    actual = {
+        normalize_title_key(author.get("name"))
+        for author in result.get("authors", [])
+        if isinstance(author, dict) and author.get("name")
+    }
+    return bool(expected & actual)
+
+
+def get_semantic_scholar_abstract(doi):  # Abstract text for a DOI from Semantic Scholar, or "" when unavailable.
+    if not doi:
+        return ""
+    # Paper lookup by "DOI:<doi>"; the JSON response goes through cached_get_json.
+    url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=title,abstract,authors,externalIds,year"
+    try:
+        data = cached_get_json(url, semantic_scholar_cache_path(doi))
+    except (requests.RequestException, ValueError) as e:
+        logger.debug(f"Failed to fetch Semantic Scholar abstract for DOI {doi}: {e}")
+        return ""
+    return extract_semantic_scholar_abstract(data)
+
+
+def get_semantic_scholar_abstract_for_title(title, authors=None):  # Abstract of the title-matched Semantic Scholar paper, or "".
+    if not title:
+        return ""
+    # requests is used here only to build the encoded query URL; the fetch itself goes through cached_get_json.
+    url = requests.Request(
+        "GET",
+        "https://api.semanticscholar.org/graph/v1/paper/search/match",
+        params={"query": title, "fields": "title,abstract,authors,externalIds,year"},
+    ).prepare().url
+    try:
+        data = cached_get_json(url, semantic_scholar_title_search_cache_path(title))
+    except (requests.RequestException, ValueError) as e:
+        logger.debug(f"Failed to search Semantic Scholar by title for {title}: {e}")
+        return ""
+    if not data:
+        return ""
+    # "data" may hold a list or a single object; a lone dict is wrapped so the loop below handles both.
+    results = data.get("data", [])
+    if isinstance(results, dict):
+        results = [results]
+    # A result must match the normalized title exactly and pass the author check before its abstract is used.
+    target_key = normalize_title_key(title)
+    for result in results:
+        if normalize_title_key(result.get("title")) != target_key:
+            continue
+        if not semantic_scholar_authors_match(result, authors):
+            continue
+        abstract = extract_semantic_scholar_abstract(result)
+        if abstract:
+            return abstract
+    return ""
+
+
 class BasePaperAbstract(ABC):
     def get_abstract(self, paper_html, title, authors):
-        # import ipdb; ipdb.set_trace()
-        # logger.debug(f"abstracting {paper_html}, title: {title}")
         try:
             publisher_url = self.get_publisher_url(paper_html)
         except Exception as e:
             logger.debug(f"Failed to obtain publisher URL. Paper: {title}")
             return ""
-        else:
-            try:
-                return self.get_abstract_from_publisher(publisher_url, authors)
-            except Exception as e:
-                logger.debug(f"Failed to extract abstract from publisher URL {publisher_url}.")
-                return ""
+        return self.get_abstract_from_url(publisher_url, title, authors)  # scraping and every fallback source live there
+
+    def get_abstract_from_url(self, publisher_url, title, authors):  # Try each abstract source in turn; "" if all fail.
+        publisher_url = normalize_source_url(publisher_url)
+        if not publisher_url:
+            abstract = get_openalex_abstract_for_title(title, authors)
+            if abstract:
+                return abstract
+            return get_semantic_scholar_abstract_for_title(title, authors)
+        # A URL that is itself a PDF is parsed directly before any HTML scraping is attempted.
+        if is_pdf_url(publisher_url):
+            abstract = extract_abstract_from_pdf_url(publisher_url)
+            if abstract:
+                return abstract
+        # Conference-specific scraper; any exception it raises is logged at debug level and treated as "no abstract".
+        direct_doi = extract_doi(publisher_url)
+        try:
+            abstract = self.get_abstract_from_publisher(publisher_url, authors)
+        except Exception as e:
+            logger.debug(f"Failed to extract abstract from publisher URL {publisher_url}. Paper: {title}")
+            abstract = ""
+        # Only the scraper's result is normalized here; the PDF, OpenAlex and Semantic Scholar helpers normalize their own output.
+        if abstract:
+            return normalize_abstract(abstract)
+        # Next: look for a PDF link on the publisher page and read the abstract from the PDF text.
+        pdf_url = ""
+        try:
+            pdf_url = find_pdf_url_in_soup(publisher_url, load_publisher_soup(publisher_url))
+        except Exception as e:
+            logger.debug(f"Failed to locate PDF from publisher URL {publisher_url}. Paper: {title}")
+            pdf_url = ""
+        if pdf_url:
+            abstract = extract_abstract_from_pdf_url(pdf_url)
+            if abstract:
+                return abstract
+        # The URL itself carried a DOI: Semantic Scholar by DOI, then title searches. OpenAlex-by-DOI is not tried on this path.
+        if direct_doi:
+            abstract = get_semantic_scholar_abstract(direct_doi)
+            if abstract:
+                return abstract
+            abstract = get_openalex_abstract_for_title(title, authors)
+            if abstract:
+                return abstract
+            return get_semantic_scholar_abstract_for_title(title, authors)
+        # Otherwise try to resolve a DOI (resolve_doi may load the publisher page) and query both services by DOI.
+        try:
+            doi = resolve_doi(publisher_url)
+        except Exception as e:
+            logger.debug(f"Failed to resolve DOI from publisher URL {publisher_url}. Paper: {title}")
+            doi = ""
+        if doi:
+            abstract = get_openalex_abstract(doi)
+            if abstract:
+                return abstract
+            abstract = get_semantic_scholar_abstract(doi)
+            if abstract:
+                return abstract
+        # Last resort: exact-title search on OpenAlex, then Semantic Scholar.
+        abstract = get_openalex_abstract_for_title(title, authors)
+        if abstract:
+            return abstract
+        return get_semantic_scholar_abstract_for_title(title, authors)
 
     def get_publisher_url(self, paper_html):
         ee = paper_html.find('li', {'class': 'ee'})
@@ -45,113 +503,82 @@ def get_abstract_from_publisher(self, url, authors):
 class AbstractNDSS(BasePaperAbstract):
     def get_abstract_from_publisher(self, url, authors):
         logger.debug(f'URL: {url}')
-        r = requests.get(url)
-        assert r.status_code == 200
-
-        html = BeautifulSoup(r.text, 'html.parser')
+        html = load_publisher_soup(url)  # parsed page, or None when no HTML was returned
+        if html is None:
+            return ""
         paper_data = html.find('div', {'class': 'paper-data'})
         if paper_data is not None:
-            abstract_paragraphs = filter(lambda x: x.text != '' and not authors[0] in x.text, paper_data.find_all('p'))
-            ap_list = [x.text for x in abstract_paragraphs]
+            first_author = authors[0] if authors else None  # paragraphs containing the first author's name are excluded
+            abstract_paragraphs = filter(
+                lambda x: x.get_text(strip=True) != '' and (first_author is None or first_author not in x.get_text()),
+                paper_data.find_all('p'),
+            )
+            ap_list = [x.get_text(separator='\n', strip=True) for x in abstract_paragraphs]
             return '\n'.join(ap_list)
-        else:
-            abstract_paragraphs = html.find(string=re.compile("Abstract:")).find_next(recursive=False)
-            return abstract_paragraphs.get_text(separator='\n')
+        abstract_marker = html.find(string=re.compile(r"Abstract:?", re.I))
+        abstract_paragraphs = abstract_marker.find_next(recursive=False) if abstract_marker is not None else None
+        if abstract_paragraphs is None:  # no "Abstract" text on the page, or no element after it
+            return extract_meta_description(html)
+        return abstract_paragraphs.get_text(separator='\n', strip=True)
 
 
 class AbstractSP(BasePaperAbstract):
-    def has_abstract_sibling(self, tag):
-        return any(sibling for sibling in tag.find_all_next() if sibling.get_text(strip=True) == 'Abstract')
-   
-    def update_url(self, url):
-        parsed_url = urlparse(url)
-        ieee_netloc = 'doi.ieeecomputersociety.org'
-        doi_netlog = 'doi.org'
-        if parsed_url.netloc != ieee_netloc and parsed_url.netloc != 'doi.org':
-            modified_url = urlunparse((parsed_url.scheme, ieee_netloc, parsed_url.path,
-                            parsed_url.params, parsed_url.query, parsed_url.fragment))
-            return modified_url
-        else:
-            return url
-
-    def _get_abstract_from_computerorg(self, url):
-        # TODO: handle the case when Chrome is not available
-        driver = webdriver.Chrome()
-        url = self.update_url(url)
-        driver.get(url)
-
-
-        # Wait for the dynamic element to be present on the page
-        element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
-        # TODO: I'm not sure if this can handle abstracts with multiple paragraphs
-        abstract = element.find_element(By.CLASS_NAME, 'article-content').text
-        driver.quit()
-        return abstract
-    
-    def _get_abstract_from_ieeexplore(self, url):
-        # TODO: handle the case when Chrome is not available
-        driver = webdriver.Chrome()
-        url = self.update_url(url)
-        logger.debug(f'URL: {url}')
-        driver.get(url)
-
-        # Wait for the dynamic element to be present on the page
-        element = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CLASS_NAME, 'abstract-text')))
-        temp = element.find_elements(By.CLASS_NAME, 'abstract-text-view-all')
-        if len(temp) > 0:
-            # If there's a view all button
-            view_all = temp[0]
-            driver.execute_script("arguments[0].scrollIntoView(true);", view_all)
-            view_all.click()
-            text = driver.find_element(By.CLASS_NAME, 'abstract-text').text
-        else:
-            text = element.text
-        
-        if text.find('Abstract:\n') >= 0:
-            text = text[text.find('Abstract:\n') + len('Abstract:\n'):]
-        if text.find('\n(Show Less)') >= 0:
-            text = text[:text.find('\n(Show Less)')]
-        
-        driver.close()
-        return text
-    
     def get_abstract_from_publisher(self, url, _):
-        # TODO: this is super slow. Maybe not Selenium?
-        parsed_url = urlparse(url)
-        ieee_netloc = 'doi.ieeecomputersociety.org'
-        doi_netlog = 'doi.org'
-        if parsed_url.netloc == ieee_netloc:  
-            return self._get_abstract_from_computerorg(url)
-        elif parsed_url.netloc == doi_netlog:
-            return self._get_abstract_from_ieeexplore(url)
-        else:
-            raise NotImplementedError
+        doi = resolve_doi(url)
+        if doi:
+            return get_openalex_abstract(doi)
+
+        html = load_publisher_soup(url)
+        if html is None:
+            return ""
+
+        for attrs in (
+            {"name": "description"},
+            {"property": "og:description"},
+        ):
+            meta = html.find("meta", attrs=attrs)
+            if meta and meta.get("content"):
+                return meta["content"]
+        return ""
 
 
 class AbstractUSENIX(BasePaperAbstract):
     def get_abstract_from_publisher(self, url, authors):
-        r = requests.get(url)
         logger.debug(f'URL: {url}')
-        assert r.status_code == 200
+        html = load_publisher_soup(url)  # parsed page, or None when no HTML was returned
+        if html is None:
+            return ""
+        abstract_div = html.find('div', {'class': 'field-name-field-paper-description'})
+        if abstract_div:
+            return abstract_div.get_text(separator='\n', strip=True)
 
-        html = BeautifulSoup(r.text, 'html.parser')
+        # Fallback for older page layouts
+        abstract_marker = html.find(string=re.compile(r"Abstract:?", re.I))
+        following = abstract_marker.find_next(recursive=False) if abstract_marker else None
+        if following is not None:  # a marker with nothing after it drops through to the meta description
+            return following.get_text(separator='\n', strip=True)
 
-        abstract_paragraphs = html.find(string=re.compile("Abstract:")).find_next(recursive=False)
-        return abstract_paragraphs.get_text(separator='\n')
+        return extract_meta_description(html)
 
 
 class AbstractCCS(BasePaperAbstract):
     def get_abstract_from_publisher(self, url, authors):
-        # TODO: ACM library doesn't like me to crawl and will ban me when upset.
         logger.debug(f'URL: {url}')
-        r = requests.get(url)
-        assert r.status_code == 200
+        doi = resolve_doi(url)  # OpenAlex by DOI is tried before the publisher page is scraped
+        if doi:
+            abstract = get_openalex_abstract(doi)
+            if abstract:
+                return abstract
 
-        html = BeautifulSoup(r.text, 'html.parser')
-        abstract_paragraphs = html.find('div', {'class': 'abstractInFull'})
-        return abstract_paragraphs.get_text(separator='\n')
-        # ap_list = [x.text for x in abstract_paragraphs]
-        # return '\n'.join(ap_list)
+        html = load_publisher_soup(url)  # reached when no DOI resolved or OpenAlex had no abstract
+        if html is not None:
+            abstract_paragraphs = html.find('div', {'class': 'abstractInFull'})
+            if abstract_paragraphs is not None:
+                return abstract_paragraphs.get_text(separator='\n', strip=True)
+            meta_description = extract_meta_description(html)  # last on-page source before giving up
+            if meta_description:
+                return meta_description
+        return ""
 
 NDSS = AbstractNDSS()
 SP = AbstractSP()
@@ -165,9 +592,5 @@ def get_abstract_from_publisher(self, url, authors):
 
 if __name__ == '__main__':
     logger.setLevel('DEBUG')
-    # SP.get_abstract_from_publisher('https://doi.ieeecomputersociety.org/10.1109/SP46215.2023.00131', [])
-    # SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179411', [])
-    # print(SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179381', []))
-    # print(USENIX.get_abstract_from_publisher('https://www.usenix.org/conference/usenixsecurity20/presentation/cremers', []))
-    # print(CCS.get_abstract_from_publisher('https://doi.org/10.1145/3576915.3616615', []))
+    print(SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179381', []))
     print(NDSS.get_abstract_from_publisher('https://www.ndss-symposium.org/ndss2015/i-do-not-know-what-you-visited-last-summer-protecting-users-third-party-web-tracking', []))
diff --git a/top4grep/build_db.py b/top4grep/build_db.py
index 37ff8eb..25aada9 100644
--- a/top4grep/build_db.py
+++ b/top4grep/build_db.py
@@ -1,14 +1,17 @@
 from datetime import datetime
+from html import unescape
 from pathlib import Path
+from urllib.parse import quote
 
 import requests
 import sqlalchemy
-from sqlalchemy.orm import sessionmaker
 from bs4 import BeautifulSoup
+from sqlalchemy.orm import sessionmaker
 
+from .cache import DATA_DIR, cache_key_path, cached_get_json, cached_get_text
 from .utils import new_logger
 from .db import Base, Paper
-from .abstract import Abstracts
+from .abstract import Abstracts, normalize_abstract
 
 logger = new_logger("DB")
 logger.setLevel('WARNING')
@@ -20,60 +23,365 @@
         "USENIX": "uss",
         "CCS": "ccs",
         }
+FRONT_MATTER_TITLES = {
+    "Conference Organizers.",
+    "External Reviewers.",
+    "Message from the Program Chairs.",
+    "Program Committee.",
+}
 PACKAGE_DIR = Path(__file__).resolve().parent
 DB_PATH = PACKAGE_DIR / "data" / "papers.db"
+DBLP_BASE_URL = "https://dblp.uni-trier.de"
+TOC_API_HOSTS = [
+    "https://dblp.org",
+    "https://dblp.dagstuhl.de",
+]
+TOC_FIRST_YEARS = {
+    "ndss": {2011, 2012, 2013, 2014},
+}
+RAW_DBLP_API_DIR = DATA_DIR / "raw" / "dblp" / "api"
+RAW_DBLP_HTML_DIR = DATA_DIR / "raw" / "dblp" / "html"
 
+DB_PATH.parent.mkdir(parents=True, exist_ok=True)
 engine = sqlalchemy.create_engine(f'sqlite:///{str(DB_PATH)}')
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
-def save_paper(conf, year, title, authors, abstract):
-    logger.debug(f'Adding paper {title} with abstract {abstract[:20]}...')
-    session = Session()
-    paper = Paper(conference=conf, year=year, title=title, authors=", ".join(authors), abstract=abstract)
-    session.add(paper)
-    session.commit()
-    session.close()
 
-def paper_exist(conf, year, title, authors, abstract):
-    session = Session()
-    paper = session.query(Paper).filter(Paper.conference==conf, Paper.year==year, Paper.title==title, Paper.abstract==abstract).first()
-    session.close()
-    return paper is not None
+def normalize_metadata_text(value):  # Unescape HTML entities and collapse whitespace runs into single spaces.
+    if not value:  # None and "" (any falsy value) normalize to "".
+        return ""
+    return " ".join(unescape(value).split())  # split() without arguments also trims both ends.
 
-def get_papers(name, year, build_abstract):
-    cnt = 0
-    conf = NAME_MAP[name]
 
-    if build_abstract and name == "NDSS" and (year == 2018 or year == 2016):
-        logger.warning(f"Skipping the abstract for NDSS {year} becuase the website does not contain abstracts.")
-        extract_abstract = False
-    else:
-        extract_abstract = build_abstract
+def normalize_author_list(authors):  # Normalize each author name and drop the ones that end up empty.
+    normalized = []
+    for author in authors:
+        cleaned = normalize_metadata_text(author)
+        if cleaned:  # Skip names that were empty or whitespace-only.
+            normalized.append(cleaned)
+    return normalized  # Input order is preserved.
+
+
+def preferred_text(values):  # Return the longest non-empty value, or "" when every value is empty.
+    cleaned = [value for value in values if value]
+    if not cleaned:
+        return ""
+    return max(cleaned, key=len)  # On equal lengths max() keeps the earliest candidate.
+
+
+def should_skip_record(title, publisher_url=""):  # Return True for entries that should not be stored as papers.
+    lowered_title = (title or "").casefold()
+    lowered_url = (publisher_url or "").casefold()
+
+    if title in FRONT_MATTER_TITLES:  # Exact, case-sensitive match against the known front-matter titles.
+        return True
+    if lowered_title.startswith("poster:") or lowered_title.startswith("demo:"):
+        return True
+    if lowered_title.startswith("keynote:") or lowered_title.startswith("keynote address:"):
+        return True
+    if "(invited talk)" in lowered_title:
+        return True
+    if "/slides/" in lowered_url:  # Publisher link contains a /slides/ path segment.
+        return True
+    return False
+
+
+def migrate_database():  # Normalize every stored row in place, then merge rows sharing (conference, year, title).
+    normalized_rows = 0
+    duplicate_rows = 0
+
+    with Session.begin() as session:  # One transaction for the whole migration.
+        rows = session.query(Paper).order_by(Paper.id).all()
+        grouped_rows = {}
+
+        for row in rows:
+            original = (row.title or "", row.authors or "", row.abstract or "")
+            row.title = normalize_metadata_text(row.title)
+            row.authors = normalize_metadata_text(row.authors)
+            row.abstract = normalize_abstract(unescape(row.abstract or ""))
+
+            if (row.title, row.authors, row.abstract) != original:  # Count rows whose text actually changed.
+                normalized_rows += 1
+
+            key = (row.conference or "", row.year, row.title)  # Grouping uses the already-normalized title.
+            grouped_rows.setdefault(key, []).append(row)
+
+        for row_group in grouped_rows.values():
+            if len(row_group) == 1:
+                continue
+
+            survivor = row_group[0]  # Rows were ordered by id, so the lowest id in each group survives.
+            survivor.authors = preferred_text([row.authors for row in row_group])
+            survivor.abstract = preferred_text([row.abstract for row in row_group])
+
+            for duplicate in row_group[1:]:
+                session.delete(duplicate)
+                duplicate_rows += 1
+
+    return normalized_rows, duplicate_rows  # (rows changed by normalization, rows deleted as duplicates)
+
+def save_papers(papers):  # Insert the given Paper objects in one transaction; return how many were passed in.
+    if not papers:  # Nothing to insert: skip opening a session.
+        return 0
+
+    logger.debug("Adding %s new papers...", len(papers))
+    with Session.begin() as session:
+        session.add_all(papers)
+    return len(papers)
+
+
+def update_papers(paper_updates):  # Apply (paper_id, authors, abstract) updates in one transaction.
+    if not paper_updates:
+        return 0
+
+    with Session.begin() as session:
+        for paper_id, authors, abstract in paper_updates:
+            paper = session.get(Paper, paper_id)
+            if paper is None:  # Unknown id: skipped silently.
+                continue
+            if authors:  # A non-empty author string always overwrites the stored one.
+                paper.authors = authors
+            if abstract and not paper.abstract:  # An abstract only fills a blank; it never replaces existing text.
+                paper.abstract = abstract
+    return len(paper_updates)  # Counts requested updates, including entries skipped above.
+
+
+def load_existing_papers(conf, year):  # Map normalized title -> {"id", "authors", "abstract"} for one conference/year.
+    with Session() as session:
+        rows = session.query(Paper).filter(
+            Paper.conference == conf,
+            Paper.year == year,
+        ).all()
+
+    existing = {}
+    for row in rows:
+        normalized_title = normalize_metadata_text(row.title)
+        current = existing.get(normalized_title)
+        candidate = {
+            "id": row.id,
+            "authors": normalize_metadata_text(row.authors),
+            "abstract": normalize_abstract(unescape(row.abstract or "")),
+        }
+        if current is None or (not current["abstract"] and candidate["abstract"]):  # On a title clash keep the first row unless only the later one has an abstract.
+            existing[normalized_title] = candidate
+    return existing
+
+
+def prune_stale_papers(conf, year, current_titles):  # Delete stored rows for conf/year whose normalized title is not in current_titles.
+    if not current_titles:  # An empty title set deletes nothing.
+        return 0
+
+    removed = 0
+    with Session.begin() as session:
+        rows = session.query(Paper).filter(
+            Paper.conference == conf,
+            Paper.year == year,
+        ).all()
+
+        for row in rows:
+            if normalize_metadata_text(row.title) in current_titles:  # Membership test against the caller-supplied titles.
+                continue
+            session.delete(row)
+            removed += 1
+    return removed  # Number of rows deleted.
+
+def build_toc_query(conf, year):  # Build the "toc:" search string for one proceedings volume; conf is the short key (a NAME_MAP value such as "ccs").
+    return f"toc:db/conf/{conf}/{conf}{year}.bht:"
+
+def fetch_toc_api_records(conf, year):  # Query each host in TOC_API_HOSTS in order and return the first usable hit list.
+    query = quote(build_toc_query(conf, year))
+    last_error = None
+    saw_not_found = False
+
+    for host in TOC_API_HOSTS:
+        url = f"{host}/search/publ/api?q={query}&h=1000&format=json"
+        cache_path = cache_key_path(RAW_DBLP_API_DIR, url, ".json", f"{conf}-{year}")  # Cache file is keyed by the full URL, so each host has its own entry.
+        try:
+            data = cached_get_json(url, cache_path)
+            if data is None:  # cached_get_json returns None for a 404 or a cached miss.
+                saw_not_found = True
+                continue
+            hits = data["result"]["hits"].get("hit", [])
+            if isinstance(hits, dict):  # A lone dict is wrapped so callers always receive a list.
+                hits = [hits]
+            return hits
+        except (requests.RequestException, ValueError, KeyError) as e:  # Network, JSON-decoding or response-shape failure: try the next host.
+            last_error = e
+
+    if last_error is not None and not saw_not_found:  # Raise only when hosts failed and none reported the volume as missing.
+        raise requests.RequestException(last_error)
+    return []
+
+def normalize_api_authors(authors):  # Turn the API "authors" object into a list of normalized author names.
+    if not authors:
+        return []
+
+    author_entries = authors.get("author", [])
+    if isinstance(author_entries, dict):  # A single author object is wrapped into a one-element list.
+        author_entries = [author_entries]
+
+    normalized = []
+    for author in author_entries:
+        if isinstance(author, dict):
+            name = author.get("text", "").strip()  # Dict entries carry the name under "text".
+        else:
+            name = str(author).strip()  # Any other entry type is stringified.
+        if name:
+            normalized.append(name)
+    return normalize_author_list(normalized)
+
+def normalize_toc_api_records(hits):  # Convert raw API hits into {"title", "authors", "publisher_url"} records.
+    records = []
+    for hit in hits:
+        info = hit.get("info", {})
+        if info.get("type") == "Editorship":  # Skip hits typed as "Editorship".
+            continue
+
+        title = normalize_metadata_text(info.get("title"))
+        if not title:
+            continue
+
+        publisher_url = info.get("ee") or ""  # NOTE(review): used as a string below; confirm "ee" is never a list in API responses.
+        if should_skip_record(title, publisher_url):
+            continue
+
+        records.append({
+            "title": title,
+            "authors": normalize_api_authors(info.get("authors")),
+            "publisher_url": publisher_url,
+        })
+    return records
+
+def load_records(conf, year):
+    if year >= datetime.now().year or year in TOC_FIRST_YEARS.get(conf, set()):
+        hits = fetch_toc_api_records(conf, year)
+        return normalize_toc_api_records(hits)
+
     try:
-        r = requests.get(f"https://dblp.org/db/conf/{conf}/{conf}{year}.html")
-        assert r.status_code == 200
+        url = f"{DBLP_BASE_URL}/db/conf/{conf}/{conf}{year}.html"
+        cache_path = cache_key_path(RAW_DBLP_HTML_DIR, url, ".html", f"{conf}-{year}")
+        html_text = cached_get_text(url, cache_path)
+        if html_text is None:
+            return []
 
-        html = BeautifulSoup(r.text, 'html.parser')
+        html = BeautifulSoup(html_text, 'html.parser')
         paper_htmls = html.find_all("li", {'class': "inproceedings"})
+        records = []
         for paper_html in paper_htmls:
-            title = paper_html.find('span', {'class': 'title'}).text
-            authors = [x.text for x in paper_html.find_all('span', {'itemprop': 'author'})]
-            if extract_abstract:
-                abstract = Abstracts[name].get_abstract(paper_html, title, authors)
-            else:
+            title_tag = paper_html.find('span', {'class': 'title'})
+            if title_tag is None:
+                continue
+
+            ee = paper_html.find('li', {'class': 'ee'})
+            publisher_url = ee.find('a').get('href') if ee and ee.find('a') else ""
+            title = normalize_metadata_text(title_tag.get_text(" ", strip=True))
+            if should_skip_record(title, publisher_url):
+                continue
+
+            records.append({
+                "title": title,
+                "authors": normalize_author_list(
+                    [x.get_text(" ", strip=True) for x in paper_html.find_all('span', {'itemprop': 'author'})]
+                ),
+                "paper_html": paper_html,
+                "publisher_url": publisher_url,
+            })
+        return records
+    except requests.RequestException:
+        hits = fetch_toc_api_records(conf, year)
+        return normalize_toc_api_records(hits)
+
+def get_papers(name, year, build_abstract):
+    conf = NAME_MAP[name]
+    processed = 0
+    existing_papers = load_existing_papers(name, year)
+    extract_abstract = build_abstract
+
+    try:
+        records = load_records(conf, year)
+        new_papers = []
+        paper_updates = []
+
+        for record in records:
+            title = normalize_metadata_text(record["title"])
+            authors = normalize_author_list(record["authors"])
+            try:
+                if extract_abstract:
+                    if "paper_html" in record:
+                        abstract = Abstracts[name].get_abstract(record["paper_html"], title, authors) or ""
+                    else:
+                        abstract = Abstracts[name].get_abstract_from_url(record.get("publisher_url"), title, authors) or ""
+                else:
+                    abstract = ''
+            except Exception as e:
+                logger.debug(f"Failed to extract abstract for {name}-{year}: {title}: {e}")
                 abstract = ''
-            # insert the entry only if the paper does not exist
-            if not paper_exist(name, year, title, authors, abstract):
-                save_paper(name, year, title, authors, abstract)
-            cnt += 1
+
+            abstract = normalize_abstract(unescape(abstract))
+
+            authors_text = ", ".join(authors)
+            existing = existing_papers.get(title)
+            if existing is not None:
+                if (authors_text and authors_text != existing["authors"]) or (abstract and not existing["abstract"]):
+                    paper_updates.append((existing["id"], authors_text, abstract))
+                    existing["authors"] = authors_text
+                    if abstract:
+                        existing["abstract"] = abstract
+                processed += 1
+                continue
+
+            existing_papers[title] = {
+                "id": None,
+                "authors": authors_text,
+                "abstract": abstract,
+            }
+            new_papers.append(Paper(
+                conference=name,
+                year=year,
+                title=title,
+                authors=authors_text,
+                abstract=abstract,
+            ))
+            processed += 1
+    except requests.RequestException as e:
+        if existing_papers:
+            logger.debug(f"Keeping existing records for {name}-{year} after refresh failure: {e}")
+            return len(existing_papers), 0
+        logger.warning(f"Failed to obtain papers at {name}-{year}: {e}")
+        return 0, 0
     except Exception as e:
-        logger.warning(f"Failed to obtain papers at {name}-{year}")
+        if existing_papers:
+            logger.debug(f"Keeping existing records for {name}-{year} after parse failure: {e}")
+            return len(existing_papers), 0
+        logger.warning(f"Failed to parse papers at {name}-{year}: {e}")
+        return 0, 0
+
+    removed = prune_stale_papers(name, year, {record["title"] for record in records})
+    added = save_papers(new_papers)
+    updated = update_papers(paper_updates)
+    logger.debug(f"Found {processed} papers at {name}-{year}, added {added}, updated {updated}, removed {removed}...")
+    return processed, added + updated + removed
+
 
-    logger.debug(f"Found {cnt} papers at {name}-{year}...")
+def build_db(build_abstract, conferences=None, start_year=2000, end_year=None):
+    if end_year is None:
+        end_year = datetime.now().year
 
+    normalized_rows, duplicate_rows = migrate_database()
+    if normalized_rows or duplicate_rows:
+        logger.info(
+            "Normalized %s records and removed %s duplicates before rebuild.",
+            normalized_rows,
+            duplicate_rows,
+        )
 
-def build_db(build_abstract):
-    for conf in CONFERENCES:
-        for year in range(2000, datetime.now().year+1):
-            get_papers(conf, year, build_abstract)
+    selected_conferences = conferences or CONFERENCES
+    processed = 0
+    added = 0
+    for conf in selected_conferences:
+        for year in range(start_year, end_year + 1):
+            current_processed, current_added = get_papers(conf, year, build_abstract)
+            processed += current_processed
+            added += current_added
+    return processed, added
diff --git a/top4grep/bundle.py b/top4grep/bundle.py
new file mode 100644
index 0000000..fda9a1d
--- /dev/null
+++ b/top4grep/bundle.py
@@ -0,0 +1,109 @@
+import json
+import shutil
+import zipfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+from .cache import DATA_DIR, RAW_DIR
+
+BUNDLE_FORMAT_VERSION = 1
+MANIFEST_NAME = "manifest.json"
+
+
+def default_db_path(data_dir=DATA_DIR):  # Return the path of papers.db directly under data_dir.
+    return Path(data_dir) / "papers.db"
+
+
+def iter_bundle_files(data_dir=DATA_DIR):  # Yield (source_path, archive_path) pairs for everything that goes into a bundle.
+    data_dir = Path(data_dir)
+    db_path = default_db_path(data_dir)
+
+    if db_path.exists():  # The database is optional; it is yielded first when present.
+        yield db_path, Path("papers.db")
+
+    raw_dir = data_dir / "raw"
+    if raw_dir.exists():
+        for path in sorted(raw_dir.rglob("*")):  # Sorted so entries come out in a stable order.
+            if path.is_file():
+                yield path, Path("raw") / path.relative_to(raw_dir)  # Archive paths are relative and rooted at "raw/".
+
+
+def build_bundle_manifest(entries):  # Build the manifest dict (per-file sizes plus totals) for (source_path, archive_path) pairs.
+    files = []
+    total_bytes = 0
+    for source_path, archive_path in entries:
+        size = source_path.stat().st_size  # On-disk size in bytes (before compression).
+        files.append({"path": archive_path.as_posix(), "size": size})
+        total_bytes += size
+
+    return {
+        "bundle_format_version": BUNDLE_FORMAT_VERSION,
+        "created_at_utc": datetime.now(timezone.utc).isoformat(),
+        "file_count": len(files),
+        "total_bytes": total_bytes,
+        "files": files,
+    }
+
+
+def create_cache_bundle(output_path, data_dir=DATA_DIR):  # Write a zip with a manifest, papers.db and raw/ files; return summary stats.
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    entries = list(iter_bundle_files(data_dir))  # Materialized because the entries are walked twice (manifest, then archive).
+    manifest = build_bundle_manifest(entries)
+
+    with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as bundle:
+        bundle.writestr(MANIFEST_NAME, json.dumps(manifest, indent=2, sort_keys=True))  # Manifest is written before the data files.
+        for source_path, archive_path in entries:
+            bundle.write(source_path, archive_path.as_posix())
+
+    return {
+        "output_path": output_path,
+        "file_count": manifest["file_count"],
+        "total_bytes": manifest["total_bytes"],
+        "archive_bytes": output_path.stat().st_size,
+    }
+
+
+def resolve_extract_path(data_dir, archive_name):  # Resolve an archive member name to a path inside data_dir; raise ValueError otherwise.
+    data_dir = Path(data_dir).resolve()
+    target = (data_dir / archive_name).resolve()  # resolve() collapses ".." segments; an absolute archive_name replaces data_dir in the join.
+    if target != data_dir and data_dir not in target.parents:  # Reject anything that lands outside data_dir.
+        raise ValueError(f"Refusing to extract outside data directory: {archive_name}")
+    return target
+
+
+def install_cache_bundle(bundle_path, data_dir=DATA_DIR, replace=False):  # Extract a cache bundle into data_dir; replace=True first removes raw/ and papers.db. Returns a summary dict.
+    bundle_path = Path(bundle_path)
+    data_dir = Path(data_dir)
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    extracted_files = 0
+    manifest = None
+    with zipfile.ZipFile(bundle_path, "r") as bundle:  # Open the archive BEFORE deleting anything: a missing or corrupt bundle raises here and leaves local data intact.
+        if MANIFEST_NAME in bundle.namelist():  # The manifest is optional and is never extracted to disk.
+            manifest = json.loads(bundle.read(MANIFEST_NAME).decode("utf-8"))
+
+        if replace:  # Destructive step, reached only once the archive and its manifest were read successfully.
+            raw_dir = data_dir / "raw"
+            if raw_dir.exists():
+                shutil.rmtree(raw_dir)
+            db_path = default_db_path(data_dir)
+            if db_path.exists():
+                db_path.unlink()
+
+        for info in bundle.infolist():
+            if info.is_dir() or info.filename == MANIFEST_NAME:
+                continue
+            target = resolve_extract_path(data_dir, info.filename)  # Raises ValueError for members that would escape data_dir.
+            target.parent.mkdir(parents=True, exist_ok=True)
+            with bundle.open(info, "r") as src, target.open("wb") as dst:
+                shutil.copyfileobj(src, dst)
+            extracted_files += 1
+
+    return {
+        "bundle_path": bundle_path,
+        "data_dir": data_dir,
+        "extracted_files": extracted_files,
+        "manifest": manifest,
+    }
diff --git a/top4grep/cache.py b/top4grep/cache.py
new file mode 100644
index 0000000..2f4d009
--- /dev/null
+++ b/top4grep/cache.py
@@ -0,0 +1,153 @@
+import hashlib
+import json
+import re
+from pathlib import Path
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+PACKAGE_DIR = Path(__file__).resolve().parent
+DATA_DIR = PACKAGE_DIR / "data"
+RAW_DIR = DATA_DIR / "raw"
+USER_AGENT = "top4grep/0.0.0 (+https://github.com/Kyle-Kyle/top4grep)"
+HTTP_TIMEOUT_SECONDS = 15
+TEXT_MISSING_SENTINEL = "__top4grep_missing__"
+JSON_MISSING_SENTINEL = {"__top4grep_missing__": True}
+MISSING = object()
+
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+RAW_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def build_http_session():  # Create a requests.Session with a retrying adapter and the project's default headers.
+    retry_policy = Retry(
+        total=5,
+        connect=3,
+        read=3,
+        status=3,
+        backoff_factor=0.5,
+        status_forcelist=(429, 500, 502, 503, 504),  # Status codes that trigger a retry.
+        allowed_methods=frozenset(["GET"]),  # Only GET requests are retried.
+        respect_retry_after_header=True,
+    )
+    adapter = HTTPAdapter(max_retries=retry_policy)
+    session = requests.Session()
+    session.headers.update(
+        {
+            "User-Agent": USER_AGENT,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.9,*/*;q=0.8",
+        }
+    )
+    session.mount("http://", adapter)  # The same adapter serves both URL schemes.
+    session.mount("https://", adapter)
+    return session
+
+
+HTTP = build_http_session()
+
+
+def slugify(value, max_length=80):  # Reduce value to a lowercase name made of [a-z0-9._-], at most max_length characters.
+    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-").lower()  # Each run of other characters becomes one "-".
+    if not slug:
+        slug = "item"  # Fallback when nothing usable remains.
+    return slug[:max_length]  # Truncation happens after stripping, so the result may end in "-".
+
+
+def cache_key_path(root, key, suffix, hint=None):  # Return root/<slug>-<digest><suffix>, creating root if needed.
+    root.mkdir(parents=True, exist_ok=True)
+    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12]  # First 12 hex characters of the key's SHA-256.
+    stem = slugify(hint or key)  # Readable prefix; the digest always comes from key, not from hint.
+    return root / f"{stem}-{digest}{suffix}"
+
+
+def read_text(path):  # Return cached text, MISSING for a recorded miss, or None when no cache file exists.
+    if not path.exists():
+        return None
+    text = path.read_text(encoding="utf-8")
+    if text == TEXT_MISSING_SENTINEL:  # A file holding exactly the sentinel string marks a recorded miss.
+        return MISSING
+    return text
+
+
+def write_text(path, value):  # Write value to path as UTF-8, creating parent directories as needed.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(value, encoding="utf-8")
+
+
+def read_bytes(path):  # Return cached bytes, MISSING for a recorded 404, or None when nothing is cached for path.
+    missing_path = path.with_suffix(f"{path.suffix}.missing")
+    if missing_path.exists():  # Check the marker first: a 404 writes only the marker, never `path` itself.
+        return MISSING
+    if not path.exists():
+        return None
+    return path.read_bytes()
+
+
+def write_bytes(path, value):  # Write raw bytes to path, creating parent directories as needed.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_bytes(value)
+
+
+def read_json(path):  # Return cached JSON data, MISSING for a recorded miss, or None when no cache file exists.
+    if not path.exists():
+        return None
+    data = json.loads(path.read_text(encoding="utf-8"))  # Invalid JSON in the cache file raises json.JSONDecodeError (a ValueError).
+    if data == JSON_MISSING_SENTINEL:  # A file holding exactly the sentinel object marks a recorded miss.
+        return MISSING
+    return data
+
+
+def write_json(path, value):  # Serialize value as indented, key-sorted JSON (UTF-8), creating parent directories as needed.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(value, indent=2, sort_keys=True), encoding="utf-8")
+
+
+def cached_get_text(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS):  # Return the body of url as text, served from cache_path when present; None for a 404.
+    cached = read_text(cache_path)
+    if cached is MISSING:  # A previous 404 was recorded: do not hit the network again.
+        return None
+    if cached is not None:
+        return cached
+
+    response = HTTP.get(url, timeout=timeout)
+    if response.status_code == 404:
+        write_text(cache_path, TEXT_MISSING_SENTINEL)  # Record the miss so later calls return None without fetching.
+        return None
+    response.raise_for_status()  # Other 4xx/5xx responses raise requests.HTTPError and are not cached.
+    write_text(cache_path, response.text)
+    return response.text
+
+
+def cached_get_json(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS):  # Return the decoded JSON body of url, served from cache_path when present; None for a 404.
+    cached = read_json(cache_path)
+    if cached is MISSING:  # A previous 404 was recorded: do not hit the network again.
+        return None
+    if cached is not None:  # NOTE(review): a cached JSON null reads back as None and is therefore re-fetched.
+        return cached
+
+    response = HTTP.get(url, timeout=timeout)
+    if response.status_code == 404:
+        write_json(cache_path, JSON_MISSING_SENTINEL)  # Record the miss so later calls return None without fetching.
+        return None
+    response.raise_for_status()  # Other 4xx/5xx responses raise requests.HTTPError and are not cached.
+    data = response.json()
+    write_json(cache_path, data)
+    return data
+
+
+def cached_get_bytes(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS):  # Return the raw body of url, served from cache_path when present; None for a 404.
+    cached = read_bytes(cache_path)
+    if cached is MISSING:
+        return None
+    if cached is not None:
+        return cached
+
+    response = HTTP.get(url, timeout=timeout)
+    if response.status_code == 404:
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        cache_path.with_suffix(f"{cache_path.suffix}.missing").write_text("", encoding="utf-8")  # The miss is recorded as an empty "<name>.missing" file beside cache_path; cache_path itself is not created.
+        return None
+    response.raise_for_status()  # Other 4xx/5xx responses raise requests.HTTPError and are not cached.
+    write_bytes(cache_path, response.content)
+    return response.content
diff --git a/top4grep/db.py b/top4grep/db.py
index eb45eb1..e4a0e9e 100644
--- a/top4grep/db.py
+++ b/top4grep/db.py
@@ -1,5 +1,5 @@
-from sqlalchemy import Column, String, Integer
-from sqlalchemy.ext.declarative import declarative_base, declared_attr
+from sqlalchemy import Column, Integer, String
+from sqlalchemy.orm import declarative_base, declared_attr
 
 class BaseTable:
     @declared_attr
diff --git a/top4grep/search.py b/top4grep/search.py
new file mode 100644
index 0000000..271a302
--- /dev/null
+++ b/top4grep/search.py
@@ -0,0 +1,83 @@
+import re
+
+from nltk.stem import PorterStemmer
+
+TOKEN_RE = re.compile(r"[A-Za-z0-9_+#.-]+")
+STEMMER = PorterStemmer()
+SEARCH_FIELDS = ("title", "abstract", "authors", "conference", "all")
+CONFERENCE_ORDER = {
+    "CCS": 3,
+    "USENIX": 2,
+    "IEEE S&P": 1,
+    "NDSS": 0,
+}
+
+
+def normalize_token(token):  # Trim, lowercase and Porter-stem a single token; "" for blank input.
+    token = token.strip().lower()
+    if not token:
+        return ""
+    return STEMMER.stem(token)
+
+
+def tokenize(text):  # Split text into stemmed, lowercase tokens; [] for empty or None input.
+    if not text:
+        return []
+    tokens = []
+    for raw in TOKEN_RE.findall(text.lower()):  # TOKEN_RE keeps letters, digits and the characters _ + # . - inside a token.
+        normalized = normalize_token(raw)
+        if normalized:
+            tokens.append(normalized)
+    return tokens
+
+
+def normalize_term(term):  # Tokenize one search term into its list of stemmed tokens.
+    return [token for token in tokenize(term) if token]  # tokenize() already drops empty tokens, so this filter removes nothing further.
+
+
+def parse_query(query):  # Parse "a b|c, d" style input into a list of groups, each a list of options, each a list of tokens.
+    groups = []
+    for clause in query.split(","):  # Each comma-separated clause becomes one group.
+        clause = clause.strip()
+        if not clause:
+            continue
+
+        options = []
+        for option in clause.split("|"):  # "|" separates the alternatives within a clause.
+            normalized = normalize_term(option)
+            if normalized:
+                options.append(normalized)
+
+        if options:  # Clauses whose options all tokenize to nothing are dropped.
+            groups.append(options)
+    return groups
+
+
+def build_search_text(paper, field):  # Return the text of the requested field, or all four fields joined by spaces for "all".
+    values = {
+        "title": paper.title or "",
+        "abstract": paper.abstract or "",
+        "authors": paper.authors or "",
+        "conference": paper.conference or "",
+    }
+    if field == "all":
+        return " ".join(values.values())
+    return values[field]  # Raises KeyError for a field name other than the four keys above.
+
+
+def paper_matches(paper, query_groups, field):  # True when every group has at least one option whose tokens all occur in the field text.
+    if not query_groups:  # An empty query matches every paper.
+        return True
+
+    token_set = set(tokenize(build_search_text(paper, field)))
+    for group in query_groups:
+        if not any(all(token in token_set for token in option) for option in group):  # Set membership only: token order and adjacency are not checked.
+            return False
+    return True
+
+
+def paper_sort_key(paper):  # Sort key: under an ascending sort, newest year first, then known conferences by CONFERENCE_ORDER, then title.
+    conference_order = CONFERENCE_ORDER.get(paper.conference)
+    if conference_order is None:
+        return (-paper.year, 1, 0, (paper.title or "").lower())  # Second element 1 places unknown conferences after known ones within a year.
+    return (-paper.year, 0, -conference_order, (paper.title or "").lower())  # Negated so a higher CONFERENCE_ORDER value sorts earlier.
diff --git a/top4grep/utils.py b/top4grep/utils.py
index 3f81d40..889dd11 100644
--- a/top4grep/utils.py
+++ b/top4grep/utils.py
@@ -1,7 +1,9 @@
+import html
 import os
-import uuid
 import logging
+import re
 import tempfile
+import uuid
 from contextlib import contextmanager
 
 import colorlog
@@ -10,6 +12,17 @@
 logger_formatter = colorlog.ColoredFormatter(
     '[%(name)s][%(levelname)s]%(asctime)s %(log_color)s%(message)s',
     datefmt='%m-%d %H:%M')
+WHITESPACE_RE = re.compile(r"\s+")
+
+
+def normalize_text(value):  # Stringify value, unescape HTML entities, collapse whitespace runs and trim; None becomes "".
+    if value is None:
+        return ""
+    return WHITESPACE_RE.sub(" ", html.unescape(str(value))).strip()
+
+
+def normalize_title(value):  # Title normalization currently delegates unchanged to normalize_text.
+    return normalize_text(value)
 
 def new_logger(name, level='DEBUG', new=True):
     # add custom level "VERBOSE"