diff --git a/README.md b/README.md index 4aded2a..2ce6fae 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # top4grep -A grep tool for the top 4 security conferences +A grep tool for the top 4 security conferences, with cached raw fetches for repeatable literature review. ## Installation ``` @@ -15,20 +15,99 @@ If you want to update the papers stored in `papers.db`, you can recreate it with top4grep --build-db ``` -Which will build the db wherever you run it. +The database and all cached raw fetches live under `top4grep/data/` inside the package directory, not in the current working directory. + +Useful build filters: +```bash +top4grep --build-db --conference "IEEE S&P" --conference CCS --year-from 2021 --year-to 2026 +top4grep --build-db --abstract --year-from 2023 --year-to 2024 +``` + +### Cache Bundle Bootstrap +You can package the local database plus all cached raw fetches into one zip file: + +```bash +top4grep --export-cache-bundle /tmp/top4grep-cache-bundle.zip +``` + +Current published bundle cutoff: + +- snapshot date: `2026-03-12` +- download: `https://drive.google.com/file/d/1vAeDHNnDDKyXWPQTGLnlWnQchoRt9-_s/view?usp=sharing` +- expected usage: install the bundle first, then run incremental refreshes for anything added or corrected after `2026-03-12` + +The bundle contains: + +- `papers.db` +- `raw/dblp/...` +- `raw/publisher_html/...` +- `raw/openalex/...` +- `raw/semantic_scholar/...` +- `raw/pdf/...` + +To install a downloaded bundle into the correct local package data directory: + +```bash +top4grep --install-cache-bundle ~/Downloads/top4grep-cache-bundle.zip +``` + +If you want the downloaded snapshot to fully replace your current local data first: + +```bash +top4grep --install-cache-bundle ~/Downloads/top4grep-cache-bundle.zip --replace-data +``` + +After installing a bundle, users can do only incremental refreshes for newer years instead of cold-crawling the full history again. 
For example: + +```bash +top4grep --build-db --abstract --year-from 2024 +``` + +Notes: + +- `--abstract` updates existing rows in place when abstracts are missing and keeps papers even when their abstracts remain unavailable after all configured fallbacks +- rebuilds also remove stale rows that no longer appear in the latest source data +- the first abstract build for a new slice can take noticeably longer because publisher and OpenAlex data are fetched live +- rerunning the same abstract build is much faster because raw responses are cached under `top4grep/data/raw/` +- stored titles and authors are normalized automatically, so escaped HTML entities such as `&quot;` do not create duplicate rows +- abstract recovery currently tries publisher HTML, OpenAlex, Semantic Scholar, and PDF extraction from open conference paper links ### Query ```bash -top4grep -k +top4grep -k ``` -For example, `python top4grep.py -k linux,kernel` -Currently, the query is just a case-insensitive match (just like grep). The returned results must contains all the input keywords (papers containing keyword1 AND keyword2 AND ...). Support for `OR` operation (papers containing keyword1 OR keyword2) is missing, but will be added in the future. 
+For example: + +```bash +top4grep -k linux,kernel +top4grep -k "linux,kernel|driver" --field all --year-from 2021 +top4grep -k "supply,chain|dependency" --abstract --conference CCS +``` + +Query semantics: + +- `,` means `AND` +- `|` means `OR` +- matching is case-insensitive and stemmed, so `exploiting` matches `exploit` +- `--field` supports `title`, `abstract`, `authors`, `conference`, and `all` +- `--abstract` is kept for compatibility and defaults query field selection to `abstract` + +Raw source material is cached under: + +- `top4grep/data/raw/dblp/html/` +- `top4grep/data/raw/dblp/api/` +- `top4grep/data/raw/publisher_html/` +- `top4grep/data/raw/openalex/` +- `top4grep/data/raw/semantic_scholar/` +- `top4grep/data/raw/pdf/` + +The bundled cache installs into the same `top4grep/data/` location, so the normal build pipeline automatically reuses it. ## Screenshot ![screenshot](https://raw.githubusercontent.com/Kyle-Kyle/top4grep/master/img/screenshot.png) -## TODO -- [ ] grep in abstract -- [ ] fuzzy match -- [ ] complex search logic (`OR` operation) +## Status +- [x] grep in abstract +- [x] fuzzy match +- [x] complex search logic (`OR` operation) diff --git a/setup.cfg b/setup.cfg index ca2d8c1..e663f3d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,6 @@ install_requires = colorlog beautifulsoup4 nltk - selenium requests python_requires = >= 3.8 packages = find: diff --git a/tests/test_abstract.py b/tests/test_abstract.py new file mode 100644 index 0000000..738d0b5 --- /dev/null +++ b/tests/test_abstract.py @@ -0,0 +1,194 @@ +import unittest +from requests import RequestException +from unittest.mock import patch + +from bs4 import BeautifulSoup +from top4grep.abstract import ( + AbstractCCS, + extract_abstract_from_pdf_text, + extract_doi, + find_pdf_url_in_soup, + get_openalex_abstract_for_title, + get_openalex_abstract, + get_semantic_scholar_abstract_for_title, + normalize_abstract, + normalize_source_url, + normalize_title_key, + 
reconstruct_abstract, +) + + +class AbstractTests(unittest.TestCase): + def test_extract_doi_from_url(self): + doi = extract_doi("https://doi.org/10.1145/3576915.3616615") + self.assertEqual(doi, "10.1145/3576915.3616615") + + def test_reconstruct_openalex_abstract(self): + text = reconstruct_abstract( + { + "hello": [0], + "world": [1], + "again": [2], + } + ) + self.assertEqual(text, "hello world again") + + def test_normalize_abstract_collapses_whitespace(self): + text = normalize_abstract("First line\n\n Second line ") + self.assertEqual(text, "First line\nSecond line") + + def test_normalize_abstract_unescapes_html_entities(self): + text = normalize_abstract(""Quoted" text") + self.assertEqual(text, '"Quoted" text') + + def test_normalize_title_key_ignores_entities_and_punctuation(self): + key = normalize_title_key('"AttackGNN": Red-Teaming GNNs!') + self.assertEqual(key, "attackgnnredteaminggnns") + + def test_normalize_title_key_strips_html_markup_and_diacritics(self): + key = normalize_title_key("Avara: Voge\u0308le and \u03bcCFI") + self.assertEqual(key, "avaravogeleandcfi") + + def test_extract_abstract_from_pdf_text(self): + text = extract_abstract_from_pdf_text( + "Paper Title\nAuthors\nAbstract\nThis is the abstract.\n1 Introduction\nBody" + ) + self.assertEqual(text, "This is the abstract.") + + def test_find_pdf_url_in_soup_prefers_pdf_links(self): + soup = BeautifulSoup( + '', + "html.parser", + ) + + self.assertEqual( + find_pdf_url_in_soup("https://example.com/paper", soup), + "https://example.com/paper.pdf", + ) + + def test_normalize_source_url_rewrites_legacy_ndss_pdf_host(self): + self.assertEqual( + normalize_source_url( + "http://wp.internetsociety.org/ndss/wp-content/uploads/sites/25/2017/09/paper.pdf" + ), + "https://www.ndss-symposium.org/wp-content/uploads/2017/09/paper.pdf", + ) + + def test_ccs_falls_back_to_non_boilerplate_meta_description(self): + soup = BeautifulSoup( + '', + "html.parser", + ) + with 
patch("top4grep.abstract.resolve_doi", return_value=""), \ + patch("top4grep.abstract.load_publisher_soup", return_value=soup): + text = AbstractCCS().get_abstract_from_publisher("https://example.com/paper", []) + + self.assertEqual(text, "A real page summary.") + + @patch("top4grep.abstract.cached_get_json", side_effect=RequestException("dns failed")) + def test_openalex_request_errors_return_empty_abstract(self, _cached_get_json): + self.assertEqual(get_openalex_abstract("10.1145/3658644.3670278"), "") + + @patch("top4grep.abstract.cached_get_json") + def test_openalex_title_search_returns_exact_title_match(self, cached_get_json): + cached_get_json.return_value = { + "results": [ + { + "display_name": "AttackGNN: Red-Teaming GNNs in Hardware Security Using Reinforcement Learning", + "abstract_inverted_index": { + "hello": [0], + "world": [1], + }, + } + ] + } + + text = get_openalex_abstract_for_title( + "AttackGNN: Red-Teaming GNNs in Hardware Security Using Reinforcement Learning" + ) + + self.assertEqual(text, "hello world") + + @patch("top4grep.abstract.cached_get_json") + def test_semantic_scholar_title_search_returns_exact_title_match(self, cached_get_json): + cached_get_json.return_value = { + "data": [ + { + "title": "Avara: A Uniform Evaluation System for Perceptibility Analysis Against Adversarial Object Evasion Attacks", + "authors": [{"name": "Xinyao Ma"}], + "abstract": "Semantic Scholar abstract", + } + ] + } + + text = get_semantic_scholar_abstract_for_title( + "Avara: A Uniform Evaluation System for Perceptibility Analysis Against Adversarial Object Evasion Attacks.", + ["Xinyao Ma"], + ) + + self.assertEqual(text, "Semantic Scholar abstract") + + @patch("top4grep.abstract.load_publisher_soup") + @patch("top4grep.abstract.get_openalex_abstract") + def test_ccs_doi_urls_use_openalex_before_publisher_html(self, get_openalex_abstract, load_publisher_soup): + get_openalex_abstract.return_value = "abstract from openalex" + + text = 
AbstractCCS().get_abstract_from_publisher("https://doi.org/10.1145/3658644.3670278", []) + + self.assertEqual(text, "abstract from openalex") + load_publisher_soup.assert_not_called() + get_openalex_abstract.assert_called_once_with("10.1145/3658644.3670278") + + @patch("top4grep.abstract.load_publisher_soup") + @patch("top4grep.abstract.get_openalex_abstract") + def test_ccs_falls_back_to_publisher_html_when_openalex_is_empty(self, get_openalex_abstract, load_publisher_soup): + get_openalex_abstract.return_value = "" + load_publisher_soup.return_value.find.return_value.get_text.return_value = "publisher abstract" + + text = AbstractCCS().get_abstract_from_publisher("https://doi.org/10.1145/3658644.3670278", []) + + self.assertEqual(text, "publisher abstract") + load_publisher_soup.assert_called_once() + + @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="") + @patch("top4grep.abstract.get_semantic_scholar_abstract", return_value="semantic scholar abstract") + @patch("top4grep.abstract.resolve_doi", side_effect=AssertionError("resolve_doi should not be retried")) + @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="") + def test_direct_doi_urls_use_semantic_scholar_before_title_search(self, _title_fallback, _resolve_doi, _semantic_doi_fallback, _get_abstract_from_publisher): + text = AbstractCCS().get_abstract_from_url( + "https://doi.org/10.1145/3658644.3670278", + "example title", + [], + ) + + self.assertEqual(text, "semantic scholar abstract") + + @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="") + @patch("top4grep.abstract.resolve_doi", side_effect=RuntimeError("publisher fetch failed")) + @patch("top4grep.abstract.get_semantic_scholar_abstract_for_title", return_value="semantic scholar title fallback") + @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="title fallback abstract") + def test_publisher_resolution_failures_fall_back_to_title_search(self, _title_fallback, 
_semantic_title_fallback, _resolve_doi, _get_abstract_from_publisher): + text = AbstractCCS().get_abstract_from_url( + "https://example.com/paper", + "example title", + [], + ) + + self.assertEqual(text, "title fallback abstract") + + @patch.object(AbstractCCS, "get_abstract_from_publisher", return_value="") + @patch("top4grep.abstract.resolve_doi", side_effect=RuntimeError("publisher fetch failed")) + @patch("top4grep.abstract.get_semantic_scholar_abstract_for_title", return_value="semantic scholar title fallback") + @patch("top4grep.abstract.get_openalex_abstract_for_title", return_value="") + def test_title_search_falls_back_to_semantic_scholar_when_openalex_is_empty(self, _title_fallback, _semantic_title_fallback, _resolve_doi, _get_abstract_from_publisher): + text = AbstractCCS().get_abstract_from_url( + "https://example.com/paper", + "example title", + [], + ) + + self.assertEqual(text, "semantic scholar title fallback") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_build_db.py b/tests/test_build_db.py new file mode 100644 index 0000000..d22a769 --- /dev/null +++ b/tests/test_build_db.py @@ -0,0 +1,217 @@ +import unittest +from unittest.mock import patch +from types import SimpleNamespace + +import sqlalchemy +from sqlalchemy.orm import sessionmaker +from requests import RequestException + +import top4grep.build_db as build_db +from top4grep.db import Base, Paper + + +class BuildDbTests(unittest.TestCase): + def test_normalize_toc_api_records_unescapes_titles_and_authors(self): + hits = [ + { + "info": { + "title": ""Do Anything Now": Testing & Evaluation.", + "authors": { + "author": [ + {"text": "Alice & Bob"}, + {"text": "Carol"}, + ] + }, + "ee": "https://doi.org/10.1145/3658644.3670278", + } + } + ] + + records = build_db.normalize_toc_api_records(hits) + + self.assertEqual(records[0]["title"], '"Do Anything Now": Testing & Evaluation.') + self.assertEqual(records[0]["authors"], ["Alice & Bob", "Carol"]) + + def 
test_normalize_toc_api_records_skips_front_matter_entries(self): + hits = [ + {"info": {"title": "Conference Organizers.", "authors": {"author": []}, "ee": ""}}, + {"info": {"title": "Poster: A Short Result.", "authors": {"author": [{"text": "Alice"}]}, "ee": ""}}, + {"info": {"title": "Real Paper.", "authors": {"author": [{"text": "Bob"}]}, "ee": ""}}, + ] + + records = build_db.normalize_toc_api_records(hits) + + self.assertEqual([record["title"] for record in records], ["Real Paper."]) + + def test_should_skip_record_detects_slides_and_keynotes(self): + self.assertTrue(build_db.should_skip_record("Keynote Address: Back to the Future.", "")) + self.assertTrue(build_db.should_skip_record("Interesting Topic.", "https://www.usenix.org/slides/talk.pdf")) + self.assertFalse(build_db.should_skip_record("Interesting Topic.", "https://example.com/paper.pdf")) + + def test_migrate_database_normalizes_and_deduplicates_titles(self): + engine = sqlalchemy.create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + test_session = sessionmaker(bind=engine) + + with test_session.begin() as session: + session.add_all( + [ + Paper( + conference="CCS", + year=2024, + title='"Do Anything Now": Characterizing Jailbreak Prompts.', + authors="Alice", + abstract="", + ), + Paper( + conference="CCS", + year=2024, + title='"Do Anything Now": Characterizing Jailbreak Prompts.', + authors="Alice", + abstract="Useful abstract", + ), + ] + ) + + with patch.object(build_db, "Session", test_session): + normalized_rows, duplicate_rows = build_db.migrate_database() + + self.assertGreaterEqual(normalized_rows, 1) + self.assertEqual(duplicate_rows, 1) + + with test_session() as session: + rows = session.query(Paper).all() + + self.assertEqual(len(rows), 1) + self.assertEqual(rows[0].title, '"Do Anything Now": Characterizing Jailbreak Prompts.') + self.assertEqual(rows[0].abstract, "Useful abstract") + + def test_prune_stale_papers_removes_rows_missing_from_latest_source(self): + 
engine = sqlalchemy.create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + test_session = sessionmaker(bind=engine) + + with test_session.begin() as session: + session.add_all( + [ + Paper(conference="CCS", year=2024, title="Current Title", authors="Alice", abstract=""), + Paper(conference="CCS", year=2024, title="Stale Title", authors="Bob", abstract=""), + ] + ) + + with patch.object(build_db, "Session", test_session): + removed = build_db.prune_stale_papers("CCS", 2024, {"Current Title"}) + + self.assertEqual(removed, 1) + with test_session() as session: + rows = session.query(Paper).order_by(Paper.title).all() + + self.assertEqual([row.title for row in rows], ["Current Title"]) + + @patch.object(build_db, "TOC_API_HOSTS", ["https://one.example", "https://two.example"]) + @patch("top4grep.build_db.cached_get_json") + def test_fetch_toc_api_records_retries_other_hosts_for_current_year(self, cached_get_json): + cached_get_json.side_effect = [ + RequestException("first host timeout"), + {"result": {"hits": {"hit": [{"info": {"title": "Paper"}}]}}}, + ] + + hits = build_db.fetch_toc_api_records("ccs", 2026) + + self.assertEqual(len(hits), 1) + self.assertEqual(cached_get_json.call_count, 2) + + @patch.object(build_db, "TOC_API_HOSTS", ["https://one.example", "https://two.example"]) + @patch("top4grep.build_db.cached_get_json", side_effect=RequestException("network down")) + def test_fetch_toc_api_records_raises_when_all_hosts_fail(self, _cached_get_json): + with self.assertRaises(RequestException): + build_db.fetch_toc_api_records("ccs", 2026) + + def test_get_papers_continues_when_one_record_extraction_fails(self): + engine = sqlalchemy.create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + test_session = sessionmaker(bind=engine) + + with test_session.begin() as session: + session.add_all( + [ + Paper(conference="USENIX", year=2024, title="Broken Paper", authors="Alice", abstract=""), + Paper(conference="USENIX", year=2024, 
title="Recoverable Paper", authors="Bob", abstract=""), + ] + ) + + records = [ + {"title": "Broken Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="broken")}, + {"title": "Recoverable Paper", "authors": ["Bob"], "paper_html": SimpleNamespace(marker="ok")}, + ] + + def fake_get_abstract(paper_html, title, authors): + if paper_html.marker == "broken": + raise RuntimeError("boom") + return "Recovered abstract" + + with patch.object(build_db, "Session", test_session), \ + patch.object(build_db, "load_records", return_value=records), \ + patch.object(build_db, "prune_stale_papers", return_value=0), \ + patch.dict(build_db.Abstracts, {"USENIX": SimpleNamespace(get_abstract=fake_get_abstract)}): + processed, changed = build_db.get_papers("USENIX", 2024, True) + + self.assertEqual(processed, 2) + self.assertEqual(changed, 1) + with test_session() as session: + recoverable = session.query(Paper).filter(Paper.title == "Recoverable Paper").one() + broken = session.query(Paper).filter(Paper.title == "Broken Paper").one() + + self.assertEqual(recoverable.abstract, "Recovered abstract") + self.assertEqual(broken.abstract, "") + + def test_ndss_legacy_years_still_attempt_abstract_fallbacks(self): + engine = sqlalchemy.create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + test_session = sessionmaker(bind=engine) + + records = [ + {"title": "Legacy NDSS Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="legacy")}, + ] + + with patch.object(build_db, "Session", test_session), \ + patch.object(build_db, "load_records", return_value=records), \ + patch.object(build_db, "prune_stale_papers", return_value=0), \ + patch.dict(build_db.Abstracts, {"NDSS": SimpleNamespace(get_abstract=lambda *_args: "Recovered abstract")}): + processed, changed = build_db.get_papers("NDSS", 2016, True) + + self.assertEqual(processed, 1) + self.assertEqual(changed, 1) + with test_session() as session: + row = session.query(Paper).filter(Paper.title == 
"Legacy NDSS Paper").one() + + self.assertEqual(row.abstract, "Recovered abstract") + + def test_abstract_build_keeps_rows_without_recoverable_abstracts(self): + engine = sqlalchemy.create_engine("sqlite:///:memory:") + Base.metadata.create_all(engine) + test_session = sessionmaker(bind=engine) + + with test_session.begin() as session: + session.add(Paper(conference="CCS", year=2024, title="Unrecoverable Paper", authors="Alice", abstract="")) + + records = [ + {"title": "Unrecoverable Paper", "authors": ["Alice"], "paper_html": SimpleNamespace(marker="missing")}, + ] + + with patch.object(build_db, "Session", test_session), \ + patch.object(build_db, "load_records", return_value=records), \ + patch.dict(build_db.Abstracts, {"CCS": SimpleNamespace(get_abstract=lambda *_args: "")}): + processed, changed = build_db.get_papers("CCS", 2024, True) + + self.assertEqual(processed, 1) + self.assertEqual(changed, 0) + with test_session() as session: + row = session.query(Paper).one() + + self.assertEqual(row.title, "Unrecoverable Paper") + self.assertEqual(row.abstract, "") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..7d13222 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,84 @@ +import tempfile +import unittest +import zipfile +from pathlib import Path +from unittest.mock import Mock, patch + +from top4grep.bundle import create_cache_bundle, install_cache_bundle +from top4grep.cache import cached_get_json, cached_get_text + + +class CacheTests(unittest.TestCase): + def test_cached_get_json_negative_caches_404(self): + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "missing.json" + response = Mock(status_code=404) + + with patch("top4grep.cache.HTTP.get", return_value=response) as mock_get: + self.assertIsNone(cached_get_json("https://example.com/missing.json", cache_path)) + self.assertIsNone(cached_get_json("https://example.com/missing.json", 
cache_path)) + + mock_get.assert_called_once() + + def test_cached_get_text_negative_caches_404(self): + with tempfile.TemporaryDirectory() as tmpdir: + cache_path = Path(tmpdir) / "missing.html" + response = Mock(status_code=404) + + with patch("top4grep.cache.HTTP.get", return_value=response) as mock_get: + self.assertIsNone(cached_get_text("https://example.com/missing.html", cache_path)) + self.assertIsNone(cached_get_text("https://example.com/missing.html", cache_path)) + + mock_get.assert_called_once() + + def test_create_cache_bundle_includes_db_and_raw_cache(self): + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / "data" + raw_dir = data_dir / "raw" / "openalex" + raw_dir.mkdir(parents=True) + (data_dir / "papers.db").write_text("db", encoding="utf-8") + (raw_dir / "paper.json").write_text("{}", encoding="utf-8") + bundle_path = Path(tmpdir) / "bundle.zip" + + result = create_cache_bundle(bundle_path, data_dir=data_dir) + + self.assertEqual(result["file_count"], 2) + with zipfile.ZipFile(bundle_path, "r") as bundle: + names = set(bundle.namelist()) + + self.assertIn("manifest.json", names) + self.assertIn("papers.db", names) + self.assertIn("raw/openalex/paper.json", names) + + def test_install_cache_bundle_restores_files(self): + with tempfile.TemporaryDirectory() as tmpdir: + source_dir = Path(tmpdir) / "source" + source_raw = source_dir / "raw" / "publisher_html" + source_raw.mkdir(parents=True) + (source_dir / "papers.db").write_text("db", encoding="utf-8") + (source_raw / "page.html").write_text("", encoding="utf-8") + bundle_path = Path(tmpdir) / "bundle.zip" + create_cache_bundle(bundle_path, data_dir=source_dir) + + install_dir = Path(tmpdir) / "install" + result = install_cache_bundle(bundle_path, data_dir=install_dir) + + self.assertEqual(result["extracted_files"], 2) + self.assertEqual((install_dir / "papers.db").read_text(encoding="utf-8"), "db") + self.assertEqual( + (install_dir / "raw" / "publisher_html" / 
"page.html").read_text(encoding="utf-8"), + "", + ) + + def test_install_cache_bundle_rejects_path_traversal(self): + with tempfile.TemporaryDirectory() as tmpdir: + bundle_path = Path(tmpdir) / "bundle.zip" + with zipfile.ZipFile(bundle_path, "w") as bundle: + bundle.writestr("../escape.txt", "bad") + + with self.assertRaises(ValueError): + install_cache_bundle(bundle_path, data_dir=Path(tmpdir) / "install") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..292a3f0 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,56 @@ +import unittest +from types import SimpleNamespace + +from top4grep.search import paper_matches, paper_sort_key, parse_query + + +def make_paper(**kwargs): + defaults = { + "title": "Exploiting Linux Kernel Drivers", + "abstract": "This paper studies exploit chains in kernel subsystems.", + "authors": "Alice Example, Bob Example", + "conference": "CCS", + "year": 2024, + } + defaults.update(kwargs) + return SimpleNamespace(**defaults) + + +class SearchTests(unittest.TestCase): + def test_and_query_matches_title(self): + paper = make_paper() + self.assertTrue(paper_matches(paper, parse_query("linux,exploit"), "title")) + + def test_or_query_matches_title(self): + paper = make_paper() + self.assertTrue(paper_matches(paper, parse_query("kernel|driver"), "title")) + + def test_stemming_matches_variants(self): + paper = make_paper(title="Exploitability Signals for Linux Drivers") + self.assertTrue(paper_matches(paper, parse_query("exploiting"), "title")) + + def test_abstract_field_can_match_without_title_hit(self): + paper = make_paper(title="Artifact-Centric Analysis", abstract="Dependency risk in package supply chains.") + self.assertTrue(paper_matches(paper, parse_query("dependency,supply"), "abstract")) + + def test_all_field_can_match_across_non_title_content(self): + paper = make_paper(title="Artifact-Centric Analysis", abstract="Dependency risk in 
package supply chains.") + self.assertTrue(paper_matches(paper, parse_query("dependency,supply"), "all")) + + def test_missing_term_rejects_match(self): + paper = make_paper() + self.assertFalse(paper_matches(paper, parse_query("windows"), "title")) + + def test_sort_order_matches_original_cli_behavior(self): + papers = [ + make_paper(conference="NDSS", title="ndss"), + make_paper(conference="IEEE S&P", title="sp"), + make_paper(conference="USENIX", title="usenix"), + make_paper(conference="CCS", title="ccs"), + ] + ordered = sorted(papers, key=paper_sort_key) + self.assertEqual([paper.conference for paper in ordered], ["CCS", "USENIX", "IEEE S&P", "NDSS"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/top4grep/__main__.py b/top4grep/__main__.py index 9a97ceb..6a6f82f 100644 --- a/top4grep/__main__.py +++ b/top4grep/__main__.py @@ -1,66 +1,40 @@ +import argparse +from datetime import datetime +from pathlib import Path import sqlalchemy from sqlalchemy.orm import sessionmaker -from nltk import download, word_tokenize -from nltk.data import find -from nltk.stem import PorterStemmer +from .bundle import create_cache_bundle, install_cache_bundle from .db import Base, Paper -from .build_db import build_db, DB_PATH +from .build_db import DB_PATH, build_db, migrate_database +from .search import SEARCH_FIELDS, paper_matches, paper_sort_key, parse_query from .utils import new_logger -import argparse - - engine = sqlalchemy.create_engine(f'sqlite:///{str(DB_PATH)}') Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) logger = new_logger("Top4Grep") -stemmer = PorterStemmer() CONFERENCES = ["NDSS", "IEEE S&P", "USENIX", "CCS"] -# Function to check and download 'punkt' if not already available -def check_and_download_punkt(): - try: - # Check if 'punkt' is available, this will raise a LookupError if not found - find('tokenizers/punkt') - #print("'punkt' tokenizer models are already installed.") - except LookupError: - print("'punkt' tokenizer 
models not found. Downloading...") - # Download 'punkt' tokenizer models - download('punkt') - -# trim word tokens from tokenizer to stem i.e. exploiting to exploit -def fuzzy_match(title): - tokens = word_tokenize(title) - return [stemmer.stem(token) for token in tokens] - -def existed_in_tokens(tokens, keywords): - return all(map(lambda k: stemmer.stem(k.lower()) in tokens, keywords)) - -def grep(keywords, abstract): - # TODO: currently we only grep either from title or from abstract, also grep from other fields in the future maybe? - if abstract: - constraints = [Paper.abstract.contains(x) for x in keywords] - with Session() as session: - papers = session.query(Paper).filter(*constraints).all() - filter_paper = filter(lambda p: existed_in_tokens(fuzzy_match(p.abstract.lower()), keywords), papers) - else: - constraints = [Paper.title.contains(x) for x in keywords] - with Session() as session: - papers = session.query(Paper).filter(*constraints).all() - #check whether whether nltk tokenizer data is downloaded - check_and_download_punkt() - #tokenize the title and filter out the substring matches - filter_paper = [] - for paper in papers: - if all([stemmer.stem(x.lower()) in fuzzy_match(paper.title.lower()) for x in keywords]): - filter_paper.append(paper) - # perform customized sorthing - papers = sorted(filter_paper, key=lambda paper: paper.year + CONFERENCES.index(paper.conference)/10, reverse=True) - return papers + +def grep(query, field, conferences=None, year_from=2000, year_to=None): + query_groups = parse_query(query) + + with Session() as session: + paper_query = session.query(Paper) + if conferences: + paper_query = paper_query.filter(Paper.conference.in_(conferences)) + if year_from is not None: + paper_query = paper_query.filter(Paper.year >= year_from) + if year_to is not None: + paper_query = paper_query.filter(Paper.year <= year_to) + papers = paper_query.all() + + filtered = [paper for paper in papers if paper_matches(paper, query_groups, field)] + 
return sorted(filtered, key=paper_sort_key) def show_papers(papers): @@ -71,26 +45,84 @@ def show_papers(papers): def main(): parser = argparse.ArgumentParser(description='Scripts to query the paper database', usage="%(prog)s [options] -k ") - parser.add_argument('-k', type=str, help="keywords to grep, separated by ','. For example, 'linux,kernel,exploit'", default='') + parser.add_argument('-k', type=str, help="keywords to grep. Use ',' for AND and '|' for OR, for example 'linux,kernel|driver'", default='') parser.add_argument('--build-db', action="store_true", help="Builds the database of conference papers") - parser.add_argument('--abstract', action="store_true", help="Involve abstract into the database's building or query (Need Chrome for building)") + parser.add_argument('--abstract', action="store_true", help="During builds, fetch abstracts. During queries, search abstracts unless --field is provided.") + parser.add_argument('--field', choices=SEARCH_FIELDS, help="Field to search for queries. Default is 'title' unless --abstract is set.") + parser.add_argument('--conference', action='append', choices=CONFERENCES, help="Restrict builds or queries to one or more conferences. 
May be repeated.") + parser.add_argument('--year-from', type=int, default=2000, help="Inclusive lower year bound for builds or queries.") + parser.add_argument('--year-to', type=int, default=datetime.now().year, help="Inclusive upper year bound for builds or queries.") + parser.add_argument('--export-cache-bundle', type=Path, help="Create a single zip bundle containing papers.db and all cached raw fetches.") + parser.add_argument('--install-cache-bundle', type=Path, help="Install a previously exported cache bundle into the package data directory.") + parser.add_argument('--replace-data', action='store_true', help="When installing a cache bundle, replace existing papers.db and raw cache first.") args = parser.parse_args() + if args.year_from > args.year_to: + parser.error("--year-from must be less than or equal to --year-to") + if args.replace_data and not args.install_cache_bundle: + parser.error("--replace-data can only be used with --install-cache-bundle") + + special_actions = [bool(args.export_cache_bundle), bool(args.install_cache_bundle)] + if sum(special_actions) > 1: + parser.error("--export-cache-bundle and --install-cache-bundle are mutually exclusive") + if any(special_actions) and (args.k or args.build_db): + parser.error("cache bundle operations cannot be combined with queries or --build-db") + + if args.export_cache_bundle: + result = create_cache_bundle(args.export_cache_bundle) + print( + f"Created cache bundle at {result['output_path']} " + f"({result['archive_bytes']} bytes, {result['file_count']} files)." + ) + return + if args.install_cache_bundle: + result = install_cache_bundle(args.install_cache_bundle, replace=args.replace_data) + normalized_rows, duplicate_rows = migrate_database() + print( + f"Installed cache bundle from {result['bundle_path']} into {result['data_dir']} " + f"({result['extracted_files']} files)." 
+ ) + if normalized_rows or duplicate_rows: + print(f"Normalized {normalized_rows} rows and removed {duplicate_rows} duplicates after install.") + return + if args.k: - assert DB_PATH.exists(), f"need to build a paper database first to perform wanted queries" - keywords = [x.strip() for x in args.k.split(',')] - if keywords: - logger.info("Grep based on the following keywords: %s", ', '.join(keywords)) + if not DB_PATH.exists(): + parser.error("need to build a paper database first to perform queries") + normalized_rows, duplicate_rows = migrate_database() + if normalized_rows or duplicate_rows: + logger.info( + "Normalized %s records and removed %s duplicates.", + normalized_rows, + duplicate_rows, + ) + field = args.field or ('abstract' if args.abstract else 'title') + query_groups = parse_query(args.k) + if query_groups: + logger.info("Query: %s", args.k) + logger.info("Field: %s", field) else: logger.warning("No keyword is provided. Return all the papers.") - papers = grep(keywords, args.abstract) + papers = grep( + args.k, + field=field, + conferences=args.conference, + year_from=args.year_from, + year_to=args.year_to, + ) logger.debug(f"Found {len(papers)} papers") show_papers(papers) elif args.build_db: print("Building db...") - build_db(args.abstract) + processed, added = build_db( + args.abstract, + conferences=args.conference, + start_year=args.year_from, + end_year=args.year_to, + ) + print(f"Build complete. 
DOI_PATTERN = re.compile(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+")


def extract_doi(value):
    """Return the first DOI embedded in *value*, or ``""`` when there is none.

    *value* is percent-decoded before matching, so URL-encoded links such as
    ``https%3A%2F%2Fdoi.org%2F10.1145%2F...`` are handled.

    The raw regex match is greedy and may swallow punctuation that follows
    the DOI in running text or in a URL path.  Trailing ``. , ; : /`` are
    always removed.  A trailing ``)`` is removed only while the match has
    more closing than opening parentheses, so a DOI that legitimately ends
    with a balanced ``(...)`` group is kept intact.
    """
    if not value:
        return ""
    match = DOI_PATTERN.search(unquote(value))
    if not match:
        return ""

    doi = match.group(0)
    while doi:
        last = doi[-1]
        if last in ".,;:/":
            doi = doi[:-1]
        elif last == ")" and doi.count(")") > doi.count("("):
            # Unbalanced closer: it belongs to the surrounding text.
            doi = doi[:-1]
        else:
            break

    # If trimming consumed the whole suffix, what is left is only the
    # registrant prefix ("10.1234"), which is not a usable DOI.
    return doi if "/" in doi else ""
def normalize_title_key(title):
    """Build a punctuation-, case- and accent-insensitive comparison key.

    Steps: decode HTML entities, drop markup tags, decompose with NFKD and
    remove combining marks (so ``é`` compares equal to ``e``), casefold, and
    keep only alphanumeric characters.

    Any Unicode letter or digit is kept, not just ``a-z0-9``.  Filtering to
    ASCII made every title or author name written in a non-Latin script
    collapse to ``""``, so unrelated values compared equal.  For pure-ASCII
    input the result is the same as with the ASCII-only filter.

    Returns ``""`` for empty or ``None`` input.
    """
    if not title:
        return ""
    text = re.sub(r"<[^>]+>", " ", unescape(title))
    text = unicodedata.normalize("NFKD", text)
    folded = "".join(ch for ch in text if not unicodedata.combining(ch)).casefold()
    return "".join(ch for ch in folded if ch.isalnum())
def resolve_doi_from_html(url):
    """Find a DOI on the publisher page at *url*; return ``""`` if none is found.

    Candidates are examined in priority order: ``citation_doi`` meta tags,
    Dublin Core identifier meta tags, the ``og:url`` meta tag, and finally
    every anchor ``href`` on the page.  The first candidate from which
    ``extract_doi`` recovers a DOI wins.

    Meta names are matched case-insensitively and *all* matching tags are
    inspected: pages commonly spell the Dublin Core name ``DC.Identifier``
    and emit several identifier tags, of which only one carries the DOI.
    """
    soup = load_publisher_soup(url)
    if soup is None:
        return ""

    meta_filters = (
        {"name": re.compile(r"^citation_doi$", re.I)},
        {"name": re.compile(r"^dc\.identifier$", re.I)},
        {"property": "og:url"},
    )

    candidates = []
    for attrs in meta_filters:
        for meta in soup.find_all("meta", attrs=attrs):
            content = meta.get("content")
            if content:
                candidates.append(content)

    # Anchors come last: they are the least specific signal on the page.
    candidates.extend(anchor["href"] for anchor in soup.find_all("a", href=True))

    for candidate in candidates:
        doi = extract_doi(candidate)
        if doi:
            return doi
    return ""
def extract_abstract_from_pdf_text(text):
    """Pull the abstract out of plain text extracted from a paper's first pages.

    Collection starts at the first line that *begins* with an "Abstract"
    heading and stops at the first section heading that follows it.  Text on
    the heading line itself (``Abstract—We present ...``) is kept.

    Requiring the heading at the start of the line avoids starting on an
    earlier line that merely contains the word "abstract".  Letter-spaced
    headings such as ``A BSTRACT`` are accepted.  End headings are recognised
    both on their own line and with inline content (``Index Terms—...``,
    ``Keywords: ...``).

    Returns the normalized abstract, or ``""`` when no heading is found.
    """
    normalized = normalize_pdf_text(text)
    if not normalized:
        return ""

    # Compiled per call; ``re`` caches compiled patterns internally.
    start_heading = re.compile(
        r"^[^A-Za-z0-9]*a\s*b\s*s\s*t\s*r\s*a\s*c\s*t\b[\s:.\-–—]*",
        re.IGNORECASE,
    )
    inline_end_heading = re.compile(
        r"^(?:keywords|index\s+terms|ccs\s+concepts"
        r"|categories\s+and\s+subject\s+descriptors)\s*[:\-–—]",
        re.IGNORECASE,
    )
    end_headings = {
        "1introduction",
        "iintroduction",
        "introduction",
        "keywords",
        "indexterms",
        "ccsconcepts",
        "categoriesandsubjectdescriptors",
    }
    # A section number printed on its own line just before "Introduction".
    section_numbers = {"1", "i"}

    collecting = False
    collected = []
    for raw_line in normalized.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if not collecting:
            match = start_heading.match(line)
            if match is None:
                continue
            collecting = True
            content = line[match.end():].strip()
            if content:
                collected.append(content)
            continue

        heading_key = re.sub(r"[^a-z0-9]+", "", line.casefold())
        if heading_key in end_headings:
            break
        # Headings are capitalised; the check keeps a wrapped sentence that
        # happens to start with e.g. "keywords-" from ending the abstract.
        if line[0].isupper() and inline_end_heading.match(line):
            break
        if heading_key in section_numbers:
            continue
        collected.append(line)

    if collected:
        return normalize_abstract(" ".join(collected))
    return ""
def extract_abstract_from_pdf_url(url, max_pages=2):
    """Download the PDF at *url* and extract its abstract with ``pdftotext``.

    Args:
        url: Location of the PDF; rewritten by ``normalize_source_url`` first.
        max_pages: Number of leading pages handed to ``pdftotext``.

    Returns:
        The abstract text, or ``""`` when the URL is empty, ``pdftotext`` is
        not installed, the download yields nothing, the conversion fails or
        times out, or no abstract heading is found.

    ``pdftotext`` is asked for UTF-8 output and that output is decoded as
    UTF-8 with replacement.  Decoding with the locale's codec could raise
    ``UnicodeDecodeError``, which is not caught here.
    """
    if not url:
        return ""
    url = normalize_source_url(url)

    pdftotext = shutil.which("pdftotext")
    if not pdftotext:
        return ""

    pdf_bytes = fetch_pdf_bytes(url)
    if not pdf_bytes:
        return ""

    try:
        # The directory and the PDF copy are removed when the block exits.
        with tempfile.TemporaryDirectory() as workdir:
            pdf_path = os.path.join(workdir, "paper.pdf")
            with open(pdf_path, "wb") as handle:
                handle.write(pdf_bytes)

            result = subprocess.run(
                [
                    pdftotext,
                    "-enc", "UTF-8",
                    "-f", "1",
                    "-l", str(max_pages),
                    pdf_path,
                    "-",
                ],
                check=False,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=30,
            )
    except (OSError, subprocess.SubprocessError):
        return ""

    if result.returncode != 0:
        return ""
    return extract_abstract_from_pdf_text(result.stdout)
def get_semantic_scholar_abstract_for_title(title, authors=None):
    """Look up an abstract through Semantic Scholar's title-match search.

    A result is accepted only when its normalized title equals the normalized
    *title* and ``semantic_scholar_authors_match`` accepts it for *authors*.

    Returns the abstract, or ``""`` when *title* is empty, the request fails,
    the payload has an unexpected shape, or no accepted result carries an
    abstract.
    """
    if not title:
        return ""

    url = requests.Request(
        "GET",
        "https://api.semanticscholar.org/graph/v1/paper/search/match",
        params={"query": title, "fields": "title,abstract,authors,externalIds,year"},
    ).prepare().url
    try:
        data = cached_get_json(url, semantic_scholar_title_search_cache_path(title))
    except (requests.RequestException, ValueError) as e:
        logger.debug(f"Failed to search Semantic Scholar by title for {title}: {e}")
        return ""
    if not isinstance(data, dict):
        return ""

    # ``"data": null`` must behave like an empty result list, not raise.
    results = data.get("data") or []
    if isinstance(results, dict):
        results = [results]

    target_key = normalize_title_key(title)
    for result in results:
        if not isinstance(result, dict):
            continue
        if normalize_title_key(result.get("title")) != target_key:
            continue
        if not semantic_scholar_authors_match(result, authors):
            continue
        abstract = extract_semantic_scholar_abstract(result)
        if abstract:
            return abstract
    return ""
def get_abstract_from_url(self, publisher_url, title, authors):
    """Find an abstract for one paper, trying progressively weaker sources.

    Order of attempts:
      1. PDF text extraction, when *publisher_url* itself points at a PDF.
      2. The conference-specific ``get_abstract_from_publisher`` hook.
      3. PDF text extraction from a PDF linked on the publisher page.
      4. DOI lookups: OpenAlex, then Semantic Scholar.
      5. Title searches: OpenAlex, then Semantic Scholar.

    Without a usable URL only step 5 runs.  A DOI embedded in the URL and a
    DOI resolved from the page go through the same step 4, so the OpenAlex
    lookup is tried in both cases.

    Returns the abstract text, or ``""`` when every source comes up empty.
    """
    def search_by_title():
        return (
            get_openalex_abstract_for_title(title, authors)
            or get_semantic_scholar_abstract_for_title(title, authors)
        )

    publisher_url = normalize_source_url(publisher_url)
    if not publisher_url:
        return search_by_title()

    if is_pdf_url(publisher_url):
        abstract = extract_abstract_from_pdf_url(publisher_url)
        if abstract:
            return abstract

    # Publisher scraping is best-effort: any failure moves on to the next source.
    try:
        abstract = self.get_abstract_from_publisher(publisher_url, authors)
    except Exception:
        logger.debug(f"Failed to extract abstract from publisher URL {publisher_url}. Paper: {title}")
        abstract = ""
    if abstract:
        return normalize_abstract(abstract)

    try:
        pdf_url = find_pdf_url_in_soup(publisher_url, load_publisher_soup(publisher_url))
    except Exception:
        logger.debug(f"Failed to locate PDF from publisher URL {publisher_url}. Paper: {title}")
        pdf_url = ""
    if pdf_url:
        abstract = extract_abstract_from_pdf_url(pdf_url)
        if abstract:
            return abstract

    # Prefer a DOI embedded in the URL; only parse the page when there is none.
    doi = extract_doi(publisher_url)
    if not doi:
        try:
            doi = resolve_doi(publisher_url)
        except Exception:
            logger.debug(f"Failed to resolve DOI from publisher URL {publisher_url}. Paper: {title}")
            doi = ""
    if doi:
        abstract = get_openalex_abstract(doi) or get_semantic_scholar_abstract(doi)
        if abstract:
            return abstract

    return search_by_title()
def get_abstract_from_publisher(self, url, _):
    """Return the abstract for an IEEE S&P paper, or ``""`` if none is found.

    OpenAlex is queried first when a DOI can be resolved from *url*.  If that
    yields nothing, the publisher page's description metadata is used.

    An empty OpenAlex result falls through to the page metadata instead of
    being returned, matching ``AbstractCCS``.  The metadata goes through
    ``extract_meta_description`` so it gets the same filtering and
    normalization as the other conferences.  The second parameter (the
    author list) is unused.
    """
    doi = resolve_doi(url)
    if doi:
        abstract = get_openalex_abstract(doi)
        if abstract:
            return abstract

    # ``extract_meta_description`` returns "" when the page could not be loaded.
    return extract_meta_description(load_publisher_soup(url))
logger.debug(f'URL: {url}') - r = requests.get(url) - assert r.status_code == 200 + doi = resolve_doi(url) + if doi: + abstract = get_openalex_abstract(doi) + if abstract: + return abstract - html = BeautifulSoup(r.text, 'html.parser') - abstract_paragraphs = html.find('div', {'class': 'abstractInFull'}) - return abstract_paragraphs.get_text(separator='\n') - # ap_list = [x.text for x in abstract_paragraphs] - # return '\n'.join(ap_list) + html = load_publisher_soup(url) + if html is not None: + abstract_paragraphs = html.find('div', {'class': 'abstractInFull'}) + if abstract_paragraphs is not None: + return abstract_paragraphs.get_text(separator='\n', strip=True) + meta_description = extract_meta_description(html) + if meta_description: + return meta_description + return "" NDSS = AbstractNDSS() SP = AbstractSP() @@ -165,9 +592,5 @@ def get_abstract_from_publisher(self, url, authors): if __name__ == '__main__': logger.setLevel('DEBUG') - # SP.get_abstract_from_publisher('https://doi.ieeecomputersociety.org/10.1109/SP46215.2023.00131', []) - # SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179411', []) - # print(SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179381', [])) - # print(USENIX.get_abstract_from_publisher('https://www.usenix.org/conference/usenixsecurity20/presentation/cremers', [])) - # print(CCS.get_abstract_from_publisher('https://doi.org/10.1145/3576915.3616615', [])) + print(SP.get_abstract_from_publisher('https://doi.org/10.1109/SP46215.2023.10179381', [])) print(NDSS.get_abstract_from_publisher('https://www.ndss-symposium.org/ndss2015/i-do-not-know-what-you-visited-last-summer-protecting-users-third-party-web-tracking', [])) diff --git a/top4grep/build_db.py b/top4grep/build_db.py index 37ff8eb..25aada9 100644 --- a/top4grep/build_db.py +++ b/top4grep/build_db.py @@ -1,14 +1,17 @@ from datetime import datetime +from html import unescape from pathlib import Path +from urllib.parse import quote import 
requests import sqlalchemy -from sqlalchemy.orm import sessionmaker from bs4 import BeautifulSoup +from sqlalchemy.orm import sessionmaker +from .cache import DATA_DIR, cache_key_path, cached_get_json, cached_get_text from .utils import new_logger from .db import Base, Paper -from .abstract import Abstracts +from .abstract import Abstracts, normalize_abstract logger = new_logger("DB") logger.setLevel('WARNING') @@ -20,60 +23,365 @@ "USENIX": "uss", "CCS": "ccs", } +FRONT_MATTER_TITLES = { + "Conference Organizers.", + "External Reviewers.", + "Message from the Program Chairs.", + "Program Committee.", +} PACKAGE_DIR = Path(__file__).resolve().parent DB_PATH = PACKAGE_DIR / "data" / "papers.db" +DBLP_BASE_URL = "https://dblp.uni-trier.de" +TOC_API_HOSTS = [ + "https://dblp.org", + "https://dblp.dagstuhl.de", +] +TOC_FIRST_YEARS = { + "ndss": {2011, 2012, 2013, 2014}, +} +RAW_DBLP_API_DIR = DATA_DIR / "raw" / "dblp" / "api" +RAW_DBLP_HTML_DIR = DATA_DIR / "raw" / "dblp" / "html" +DB_PATH.parent.mkdir(parents=True, exist_ok=True) engine = sqlalchemy.create_engine(f'sqlite:///{str(DB_PATH)}') Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def save_paper(conf, year, title, authors, abstract): - logger.debug(f'Adding paper {title} with abstract {abstract[:20]}...') - session = Session() - paper = Paper(conference=conf, year=year, title=title, authors=", ".join(authors), abstract=abstract) - session.add(paper) - session.commit() - session.close() -def paper_exist(conf, year, title, authors, abstract): - session = Session() - paper = session.query(Paper).filter(Paper.conference==conf, Paper.year==year, Paper.title==title, Paper.abstract==abstract).first() - session.close() - return paper is not None +def normalize_metadata_text(value): + if not value: + return "" + return " ".join(unescape(value).split()) -def get_papers(name, year, build_abstract): - cnt = 0 - conf = NAME_MAP[name] - if build_abstract and name == "NDSS" and (year == 2018 or year == 
def should_skip_record(title, publisher_url=""):
    """Tell whether a proceedings entry is not a regular paper.

    An entry is skipped when its title is a known front-matter title, starts
    with a poster/demo/keynote prefix, is marked as an invited talk, or when
    its URL points into a ``/slides/`` path.

    All comparisons are case-insensitive, including the front-matter check,
    so a differently capitalised front-matter title is filtered as well.

    Args:
        title: Entry title; ``None`` is treated as empty.
        publisher_url: Link for the entry; ``None`` is treated as empty.
    """
    lowered_title = (title or "").casefold()
    lowered_url = (publisher_url or "").casefold()

    front_matter = {entry.casefold() for entry in FRONT_MATTER_TITLES}
    if lowered_title in front_matter:
        return True
    if lowered_title.startswith(("poster:", "demo:", "keynote:", "keynote address:")):
        return True
    if "(invited talk)" in lowered_title:
        return True
    return "/slides/" in lowered_url
def update_papers(paper_updates):
    """Apply author/abstract updates to stored papers in one transaction.

    Args:
        paper_updates: Iterable of ``(paper_id, authors, abstract)`` tuples.
            A non-empty ``authors`` replaces the stored value.  A non-empty
            ``abstract`` is written only when the row has none yet.

    Returns:
        The number of rows actually modified.  Entries without an id,
        entries whose row no longer exists, and entries that change nothing
        are not counted.
    """
    if not paper_updates:
        return 0

    changed = 0
    with Session.begin() as session:
        for paper_id, authors, abstract in paper_updates:
            if paper_id is None:
                continue
            paper = session.get(Paper, paper_id)
            if paper is None:
                continue

            touched = False
            if authors and paper.authors != authors:
                paper.authors = authors
                touched = True
            if abstract and not paper.abstract:
                paper.abstract = abstract
                touched = True
            if touched:
                changed += 1
    return changed
def normalize_api_authors(authors):
    """Turn the ``authors`` object of a DBLP API hit into a clean name list.

    The ``author`` member may be a list of entries or a single entry.  Each
    entry is either a dict whose ``text`` member holds the name, or a bare
    value.

    A single bare string is wrapped in a list so it is not iterated one
    character at a time, and a ``text`` of ``None`` is treated as empty.

    Returns the names after ``normalize_author_list``; ``[]`` for empty input.
    """
    if not authors:
        return []

    author_entries = authors.get("author") or []
    if isinstance(author_entries, (dict, str)):
        author_entries = [author_entries]

    names = []
    for author in author_entries:
        if isinstance(author, dict):
            name = str(author.get("text") or "").strip()
        else:
            name = str(author).strip()
        if name:
            names.append(name)
    return normalize_author_list(names)
def get_papers(name, year, build_abstract):
    """Refresh the stored papers for one conference edition.

    Loads the current records, optionally fetches abstracts, inserts unknown
    papers, updates stored papers that gained authors or an abstract, and
    prunes stored rows that are no longer listed.

    When the same title appears twice in one fetch, the later record updates
    the not-yet-saved ``Paper`` object directly.  Such a paper has no id, so
    an update queued for it could never be applied.

    Args:
        name: Conference display name (key of ``NAME_MAP``).
        year: Edition year.
        build_abstract: Whether to fetch abstracts.

    Returns:
        ``(processed, changed)``: records seen, and rows added + updated +
        removed.  If loading or parsing fails, the result is
        ``(number of known papers, 0)`` and nothing is written.
    """
    conf = NAME_MAP[name]
    processed = 0
    existing_papers = load_existing_papers(name, year)

    try:
        records = load_records(conf, year)
        new_papers = []
        pending_by_title = {}  # title -> unsaved Paper created during this run
        paper_updates = []

        for record in records:
            title = normalize_metadata_text(record["title"])
            authors = normalize_author_list(record["authors"])

            abstract = ''
            if build_abstract:
                # Abstract recovery is best-effort; the paper is kept regardless.
                try:
                    if "paper_html" in record:
                        abstract = Abstracts[name].get_abstract(record["paper_html"], title, authors) or ""
                    else:
                        abstract = Abstracts[name].get_abstract_from_url(record.get("publisher_url"), title, authors) or ""
                except Exception as e:
                    logger.debug(f"Failed to extract abstract for {name}-{year}: {title}: {e}")
                    abstract = ''
            abstract = normalize_abstract(unescape(abstract))

            authors_text = ", ".join(authors)
            processed += 1

            existing = existing_papers.get(title)
            if existing is None:
                paper = Paper(
                    conference=name,
                    year=year,
                    title=title,
                    authors=authors_text,
                    abstract=abstract,
                )
                existing_papers[title] = {
                    "id": None,
                    "authors": authors_text,
                    "abstract": abstract,
                }
                pending_by_title[title] = paper
                new_papers.append(paper)
                continue

            authors_changed = bool(authors_text) and authors_text != existing["authors"]
            abstract_gained = bool(abstract) and not existing["abstract"]
            if not (authors_changed or abstract_gained):
                continue

            pending = pending_by_title.get(title)
            if pending is not None:
                # Duplicate within this fetch: amend the unsaved object.
                if authors_text:
                    pending.authors = authors_text
                if abstract_gained:
                    pending.abstract = abstract
            else:
                paper_updates.append((existing["id"], authors_text, abstract))
            existing["authors"] = authors_text
            if abstract:
                existing["abstract"] = abstract
    except requests.RequestException as e:
        if existing_papers:
            logger.debug(f"Keeping existing records for {name}-{year} after refresh failure: {e}")
            return len(existing_papers), 0
        logger.warning(f"Failed to obtain papers at {name}-{year}: {e}")
        return 0, 0
    except Exception as e:
        if existing_papers:
            logger.debug(f"Keeping existing records for {name}-{year} after parse failure: {e}")
            return len(existing_papers), 0
        logger.warning(f"Failed to parse papers at {name}-{year}: {e}")
        return 0, 0

    removed = prune_stale_papers(name, year, {record["title"] for record in records})
    added = save_papers(new_papers)
    updated = update_papers(paper_updates)
    logger.debug(f"Found {processed} papers at {name}-{year}, added {added}, updated {updated}, removed {removed}...")
    return processed, added + updated + removed
build_db(build_abstract): - for conf in CONFERENCES: - for year in range(2000, datetime.now().year+1): - get_papers(conf, year, build_abstract) + selected_conferences = conferences or CONFERENCES + processed = 0 + added = 0 + for conf in selected_conferences: + for year in range(start_year, end_year + 1): + current_processed, current_added = get_papers(conf, year, build_abstract) + processed += current_processed + added += current_added + return processed, added diff --git a/top4grep/bundle.py b/top4grep/bundle.py new file mode 100644 index 0000000..fda9a1d --- /dev/null +++ b/top4grep/bundle.py @@ -0,0 +1,109 @@ +import json +import shutil +import zipfile +from datetime import datetime, timezone +from pathlib import Path + +from .cache import DATA_DIR, RAW_DIR + +BUNDLE_FORMAT_VERSION = 1 +MANIFEST_NAME = "manifest.json" + + +def default_db_path(data_dir=DATA_DIR): + return Path(data_dir) / "papers.db" + + +def iter_bundle_files(data_dir=DATA_DIR): + data_dir = Path(data_dir) + db_path = default_db_path(data_dir) + + if db_path.exists(): + yield db_path, Path("papers.db") + + raw_dir = data_dir / "raw" + if raw_dir.exists(): + for path in sorted(raw_dir.rglob("*")): + if path.is_file(): + yield path, Path("raw") / path.relative_to(raw_dir) + + +def build_bundle_manifest(entries): + files = [] + total_bytes = 0 + for source_path, archive_path in entries: + size = source_path.stat().st_size + files.append({"path": archive_path.as_posix(), "size": size}) + total_bytes += size + + return { + "bundle_format_version": BUNDLE_FORMAT_VERSION, + "created_at_utc": datetime.now(timezone.utc).isoformat(), + "file_count": len(files), + "total_bytes": total_bytes, + "files": files, + } + + +def create_cache_bundle(output_path, data_dir=DATA_DIR): + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + entries = list(iter_bundle_files(data_dir)) + manifest = build_bundle_manifest(entries) + + with zipfile.ZipFile(output_path, "w", 
compression=zipfile.ZIP_DEFLATED, compresslevel=6) as bundle: + bundle.writestr(MANIFEST_NAME, json.dumps(manifest, indent=2, sort_keys=True)) + for source_path, archive_path in entries: + bundle.write(source_path, archive_path.as_posix()) + + return { + "output_path": output_path, + "file_count": manifest["file_count"], + "total_bytes": manifest["total_bytes"], + "archive_bytes": output_path.stat().st_size, + } + + +def resolve_extract_path(data_dir, archive_name): + data_dir = Path(data_dir).resolve() + target = (data_dir / archive_name).resolve() + if target != data_dir and data_dir not in target.parents: + raise ValueError(f"Refusing to extract outside data directory: {archive_name}") + return target + + +def install_cache_bundle(bundle_path, data_dir=DATA_DIR, replace=False): + bundle_path = Path(bundle_path) + data_dir = Path(data_dir) + data_dir.mkdir(parents=True, exist_ok=True) + + if replace: + raw_dir = data_dir / "raw" + if raw_dir.exists(): + shutil.rmtree(raw_dir) + db_path = default_db_path(data_dir) + if db_path.exists(): + db_path.unlink() + + extracted_files = 0 + manifest = None + with zipfile.ZipFile(bundle_path, "r") as bundle: + if MANIFEST_NAME in bundle.namelist(): + manifest = json.loads(bundle.read(MANIFEST_NAME).decode("utf-8")) + + for info in bundle.infolist(): + if info.is_dir() or info.filename == MANIFEST_NAME: + continue + target = resolve_extract_path(data_dir, info.filename) + target.parent.mkdir(parents=True, exist_ok=True) + with bundle.open(info, "r") as src, target.open("wb") as dst: + shutil.copyfileobj(src, dst) + extracted_files += 1 + + return { + "bundle_path": bundle_path, + "data_dir": data_dir, + "extracted_files": extracted_files, + "manifest": manifest, + } diff --git a/top4grep/cache.py b/top4grep/cache.py new file mode 100644 index 0000000..2f4d009 --- /dev/null +++ b/top4grep/cache.py @@ -0,0 +1,153 @@ +import hashlib +import json +import re +from pathlib import Path + +import requests +from requests.adapters 
import HTTPAdapter +from urllib3.util.retry import Retry + +PACKAGE_DIR = Path(__file__).resolve().parent +DATA_DIR = PACKAGE_DIR / "data" +RAW_DIR = DATA_DIR / "raw" +USER_AGENT = "top4grep/0.0.0 (+https://github.com/Kyle-Kyle/top4grep)" +HTTP_TIMEOUT_SECONDS = 15 +TEXT_MISSING_SENTINEL = "__top4grep_missing__" +JSON_MISSING_SENTINEL = {"__top4grep_missing__": True} +MISSING = object() + +DATA_DIR.mkdir(parents=True, exist_ok=True) +RAW_DIR.mkdir(parents=True, exist_ok=True) + + +def build_http_session(): + retry_policy = Retry( + total=5, + connect=3, + read=3, + status=3, + backoff_factor=0.5, + status_forcelist=(429, 500, 502, 503, 504), + allowed_methods=frozenset(["GET"]), + respect_retry_after_header=True, + ) + adapter = HTTPAdapter(max_retries=retry_policy) + session = requests.Session() + session.headers.update( + { + "User-Agent": USER_AGENT, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,application/json;q=0.9,*/*;q=0.8", + } + ) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + + +HTTP = build_http_session() + + +def slugify(value, max_length=80): + slug = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-").lower() + if not slug: + slug = "item" + return slug[:max_length] + + +def cache_key_path(root, key, suffix, hint=None): + root.mkdir(parents=True, exist_ok=True) + digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:12] + stem = slugify(hint or key) + return root / f"{stem}-{digest}{suffix}" + + +def read_text(path): + if not path.exists(): + return None + text = path.read_text(encoding="utf-8") + if text == TEXT_MISSING_SENTINEL: + return MISSING + return text + + +def write_text(path, value): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(value, encoding="utf-8") + + +def read_bytes(path): + if not path.exists(): + return None + missing_path = path.with_suffix(f"{path.suffix}.missing") + if missing_path.exists(): + return MISSING + return path.read_bytes() + + 
+def write_bytes(path, value): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(value) + + +def read_json(path): + if not path.exists(): + return None + data = json.loads(path.read_text(encoding="utf-8")) + if data == JSON_MISSING_SENTINEL: + return MISSING + return data + + +def write_json(path, value): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(value, indent=2, sort_keys=True), encoding="utf-8") + + +def cached_get_text(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS): + cached = read_text(cache_path) + if cached is MISSING: + return None + if cached is not None: + return cached + + response = HTTP.get(url, timeout=timeout) + if response.status_code == 404: + write_text(cache_path, TEXT_MISSING_SENTINEL) + return None + response.raise_for_status() + write_text(cache_path, response.text) + return response.text + + +def cached_get_json(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS): + cached = read_json(cache_path) + if cached is MISSING: + return None + if cached is not None: + return cached + + response = HTTP.get(url, timeout=timeout) + if response.status_code == 404: + write_json(cache_path, JSON_MISSING_SENTINEL) + return None + response.raise_for_status() + data = response.json() + write_json(cache_path, data) + return data + + +def cached_get_bytes(url, cache_path, timeout=HTTP_TIMEOUT_SECONDS): + cached = read_bytes(cache_path) + if cached is MISSING: + return None + if cached is not None: + return cached + + response = HTTP.get(url, timeout=timeout) + if response.status_code == 404: + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_path.with_suffix(f"{cache_path.suffix}.missing").write_text("", encoding="utf-8") + return None + response.raise_for_status() + write_bytes(cache_path, response.content) + return response.content diff --git a/top4grep/db.py b/top4grep/db.py index eb45eb1..e4a0e9e 100644 --- a/top4grep/db.py +++ b/top4grep/db.py @@ -1,5 +1,5 @@ -from sqlalchemy import Column, 
String, Integer -from sqlalchemy.ext.declarative import declarative_base, declared_attr +from sqlalchemy import Column, Integer, String +from sqlalchemy.orm import declarative_base, declared_attr class BaseTable: @declared_attr diff --git a/top4grep/search.py b/top4grep/search.py new file mode 100644 index 0000000..271a302 --- /dev/null +++ b/top4grep/search.py @@ -0,0 +1,83 @@ +import re + +from nltk.stem import PorterStemmer + +TOKEN_RE = re.compile(r"[A-Za-z0-9_+#.-]+") +STEMMER = PorterStemmer() +SEARCH_FIELDS = ("title", "abstract", "authors", "conference", "all") +CONFERENCE_ORDER = { + "CCS": 3, + "USENIX": 2, + "IEEE S&P": 1, + "NDSS": 0, +} + + +def normalize_token(token): + token = token.strip().lower() + if not token: + return "" + return STEMMER.stem(token) + + +def tokenize(text): + if not text: + return [] + tokens = [] + for raw in TOKEN_RE.findall(text.lower()): + normalized = normalize_token(raw) + if normalized: + tokens.append(normalized) + return tokens + + +def normalize_term(term): + return [token for token in tokenize(term) if token] + + +def parse_query(query): + groups = [] + for clause in query.split(","): + clause = clause.strip() + if not clause: + continue + + options = [] + for option in clause.split("|"): + normalized = normalize_term(option) + if normalized: + options.append(normalized) + + if options: + groups.append(options) + return groups + + +def build_search_text(paper, field): + values = { + "title": paper.title or "", + "abstract": paper.abstract or "", + "authors": paper.authors or "", + "conference": paper.conference or "", + } + if field == "all": + return " ".join(values.values()) + return values[field] + + +def paper_matches(paper, query_groups, field): + if not query_groups: + return True + + token_set = set(tokenize(build_search_text(paper, field))) + for group in query_groups: + if not any(all(token in token_set for token in option) for option in group): + return False + return True + + +def paper_sort_key(paper): + 
conference_order = CONFERENCE_ORDER.get(paper.conference) + if conference_order is None: + return (-paper.year, 1, 0, (paper.title or "").lower()) + return (-paper.year, 0, -conference_order, (paper.title or "").lower()) diff --git a/top4grep/utils.py b/top4grep/utils.py index 3f81d40..889dd11 100644 --- a/top4grep/utils.py +++ b/top4grep/utils.py @@ -1,7 +1,9 @@ +import html import os -import uuid import logging +import re import tempfile +import uuid from contextlib import contextmanager import colorlog @@ -10,6 +12,17 @@ logger_formatter = colorlog.ColoredFormatter( '[%(name)s][%(levelname)s]%(asctime)s %(log_color)s%(message)s', datefmt='%m-%d %H:%M') +WHITESPACE_RE = re.compile(r"\s+") + + +def normalize_text(value): + if value is None: + return "" + return WHITESPACE_RE.sub(" ", html.unescape(str(value))).strip() + + +def normalize_title(value): + return normalize_text(value) def new_logger(name, level='DEBUG', new=True): # add custom level "VERBOSE"