From ade7c1870b9fb3d6ee0e836710d7f38da94b96e8 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan Date: Sun, 31 May 2026 07:01:14 +0530 Subject: [PATCH 1/5] feat: implement structured extraction checkpoints B1 and B2 Signed-off-by: Abhijeet Saharan --- .../external_project_parsers/parsers/cheatsheet_extractor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 384afe932..41b013bad 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,11 +1,9 @@ import os import re - from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" FALLBACK_USED = "false" - CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) @@ -53,7 +51,6 @@ def _extract_summary(markdown: str) -> str: if body: return body - break for match in all_heading_matches: From a9e54a3d41e54c3a97d205ecfdd7c2358be1fdd1 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:00:57 +0530 Subject: [PATCH 2/5] docs: add docstrings Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 0e1dc4267..162c4a068 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -78,4 +78,4 @@ def __post_init__(self): raise ValueError( "CheatsheetRecord: metadata keys and values must be strings, " f"got {key!r}: {value!r}" - ) + ) \ No newline at end of file diff --git 
a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 41b013bad..78ef319ea 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,9 +1,11 @@ import os import re + from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" FALLBACK_USED = "false" + CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE) @@ -51,6 +53,7 @@ def _extract_summary(markdown: str) -> str: if body: return body + break for match in all_heading_matches: @@ -59,7 +62,9 @@ def _extract_summary(markdown: str) -> str: if body: return body - raise ValueError("_extract_summary: no summary could be extracted from markdown.") + raise ValueError( + "_extract_summary: no summary could be extracted from markdown." 
+ ) def extract_cheatsheet_record( @@ -90,4 +95,4 @@ def extract_cheatsheet_record( "parser_version": PARSER_VERSION, "fallback_used": FALLBACK_USED, }, - ) + ) \ No newline at end of file From 26d7e92773574916ae8b734dc8b255e0c4e94a2a Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:43:30 +0530 Subject: [PATCH 3/5] fix: validate normalized string field values correctly Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 162c4a068..0e1dc4267 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -78,4 +78,4 @@ def __post_init__(self): raise ValueError( "CheatsheetRecord: metadata keys and values must be strings, " f"got {key!r}: {value!r}" - ) \ No newline at end of file + ) diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 78ef319ea..384afe932 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -62,9 +62,7 @@ def _extract_summary(markdown: str) -> str: if body: return body - raise ValueError( - "_extract_summary: no summary could be extracted from markdown." 
- ) + raise ValueError("_extract_summary: no summary could be extracted from markdown.") def extract_cheatsheet_record( @@ -95,4 +93,4 @@ def extract_cheatsheet_record( "parser_version": PARSER_VERSION, "fallback_used": FALLBACK_USED, }, - ) \ No newline at end of file + ) From dc9d2d0f39fff76136a7f1285eb04d78f50dec21 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Fri, 5 Jun 2026 14:43:30 +0530 Subject: [PATCH 4/5] fix: validate normalized string field values correctly Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 0e1dc4267..91a9a3b1e 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -43,7 +43,7 @@ def __post_init__(self): } # Validate fields which require string values. - for field_name in required_str_fields: + for field_name in required_str_fields: value = getattr(self, field_name) if not isinstance(value, str) or not value: From a76be6138161d2ba7c8ebc46b3a489372e3e0992 Mon Sep 17 00:00:00 2001 From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> Date: Mon, 8 Jun 2026 13:31:35 +0530 Subject: [PATCH 5/5] feat: implement structured extraction checkpoint B3 Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com> --- application/defs/cheatsheet_defs.py | 2 +- .../parsers/cheatsheet_extractor.py | 62 ++++++++++++++----- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py index 91a9a3b1e..0e1dc4267 100644 --- a/application/defs/cheatsheet_defs.py +++ b/application/defs/cheatsheet_defs.py @@ -43,7 +43,7 @@ def __post_init__(self): } # Validate fields which require string values. 
- for field_name in required_str_fields: + for field_name in required_str_fields: value = getattr(self, field_name) if not isinstance(value, str) or not value: diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py index 384afe932..6b360fecc 100644 --- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py +++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py @@ -1,10 +1,10 @@ +import logging import os import re from application.defs.cheatsheet_defs import CheatsheetRecord PARSER_VERSION = "v1" -FALLBACK_USED = "false" CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/" @@ -41,28 +41,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str: def _extract_summary(markdown: str) -> str: - """Extract a summary section from cheatsheet markdown.""" + """Extract summary from Introduction section in cheatsheet markdown.""" - all_heading_matches = list(_ANY_HEADING_RE.finditer(markdown)) - - for match in all_heading_matches: - heading_text = match.group().lstrip("#").strip() - - if heading_text.lower() == "introduction": + for match in _ANY_HEADING_RE.finditer(markdown): + if match.group().lstrip("#").strip().lower() == "introduction": body = _extract_body_after_heading(markdown, match) - if body: return body - break + raise ValueError( + "_extract_summary: no suitable summary section could be extracted from markdown." 
+ ) + - for match in all_heading_matches: - body = _extract_body_after_heading(markdown, match) +def _extract_title(markdown: str) -> str: + """Extract H1 title from cheatsheet markdown.""" + + match = _TITLE_RE.search(markdown) + if not match: + raise ValueError("_extract_title: no title found in markdown.") + + return match.group("title").strip() + +def _fallback_title() -> str: + """Return fallback title for malformed markdown.""" + + return "No title found." + + +def _fallback_summary(markdown: str) -> str: + """Return first non-empty paragraph after any heading, or 'No summary found.'""" + + for match in _ANY_HEADING_RE.finditer(markdown): + body = _extract_body_after_heading(markdown, match) if body: return body - raise ValueError("_extract_summary: no summary could be extracted from markdown.") + return "No summary found." def extract_cheatsheet_record( @@ -71,12 +87,24 @@ def extract_cheatsheet_record( ) -> CheatsheetRecord: """Extract a structured CheatsheetRecord from markdown content.""" - title_match = _TITLE_RE.search(markdown) - title = title_match.group("title").strip() + fallback_used = "false" + + try: + title = _extract_title(markdown) + except ValueError as e: + logging.warning(str(e)) + title = _fallback_title() + fallback_used = "true" + # Headings can be empty. headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)] - summary = _extract_summary(markdown) + try: + summary = _extract_summary(markdown) + except ValueError as e: + logging.warning(str(e)) + summary = _fallback_summary(markdown) + fallback_used = "true" source_id = _derive_source_id(source_path) hyperlink = _derive_hyperlink(source_path) @@ -91,6 +119,6 @@ def extract_cheatsheet_record( category_hints=[], metadata={ "parser_version": PARSER_VERSION, - "fallback_used": FALLBACK_USED, + "fallback_used": fallback_used, }, )