From ade7c1870b9fb3d6ee0e836710d7f38da94b96e8 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Sun, 31 May 2026 07:01:14 +0530
Subject: [PATCH 1/5] style: remove blank lines in cheatsheet_extractor.py

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 .../external_project_parsers/parsers/cheatsheet_extractor.py   | 3 ---
 1 file changed, 3 deletions(-)
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 384afe932..41b013bad 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,11 +1,9 @@
 import os
 import re
-
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
 FALLBACK_USED = "false"
-
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
 _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
@@ -53,7 +51,6 @@ def _extract_summary(markdown: str) -> str:
 
             if body:
                 return body
-
             break
 
     for match in all_heading_matches:

From a9e54a3d41e54c3a97d205ecfdd7c2358be1fdd1 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:00:57 +0530
Subject: [PATCH 2/5] style: restore blank lines, wrap long raise, drop EOF newlines

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py                      | 2 +-
 .../parsers/cheatsheet_extractor.py                      | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 0e1dc4267..162c4a068 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -78,4 +78,4 @@ def __post_init__(self):
                 raise ValueError(
                     "CheatsheetRecord: metadata keys and values must be strings, "
                     f"got {key!r}: {value!r}"
-                )
+                )
\ No newline at end of file
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 41b013bad..78ef319ea 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,9 +1,11 @@
 import os
 import re
+
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
 FALLBACK_USED = "false"
+
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
 _TITLE_RE = re.compile(r"^#\s+(?P<title>.+)$", re.MULTILINE)
@@ -51,6 +53,7 @@ def _extract_summary(markdown: str) -> str:
 
             if body:
                 return body
+
             break
 
     for match in all_heading_matches:
@@ -59,7 +62,9 @@ def _extract_summary(markdown: str) -> str:
         if body:
             return body
 
-    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
+    raise ValueError(
+        "_extract_summary: no summary could be extracted from markdown."
+    )
 
 
 def extract_cheatsheet_record(
@@ -90,4 +95,4 @@ def extract_cheatsheet_record(
             "parser_version": PARSER_VERSION,
             "fallback_used": FALLBACK_USED,
         },
-    )
+    )
\ No newline at end of file

From 26d7e92773574916ae8b734dc8b255e0c4e94a2a Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:43:30 +0530
Subject: [PATCH 3/5] style: unwrap ValueError raise and restore EOF newlines

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py                         | 2 +-
 .../parsers/cheatsheet_extractor.py                         | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 162c4a068..0e1dc4267 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -78,4 +78,4 @@ def __post_init__(self):
                 raise ValueError(
                     "CheatsheetRecord: metadata keys and values must be strings, "
                     f"got {key!r}: {value!r}"
-                )
\ No newline at end of file
+                )
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 78ef319ea..384afe932 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -62,9 +62,7 @@ def _extract_summary(markdown: str) -> str:
         if body:
             return body
 
-    raise ValueError(
-        "_extract_summary: no summary could be extracted from markdown."
-    )
+    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
 
 
 def extract_cheatsheet_record(
@@ -95,4 +93,4 @@ def extract_cheatsheet_record(
             "parser_version": PARSER_VERSION,
             "fallback_used": FALLBACK_USED,
         },
-    )
\ No newline at end of file
+    )

From dc9d2d0f39fff76136a7f1285eb04d78f50dec21 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Fri, 5 Jun 2026 14:43:30 +0530
Subject: [PATCH 4/5] style: whitespace-only change in CheatsheetRecord validation loop

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 0e1dc4267..91a9a3b1e 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -43,7 +43,7 @@ def __post_init__(self):
         }
 
         # Validate fields which require string values.
-        for field_name in required_str_fields:
+        for field_name in required_str_fields:  
             value = getattr(self, field_name)
 
             if not isinstance(value, str) or not value:

From a76be6138161d2ba7c8ebc46b3a489372e3e0992 Mon Sep 17 00:00:00 2001
From: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
Date: Mon, 8 Jun 2026 13:31:35 +0530
Subject: [PATCH 5/5] feat: implement structured extraction checkpoint B3

Signed-off-by: Abhijeet Saharan <abhijeetsaharan2236@gmail.com>
---
 application/defs/cheatsheet_defs.py           |  2 +-
 .../parsers/cheatsheet_extractor.py           | 62 ++++++++++++++-----
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/application/defs/cheatsheet_defs.py b/application/defs/cheatsheet_defs.py
index 91a9a3b1e..0e1dc4267 100644
--- a/application/defs/cheatsheet_defs.py
+++ b/application/defs/cheatsheet_defs.py
@@ -43,7 +43,7 @@ def __post_init__(self):
         }
 
         # Validate fields which require string values.
-        for field_name in required_str_fields:  
+        for field_name in required_str_fields:
             value = getattr(self, field_name)
 
             if not isinstance(value, str) or not value:
diff --git a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
index 384afe932..6b360fecc 100644
--- a/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
+++ b/application/utils/external_project_parsers/parsers/cheatsheet_extractor.py
@@ -1,10 +1,10 @@
+import logging
 import os
 import re
 
 from application.defs.cheatsheet_defs import CheatsheetRecord
 
 PARSER_VERSION = "v1"
-FALLBACK_USED = "false"
 
 CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"
 
@@ -41,28 +41,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:
 
 
 def _extract_summary(markdown: str) -> str:
-    """Extract a summary section from cheatsheet markdown."""
+    """Extract summary from Introduction section in cheatsheet markdown."""
 
-    all_heading_matches = list(_ANY_HEADING_RE.finditer(markdown))
-
-    for match in all_heading_matches:
-        heading_text = match.group().lstrip("#").strip()
-
-        if heading_text.lower() == "introduction":
+    for match in _ANY_HEADING_RE.finditer(markdown):
+        if match.group().lstrip("#").strip().lower() == "introduction":
             body = _extract_body_after_heading(markdown, match)
-
             if body:
                 return body
 
-            break
+    raise ValueError(
+        "_extract_summary: no suitable summary section could be extracted from markdown."
+    )
+
 
-    for match in all_heading_matches:
-        body = _extract_body_after_heading(markdown, match)
+def _extract_title(markdown: str) -> str:
+    """Extract H1 title from cheatsheet markdown."""
+
+    match = _TITLE_RE.search(markdown)
+    if not match:
+        raise ValueError("_extract_title: no title found in markdown.")
+
+    return match.group("title").strip()
 
+
+def _fallback_title() -> str:
+    """Return fallback title for malformed markdown."""
+
+    return "No title found."
+
+
+def _fallback_summary(markdown: str) -> str:
+    """Return first non-empty paragraph after any heading, or 'No summary found.'"""
+
+    for match in _ANY_HEADING_RE.finditer(markdown):
+        body = _extract_body_after_heading(markdown, match)
         if body:
             return body
 
-    raise ValueError("_extract_summary: no summary could be extracted from markdown.")
+    return "No summary found."
 
 
 def extract_cheatsheet_record(
@@ -71,12 +87,24 @@ def extract_cheatsheet_record(
 ) -> CheatsheetRecord:
     """Extract a structured CheatsheetRecord from markdown content."""
 
-    title_match = _TITLE_RE.search(markdown)
-    title = title_match.group("title").strip()
+    fallback_used = "false"
+
+    try:
+        title = _extract_title(markdown)
+    except ValueError as e:
+        logging.warning(str(e))
+        title = _fallback_title()
+        fallback_used = "true"
 
+    # The headings list may be empty when no heading matches.
     headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)]
 
-    summary = _extract_summary(markdown)
+    try:
+        summary = _extract_summary(markdown)
+    except ValueError as e:
+        logging.warning(str(e))
+        summary = _fallback_summary(markdown)
+        fallback_used = "true"
 
     source_id = _derive_source_id(source_path)
     hyperlink = _derive_hyperlink(source_path)
@@ -91,6 +119,6 @@ def extract_cheatsheet_record(
         category_hints=[],
         metadata={
             "parser_version": PARSER_VERSION,
-            "fallback_used": FALLBACK_USED,
+            "fallback_used": fallback_used,
         },
     )