Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import os
import re

from application.defs.cheatsheet_defs import CheatsheetRecord

# Version tag stored under the "parser_version" key of each record's metadata.
PARSER_VERSION = "v1"
# String flag "false" for the "fallback_used" metadata key.
# NOTE(review): extract_cheatsheet_record also tracks fallback use in a local
# variable — confirm whether this constant is still referenced.
FALLBACK_USED = "false"

# NOTE(review): presumably the URL prefix used when deriving cheatsheet
# hyperlinks — confirm against _derive_hyperlink.
CANONICAL_BASE_URL = "https://cheatsheetseries.owasp.org/cheatsheets/"

Expand Down Expand Up @@ -41,28 +41,44 @@ def _extract_body_after_heading(markdown: str, heading_match: re.Match) -> str:


def _extract_summary(markdown: str) -> str:
    """Extract the summary from the Introduction section of cheatsheet markdown.

    Every heading matched by ``_ANY_HEADING_RE`` is inspected. A heading counts
    as the Introduction when its text, with leading ``#`` characters and
    surrounding whitespace removed, equals "introduction" case-insensitively.

    Args:
        markdown: Raw markdown content of a cheatsheet.

    Returns:
        The first non-empty body returned by ``_extract_body_after_heading``
        for an Introduction heading.

    Raises:
        ValueError: If no Introduction heading yields a non-empty body.
            ``extract_cheatsheet_record`` catches this and switches to
            ``_fallback_summary``.
    """
    for match in _ANY_HEADING_RE.finditer(markdown):
        heading_text = match.group().lstrip("#").strip()
        if heading_text.lower() != "introduction":
            continue

        body = _extract_body_after_heading(markdown, match)
        if body:
            return body
        # NOTE(review): an Introduction heading with an empty body does not
        # stop the scan; any later Introduction heading is still considered.

    raise ValueError(
        "_extract_summary: no suitable summary section could be extracted from markdown."
    )


def _extract_title(markdown: str) -> str:
    """Extract the H1 title from cheatsheet markdown.

    Args:
        markdown: Raw markdown content of a cheatsheet.

    Returns:
        The ``title`` group of the first ``_TITLE_RE`` match, stripped of
        surrounding whitespace.

    Raises:
        ValueError: If ``_TITLE_RE`` does not match anywhere in ``markdown``.
    """
    match = _TITLE_RE.search(markdown)
    if match is None:
        raise ValueError("_extract_title: no title found in markdown.")

    return match.group("title").strip()


def _fallback_title() -> str:
"""Return fallback title for malformed markdown."""

return "No title found."


def _fallback_summary(markdown: str) -> str:
    """Return the first non-empty body after any heading, or a placeholder.

    Unlike ``_extract_summary``, this never raises for missing content: it is
    the last-resort path and must always produce a string.

    Args:
        markdown: Raw markdown content of a cheatsheet.

    Returns:
        The first non-empty body returned by ``_extract_body_after_heading``
        for a heading matched by ``_ANY_HEADING_RE``, or
        ``"No summary found."`` when no heading yields one.
    """
    for match in _ANY_HEADING_RE.finditer(markdown):
        body = _extract_body_after_heading(markdown, match)
        if body:
            return body

    return "No summary found."


def extract_cheatsheet_record(
Expand All @@ -71,12 +87,24 @@ def extract_cheatsheet_record(
) -> CheatsheetRecord:
"""Extract a structured CheatsheetRecord from markdown content."""

title_match = _TITLE_RE.search(markdown)
title = title_match.group("title").strip()
fallback_used = "false"

try:
title = _extract_title(markdown)
except ValueError as e:
logging.warning(str(e))
title = _fallback_title()
fallback_used = "true"

# Headings can be empty.
headings = [m.group("heading").strip() for m in _HEADING_RE.finditer(markdown)]

summary = _extract_summary(markdown)
try:
summary = _extract_summary(markdown)
except ValueError as e:
logging.warning(str(e))
summary = _fallback_summary(markdown)
fallback_used = "true"

source_id = _derive_source_id(source_path)
hyperlink = _derive_hyperlink(source_path)
Expand All @@ -91,6 +119,6 @@ def extract_cheatsheet_record(
category_hints=[],
metadata={
"parser_version": PARSER_VERSION,
"fallback_used": FALLBACK_USED,
"fallback_used": fallback_used,
},
)
Loading