From 01571cf110a76b37d44c3e78054bfdd3efefb313 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 12 May 2026 15:45:48 +0200 Subject: [PATCH 1/3] solve problem with long headers and improve header analysis confidence. Fixes #529, Fixes #138 --- src/somef/header_analysis.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py index a7489a9c..5f6ae582 100644 --- a/src/somef/header_analysis.py +++ b/src/somef/header_analysis.py @@ -327,11 +327,20 @@ def is_false_positive_header(text: str, category: str) -> bool: text_lower = text.lower() + if '?' in text or '!' in text: + return True + # false positives for bibliographic citations if category == constants.CAT_CITATION: for pattern in constants.NEGATIVE_PATTERNS_CITATION_HEADERS: if pattern in text_lower: return True + + if category in constants.MAX_HEADER_WORDS: + num_words = len(text.split()) + if num_words > constants.MAX_HEADER_WORDS[category]: + return True + return False @@ -460,10 +469,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res if row[constants.PROP_PARENT_HEADER]: result[constants.PROP_PARENT_HEADER] = row[constants.PROP_PARENT_HEADER] + confidence = calculate_header_confidence(row[constants.PROP_ORIGINAL_HEADER]) repository_metadata.add_result( row['Group'], result, - 1, + confidence, constants.TECHNIQUE_HEADER_ANALYSIS, source, ) @@ -567,3 +577,12 @@ def build_wordnet_groups() -> Dict[str, List]: ] return g + + +def calculate_header_confidence(header: str) -> float: + """Returns a confidence value based on the header length.""" + num_words = len(header.split()) + for max_words, confidence in constants.HEADER_CONFIDENCE_THRESHOLDS: + if num_words <= max_words: + return confidence + return 0.1 From 93dbde0fad58149551db2504faa0d4d66cf9caca Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Tue, 12 May 2026 15:46:18 +0200 Subject: [PATCH 2/3] forgot files --- 
src/somef/test/test_codemeta_export.py | 9 ++++--- src/somef/test/test_header_analysis.py | 33 ++++++++++++++++++++++++++ src/somef/utils/constants.py | 18 ++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py index dd557adf..ca1392e9 100644 --- a/src/somef/test/test_codemeta_export.py +++ b/src/somef/test/test_codemeta_export.py @@ -531,12 +531,15 @@ def test_issue_417(self): json_content = json.loads(data) issue_tracker = json_content["issueTracker"] # JSON is in Codemeta format - #len(json_content["citation"]) - #codemeta category citation is now referencePublication + + # buildInstructions was previously generated from the header "Browser issues (Why can't I see + # the generated documentation / visualization?)" which was incorrectly classified as documentation + # due to the word "documentation" in the header. This was a false positive fixed in issue #529 + # (long headers with punctuation are now discarded), so buildInstructions is no longer expected here. 
+ # len(json_content["buildInstructions"]) > 0 and assert issue_tracker == 'https://github.com/dgarijo/Widoco/issues' and len(json_content["referencePublication"]) > 0 and \ len(json_content["name"]) > 0 and len(json_content["identifier"]) > 0 and \ len(json_content["description"]) > 0 and len(json_content["readme"]) > 0 and \ - len(json_content["buildInstructions"]) > 0 and \ len(json_content["softwareRequirements"]) > 0 and len(json_content["programmingLanguage"]) > 0 and \ len(json_content["keywords"]) > 0 and len(json_content["logo"]) > 0 and \ len(json_content["license"]) > 0 and len(json_content["dateCreated"]) > 0 diff --git a/src/somef/test/test_header_analysis.py b/src/somef/test/test_header_analysis.py index b5259205..dab3e296 100644 --- a/src/somef/test/test_header_analysis.py +++ b/src/somef/test/test_header_analysis.py @@ -149,3 +149,36 @@ def test_extract_headers_with_separators(self): assert 'Installation' in headers assert 'Citation' in headers assert 'Funding' in headers + + + def test_issue_529(self): + """ + Test that ensures long headers or headers with punctuation are not incorrectly + classified. 'Browser issues (Why can't I see...)' should not appear in documentation. + """ + with open(test_data_path + "widoco_readme.md", "r") as data_file: + file_text = data_file.read() + json_test, results = extract_categories(file_text, Result()) + if constants.CAT_DOCUMENTATION in json_test.results: + headers = [e[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "") + for e in json_test.results[constants.CAT_DOCUMENTATION]] + assert not any("Browser issues" in h for h in headers) + + + def test_issue_138(self): + """ + Test that ensures header analysis returns a confidence lower than 1 + for long headers (4+ words). 
+ """ + with open(test_data_path + "widoco_readme.md", "r") as data_file: + file_text = data_file.read() + json_test, results = extract_categories(file_text, Result()) + for category, entries in json_test.results.items(): + if category == constants.PROP_PROVENANCE: + continue + for entry in entries: + if entry[constants.PROP_TECHNIQUE] == constants.TECHNIQUE_HEADER_ANALYSIS: + header = entry[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "") + if header and len(header.split()) > 3: + print(f"Header: '{header}' | words: {len(header.split())} | confidence: {entry[constants.PROP_CONFIDENCE]}") + assert entry[constants.PROP_CONFIDENCE] < 1.0 \ No newline at end of file diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 94d3ce6e..0310a2b9 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -541,3 +541,21 @@ class RepositoryType(Enum): DEPENDENCY_TYPE_RUNTIME = "runtime" DEPENDENCY_TYPE_DEVELOPMENT = "development" + +# Open question: should the word limit be the same for all categories, or differ per category? +# This is used in the header analysis technique: a header with more words than the limit for its category +# is treated as a false positive. 
+MAX_HEADER_WORDS = { + CAT_DOCUMENTATION: 5, + CAT_REQUIREMENTS: 3, + CAT_CITATION: 5, +} + +# Confidence thresholds for header analysis based on header length +# NOTE: these confidence values are provisional and may need tuning +HEADER_CONFIDENCE_THRESHOLDS = [ + (3, 1.0), # 1-3 words -> confidence 1.0 + (6, 0.8), # 4-6 words -> confidence 0.8 + (10, 0.5), # 7-10 words -> confidence 0.5 + (11, 0.1), # 11+ words -> confidence 0.1 +] \ No newline at end of file From accfa0bdd7031ea264a6c7a68bf0eeef57bc9fbd Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Wed, 13 May 2026 08:19:13 +0200 Subject: [PATCH 3/3] confidence values in readme --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 68cad3d1..5383bb09 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,19 @@ We recognize the following properties: We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/) +### Confidence values in header analysis + +When extracting metadata using header analysis, SOMEF assigns a confidence value based on the length +of the header. Shorter headers are more likely to be a good fit for a category, while longer headers +may contain additional context that makes the classification less reliable: + +| Header length | Confidence | +|---------------|------------| +| 1–3 words | 1.0 | +| 4–6 words | 0.8 | +| 7–10 words | 0.5 | +| 11+ words | 0.1 | + ## Documentation See full documentation at [https://somef.readthedocs.io/en/latest/](https://somef.readthedocs.io/en/latest/)