KnowledgeCaptureAndDiscovery · juanjemdIos · May 12, 2026 · May 12, 2026 · May 13, 2026 · dgarijo
diff --git a/README.md b/README.md
@@ -95,6 +95,19 @@ We recognize the following properties:
 
 We use different supervised classifiers, header analysis, regular expressions, the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field) and language specific metadata parsers (e.g., for package files). Each extraction records its provenance, with the confidence and technique used on each step. For more information check the [output format description](https://somef.readthedocs.io/en/latest/output/)
 
+### Confidence values in header analysis
+
+When extracting metadata using header analysis, SOMEF assigns a confidence value based on the number
+of words in the header. Shorter headers are more likely to be a good fit for a category, while longer
+headers may contain additional context that makes the classification less reliable:
+
+| Header length | Confidence |
+|---------------|------------|
+| 1–3 words     | 1.0        |
+| 4–6 words     | 0.8        |
+| 7–10 words    | 0.5        |
+| 11+ words     | 0.1        |
+
 ## Documentation
 
 See full documentation at [https://somef.readthedocs.io/en/latest/](https://somef.readthedocs.io/en/latest/)

diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
@@ -327,11 +327,20 @@ def is_false_positive_header(text: str, category: str) -> bool:
 
     text_lower = text.lower()
 
+    if '?' in text or '!' in text:
+        return True
+
     # false positives for bibliographic citations
     if category == constants.CAT_CITATION:
         for pattern in constants.NEGATIVE_PATTERNS_CITATION_HEADERS:
             if pattern in text_lower:
                 return True
+
+    if category in constants.MAX_HEADER_WORDS:
+        num_words = len(text.split())
+        if num_words > constants.MAX_HEADER_WORDS[category]:
+            return True
+
     return False
 
 
@@ -460,10 +469,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
             if row[constants.PROP_PARENT_HEADER]:
                 result[constants.PROP_PARENT_HEADER] = row[constants.PROP_PARENT_HEADER]
 
+            confidence = calculate_header_confidence(row[constants.PROP_ORIGINAL_HEADER])
             repository_metadata.add_result(
                 row['Group'],
                 result,
-                1,
+                confidence,
                 constants.TECHNIQUE_HEADER_ANALYSIS,
                 source,
             )
@@ -567,3 +577,12 @@ def build_wordnet_groups() -> Dict[str, List]:
     ]
 
     return g
+
+
+def calculate_header_confidence(header: str) -> float:
+    """Returns a confidence value based on the number of words in the header."""
+    num_words = len(header.split())
+    for max_words, confidence in constants.HEADER_CONFIDENCE_THRESHOLDS:
+        if num_words <= max_words:
+            return confidence
+    return 0.1
diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py
@@ -531,12 +531,15 @@ def test_issue_417(self):
         json_content = json.loads(data)
         issue_tracker = json_content["issueTracker"]  # JSON is in Codemeta format
 
-        #len(json_content["citation"]) 
-        #codemeta category citation is now referencePublication
+
+        # buildInstructions was previously generated from the header "Browser issues (Why can't I see
+        # the generated documentation / visualization?)" which was incorrectly classified as documentation
+        # due to the word "documentation" in the header. This was a false positive fixed in issue #529
+        # (headers containing '?' or '!', or exceeding the per-category word limit, are now discarded), so buildInstructions is no longer expected here.
+        # len(json_content["buildInstructions"]) > 0 and 
         assert issue_tracker == 'https://github.com/dgarijo/Widoco/issues' and len(json_content["referencePublication"]) > 0 and \
             len(json_content["name"]) > 0 and len(json_content["identifier"]) > 0 and \
             len(json_content["description"]) > 0 and len(json_content["readme"]) > 0 and \
-            len(json_content["buildInstructions"]) > 0 and \
             len(json_content["softwareRequirements"]) > 0 and len(json_content["programmingLanguage"]) > 0 and \
             len(json_content["keywords"]) > 0 and len(json_content["logo"]) > 0 and \
             len(json_content["license"]) > 0 and len(json_content["dateCreated"]) > 0

diff --git a/src/somef/test/test_header_analysis.py b/src/somef/test/test_header_analysis.py
@@ -149,3 +149,36 @@ def test_extract_headers_with_separators(self):
             assert 'Installation' in headers
             assert 'Citation' in headers
             assert 'Funding' in headers
+
+
+    def test_issue_529(self):
+        """
+        Test that ensures long headers or headers with punctuation are not incorrectly
+        classified. 'Browser issues (Why can't I see...)' should not appear in documentation.
+        """
+        with open(test_data_path + "widoco_readme.md", "r") as data_file:
+            file_text = data_file.read()
+            json_test, results = extract_categories(file_text, Result())
+            if constants.CAT_DOCUMENTATION in json_test.results:
+                headers = [e[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
+                        for e in json_test.results[constants.CAT_DOCUMENTATION]]
+                assert not any("Browser issues" in h for h in headers)
+
+
+    def test_issue_138(self):
+        """
+        Test that ensures header analysis returns a confidence lower than 1
+        for long headers (4+ words).
+        """
+        with open(test_data_path + "widoco_readme.md", "r") as data_file:
+            file_text = data_file.read()
+            json_test, results = extract_categories(file_text, Result())
+            for category, entries in json_test.results.items():
+                if category == constants.PROP_PROVENANCE:
+                    continue
+                for entry in entries:
+                    if entry[constants.PROP_TECHNIQUE] == constants.TECHNIQUE_HEADER_ANALYSIS:
+                        header = entry[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
+                        if header and len(header.split()) > 3:
+                            print(f"Header: '{header}' | words: {len(header.split())} | confidence: {entry[constants.PROP_CONFIDENCE]}")
+                            assert entry[constants.PROP_CONFIDENCE] < 1.0
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
@@ -541,3 +541,21 @@ class RepositoryType(Enum):
 
 DEPENDENCY_TYPE_RUNTIME = "runtime"
 DEPENDENCY_TYPE_DEVELOPMENT = "development"
+
+# Maximum number of words a header may have, per category, in the header analysis technique.
+# Headers with more words than the limit for their category are discarded as false positives.
+# TODO: decide whether a single limit for all categories is enough or each category needs its own.
+MAX_HEADER_WORDS = {
+    CAT_DOCUMENTATION: 5,
+    CAT_REQUIREMENTS: 3,
+    CAT_CITATION: 5,
+}
+
+# Confidence thresholds for header analysis, as (max_words, confidence) pairs checked in ascending order.
+# NOTE: these confidence values are provisional and may need tuning.
+HEADER_CONFIDENCE_THRESHOLDS = [
+    (3, 1.0),   # 1-3 words -> confidence 1.0
+    (6, 0.8),   # 4-6 words -> confidence 0.8
+    (10, 0.5),  # 7-10 words -> confidence 0.5
+    (11, 0.1),  # 11+ words -> confidence 0.1
+]