Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,19 @@ We recognize the following properties:

We use different supervised classifiers, header analysis, regular expressions, the GitHub/GitLab API and language-specific metadata parsers (e.g., for package files) to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, with the confidence and technique used on each step. For more information, check the [output format description](https://somef.readthedocs.io/en/latest/output/)

### Confidence values in header analysis
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine, but can you also add it in the documentation of the output, when we talk about the confidence value? Right before https://somef.readthedocs.io/en/latest/output/#result


When extracting metadata using header analysis, SOMEF assigns a confidence value based on the length
of the header. Shorter headers are more likely to be a good fit for a category, while longer headers
may contain additional context that makes the classification less reliable:

| Header length | Confidence |
|---------------|------------|
| 1–3 words | 1.0 |
| 4–6 words | 0.8 |
| 7–10 words | 0.5 |
| 11+ words | 0.1 |

## Documentation

See full documentation at [https://somef.readthedocs.io/en/latest/](https://somef.readthedocs.io/en/latest/)
Expand Down
21 changes: 20 additions & 1 deletion src/somef/header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,20 @@ def is_false_positive_header(text: str, category: str) -> bool:

text_lower = text.lower()

if '?' in text or '!' in text:
return True

# false positives for bibliographic citations
if category == constants.CAT_CITATION:
for pattern in constants.NEGATIVE_PATTERNS_CITATION_HEADERS:
if pattern in text_lower:
return True

if category in constants.MAX_HEADER_WORDS:
num_words = len(text.split())
if num_words > constants.MAX_HEADER_WORDS[category]:
return True

return False


Expand Down Expand Up @@ -460,10 +469,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
if row[constants.PROP_PARENT_HEADER]:
result[constants.PROP_PARENT_HEADER] = row[constants.PROP_PARENT_HEADER]

confidence = calculate_header_confidence(row[constants.PROP_ORIGINAL_HEADER])
repository_metadata.add_result(
row['Group'],
result,
1,
confidence,
constants.TECHNIQUE_HEADER_ANALYSIS,
source,
)
Expand Down Expand Up @@ -567,3 +577,12 @@ def build_wordnet_groups() -> Dict[str, List]:
]

return g


def calculate_header_confidence(header: str, *, thresholds=None, default_confidence: float = 0.1) -> float:
    """Return the header-analysis confidence for a header, based on its length.

    The header is split on whitespace and the number of resulting words is
    compared against a table of ``(max_words, confidence)`` pairs. The
    confidence of the first pair whose ``max_words`` is greater than or equal
    to the word count is returned.

    Args:
        header: Text of the header being classified.
        thresholds: Optional iterable of ``(max_words, confidence)`` pairs,
            checked in order. When omitted, ``constants.HEADER_CONFIDENCE_THRESHOLDS``
            is used, which keeps the behaviour of existing callers unchanged.
        default_confidence: Value returned when the header has more words than
            every ``max_words`` in the table. Defaults to 0.1.

    Returns:
        The confidence associated with the header length.

    Note:
        A header without any word (empty or whitespace only) counts as zero
        words and therefore matches the first pair of the table.
    """
    if thresholds is None:
        # Resolved at call time so that the module constant is always the one in use.
        thresholds = constants.HEADER_CONFIDENCE_THRESHOLDS

    num_words = len(header.split())
    for max_words, confidence in thresholds:
        if num_words <= max_words:
            return confidence
    # Longer than every configured threshold.
    return default_confidence
9 changes: 6 additions & 3 deletions src/somef/test/test_codemeta_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,12 +531,15 @@ def test_issue_417(self):
json_content = json.loads(data)
issue_tracker = json_content["issueTracker"] # JSON is in Codemeta format

#len(json_content["citation"])
#codemeta category citation is now referencePublication

# buildInstructions was previously generated from the header "Browser issues (Why can't I see
# the generated documentation / visualization?)" which was incorrectly classified as documentation
# due to the word "documentation" in the header. This was a false positive fixed in issue #529
# (long headers with punctuation are now discarded), so buildInstructions is no longer expected here.
# len(json_content["buildInstructions"]) > 0 and
assert issue_tracker == 'https://github.com/dgarijo/Widoco/issues' and len(json_content["referencePublication"]) > 0 and \
len(json_content["name"]) > 0 and len(json_content["identifier"]) > 0 and \
len(json_content["description"]) > 0 and len(json_content["readme"]) > 0 and \
len(json_content["buildInstructions"]) > 0 and \
len(json_content["softwareRequirements"]) > 0 and len(json_content["programmingLanguage"]) > 0 and \
len(json_content["keywords"]) > 0 and len(json_content["logo"]) > 0 and \
len(json_content["license"]) > 0 and len(json_content["dateCreated"]) > 0
Expand Down
33 changes: 33 additions & 0 deletions src/somef/test/test_header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,36 @@ def test_extract_headers_with_separators(self):
assert 'Installation' in headers
assert 'Citation' in headers
assert 'Funding' in headers


def test_issue_529(self):
    """
    Regression test for issue 529: long headers or headers with punctuation must not
    be misclassified. The Widoco header 'Browser issues (Why can't I see...)' must not
    show up among the documentation results.
    """
    readme_path = test_data_path + "widoco_readme.md"
    with open(readme_path, "r") as readme:
        readme_text = readme.read()
        json_test, _ = extract_categories(readme_text, Result())

    # Nothing to verify when no documentation entry was extracted at all.
    if constants.CAT_DOCUMENTATION not in json_test.results:
        return

    documentation_headers = []
    for entry in json_test.results[constants.CAT_DOCUMENTATION]:
        documentation_headers.append(
            entry[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
        )
    assert all("Browser issues" not in header for header in documentation_headers)


def test_issue_138(self):
    """
    Regression test for issue 138: every result obtained through header analysis
    from a long header (4+ words) must carry a confidence below 1.
    """
    readme_path = test_data_path + "widoco_readme.md"
    with open(readme_path, "r") as readme:
        readme_text = readme.read()
        json_test, _ = extract_categories(readme_text, Result())

    for category, entries in json_test.results.items():
        # The provenance entry is not a list of extraction results.
        if category == constants.PROP_PROVENANCE:
            continue
        for entry in entries:
            if entry[constants.PROP_TECHNIQUE] != constants.TECHNIQUE_HEADER_ANALYSIS:
                continue
            header = entry[constants.PROP_RESULT].get(constants.PROP_ORIGINAL_HEADER, "")
            if not header:
                continue
            if len(header.split()) <= 3:
                # Short headers are allowed to keep the maximum confidence.
                continue
            print(f"Header: '{header}' | words: {len(header.split())} | confidence: {entry[constants.PROP_CONFIDENCE]}")
            assert entry[constants.PROP_CONFIDENCE] < 1.0
18 changes: 18 additions & 0 deletions src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,3 +541,21 @@ class RepositoryType(Enum):

DEPENDENCY_TYPE_RUNTIME = "runtime"
DEPENDENCY_TYPE_DEVELOPMENT = "development"

# Maximum number of words a header may contain to be accepted for a given category
# by the header analysis technique. header_analysis.is_false_positive_header() flags
# headers of these categories that exceed the limit as false positives; that check is
# skipped for categories that are not listed here.
# TODO: decide whether a single limit for all categories would be enough, or whether
# the limit should stay category-specific.
MAX_HEADER_WORDS = {
    CAT_DOCUMENTATION: 5,
    CAT_REQUIREMENTS: 3,
    CAT_CITATION: 5,
}

# Confidence assigned by the header analysis technique depending on the header length.
# Each entry is (max_words, confidence): header_analysis.calculate_header_confidence()
# returns the confidence of the first entry whose max_words is greater than or equal to
# the number of words in the header, and falls back to 0.1 when no entry matches.
# TODO: the confidence values are tentative and should be reviewed.
HEADER_CONFIDENCE_THRESHOLDS = [
    (3, 1.0),  # up to 3 words -> confidence 1.0
    (6, 0.8),  # 4-6 words -> confidence 0.8
    (10, 0.5),  # 7-10 words -> confidence 0.5
    (11, 0.1),  # 11 words -> confidence 0.1 (longer headers get the 0.1 fallback)
]
Loading