-
Notifications
You must be signed in to change notification settings - Fork 59
Implement automatic Goose value recognition #115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,289 @@ | ||
| """Automatic recognition for Goose value and Goose-like approximations.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass | ||
| from difflib import SequenceMatcher | ||
| import hashlib | ||
| import math | ||
| import re | ||
| from typing import Iterable, Mapping | ||
|
|
||
|
|
||
# Normalized value attached to every recognized candidate.
GOOSE_VALUE = "true-goose-value"
# Length of every signal vector; also the default `representation_dimension`
# reported on each recognition result.
GOOSE_MATRIX_DIMENSIONS = 71
# Tokens that count as an exact Goose match. They are also the comparison
# targets for fuzzy matching and the inputs of the reference matrix.
GOOSE_SIGNALS = frozenset(
    {
        "goose",
        "geese",
        "goos",
        "gooseholder",
        "gooseholders",
        "goose-stakeholder",
        "goose-stakeholders",
        "goosefist",
        "goose-fist",
    }
)
# Non-goose bird tokens that are still accepted as an approximate Goose value.
GOOSE_APPROXIMATE_VALUE_SIGNALS = frozenset(
    {
        "bird",
        "birds",
        "duck",
        "ducks",
        "fowl",
        "pigeon",
        "pigeons",
        "swan",
        "swans",
        "waterfowl",
    }
)
# Context tokens that raise the confidence of an approximate bird match from
# APPROXIMATE_VALUE_CONFIDENCE to APPROXIMATE_CAPACITY_CONFIDENCE.
GOOSE_VALUE_CAPACITY_SIGNALS = frozenset(
    {
        "capacity",
        "capacities",
        "egg",
        "eggs",
        "factory",
        "golden",
        "value",
    }
)
# Minimum SequenceMatcher ratio for a fuzzy ("approximate-goose-signal") match.
APPROXIMATE_THRESHOLD = 0.78
|
Comment on lines
+13
to
+53
This comment was marked as off-topic.
Sorry, something went wrong. |
||
# Minimum candidate-vs-reference matrix score for a "matrix-goose-signal" match.
MATRIX_THRESHOLD = 0.72
# Confidence reported for an approximate bird match without capacity context.
APPROXIMATE_VALUE_CONFIDENCE = 0.82
# Confidence reported for an approximate bird match with capacity context.
APPROXIMATE_CAPACITY_CONFIDENCE = 0.88
|
|
||
|
|
||
@dataclass(frozen=True)
class GooseValueRecognition:
    """Result for one Goose value candidate.

    Instances are immutable (the dataclass is frozen) and are produced by
    ``GooseValueRecognizer.recognize``.
    """

    # True when any recognition stage accepted the candidate.
    recognized: bool
    # GOOSE_VALUE for accepted candidates, None for rejected ones.
    normalized_value: str | None
    # Score of the stage that decided the outcome (1.0 for an exact match).
    confidence: float
    # Candidate token that triggered (or came closest to) a match, if any.
    matched_signal: str | None
    # Machine-readable label of the deciding stage, e.g. "exact-goose-signal".
    reason: str
    # Length of the vector representation used for matrix scoring.
    representation_dimension: int = GOOSE_MATRIX_DIMENSIONS
    # Similarity between the whole candidate and the reference Goose matrix.
    matrix_score: float = 0.0
|
|
||
|
|
||
def recognize_goose_value(candidate: object) -> GooseValueRecognition:
    """Classify a single candidate with a freshly built recognizer.

    Convenience wrapper: constructs a ``GooseValueRecognizer`` and returns the
    result of its ``recognize`` method for ``candidate``.
    """
    recognizer = GooseValueRecognizer()
    return recognizer.recognize(candidate)
This comment was marked as off-topic.
Sorry, something went wrong. |
||
|
|
||
|
|
||
def recognize_goose_values(candidates: Iterable[object]) -> list[GooseValueRecognition]:
    """Classify every candidate, returning results in input order.

    A single ``GooseValueRecognizer`` is built once and shared by all
    candidates in the batch.
    """
    recognize = GooseValueRecognizer().recognize
    return list(map(recognize, candidates))
|
|
||
|
|
||
class GooseValueRecognizer:
    """Small deterministic recognizer for Goose and Goose-like values.

    The recognizer keeps the repo-friendly token checks, then backs them with a
    fixed-size matrix-style representation (``GOOSE_MATRIX_DIMENSIONS``
    components). Multiple candidate tokens are combined through component-wise
    addition, which gives the pipeline a simple monoid: empty input is the zero
    vector, and adding more Goose signals preserves the same representation
    shape.

    Recognition stages, tried in order (first hit wins):

    1. exact token match against ``GOOSE_SIGNALS`` (confidence 1.0);
    2. approximate bird token from ``GOOSE_APPROXIMATE_VALUE_SIGNALS``;
    3. fuzzy string match with ratio >= ``APPROXIMATE_THRESHOLD``;
    4. matrix similarity of the whole candidate >= ``MATRIX_THRESHOLD``.
    """

    def __init__(self) -> None:
        # Sum the signal vectors in sorted order: frozenset iteration order
        # depends on string hash randomization, and float addition is not
        # associative, so an unsorted sum may differ in its last bits from one
        # interpreter run to the next.
        self._goose_matrix = _normalize_vector(
            _combine_vectors(_signal_vector(signal) for signal in sorted(GOOSE_SIGNALS))
        )

    @staticmethod
    def _accepted(
        signal: str, confidence: float, reason: str, matrix_score: float
    ) -> GooseValueRecognition:
        """Build the result for a candidate accepted by the stage ``reason``."""
        return GooseValueRecognition(
            recognized=True,
            normalized_value=GOOSE_VALUE,
            confidence=confidence,
            matched_signal=signal,
            reason=reason,
            matrix_score=matrix_score,
        )

    def recognize(self, candidate: object) -> GooseValueRecognition:
        """Classify one candidate.

        Args:
            candidate: Any object; it is converted to tokens by
                ``_candidate_tokens``.

        Returns:
            A ``GooseValueRecognition``. ``reason`` is one of
            "no-goose-signal", "exact-goose-signal",
            "approximate-goose-value-signal", "approximate-goose-signal",
            "matrix-goose-signal" or "below-goose-threshold".
        """
        tokens = _candidate_tokens(candidate)
        if not tokens:
            return GooseValueRecognition(
                recognized=False,
                normalized_value=None,
                confidence=0.0,
                matched_signal=None,
                reason="no-goose-signal",
                matrix_score=0.0,
            )

        # Whole-candidate similarity; reported on every non-empty result.
        candidate_matrix = _normalize_vector(
            _combine_vectors(_signal_vector(token) for token in tokens)
        )
        matrix_score = _cosine_similarity(candidate_matrix, self._goose_matrix)

        # Stage 1: the first token that is literally a Goose signal.
        exact_match = next((token for token in tokens if token in GOOSE_SIGNALS), None)
        if exact_match is not None:
            return self._accepted(exact_match, 1.0, "exact-goose-signal", matrix_score)

        # Stage 2: a bird token; capacity context raises the confidence.
        approximate_value_match = _first_approximate_value_signal(tokens)
        if approximate_value_match is not None:
            confidence = (
                APPROXIMATE_CAPACITY_CONFIDENCE
                if _has_value_capacity_context(tokens)
                else APPROXIMATE_VALUE_CONFIDENCE
            )
            return self._accepted(
                approximate_value_match,
                confidence,
                "approximate-goose-value-signal",
                matrix_score,
            )

        # Stage 3: fuzzy string similarity against the Goose signals.
        match, confidence = _best_approximate_signal(tokens)
        if match is not None and confidence >= APPROXIMATE_THRESHOLD:
            return self._accepted(match, confidence, "approximate-goose-signal", matrix_score)

        # Stage 4: gated on the whole-candidate score; the reported token is
        # the single token closest to the reference matrix.
        matrix_match, matrix_token_score = _best_matrix_signal(tokens)
        if matrix_match is not None and matrix_score >= MATRIX_THRESHOLD:
            return self._accepted(
                matrix_match,
                round(max(matrix_score, matrix_token_score), 3),
                "matrix-goose-signal",
                matrix_score,
            )

        # Rejected: still report the closest fuzzy token and the best score.
        return GooseValueRecognition(
            recognized=False,
            normalized_value=None,
            confidence=round(max(confidence, matrix_score), 3),
            matched_signal=match,
            reason="below-goose-threshold",
            matrix_score=matrix_score,
        )
|
|
||
|
|
||
def _candidate_tokens(candidate: object) -> list[str]:
    """Turn a candidate into the token list used by every recognition stage.

    The candidate's text parts are joined, lower-cased, and split on any run of
    characters outside ``a-z0-9``. The result is the plain words, followed by
    each adjacent word pair joined with a hyphen, followed by a second copy of
    every word containing "goos". Returns an empty list when no word survives.
    """
    raw_text = " ".join(_candidate_text_parts(candidate))
    words = re.sub(r"[^a-z0-9]+", " ", raw_text.lower()).split()
    if not words:
        return []

    # Re-create hyphenated compounds such as "goose-fist" that the
    # normalization above split apart.
    hyphenated = [f"{first}-{second}" for first, second in zip(words, words[1:])]
    # Any word containing "goose" also contains "goos", so one test suffices.
    goose_like = [word for word in words if "goos" in word]
    return words + hyphenated + goose_like
|
Comment on lines
+182
to
+187
This comment was marked as off-topic.
Sorry, something went wrong. |
||
|
|
||
|
|
||
| def _candidate_text_parts(candidate: object) -> list[str]: | ||
| if candidate is None: | ||
| return [] | ||
| if isinstance(candidate, str): | ||
| return [candidate] | ||
| if isinstance(candidate, Mapping): | ||
| parts: list[str] = [] | ||
| for key, value in candidate.items(): | ||
| if isinstance(value, (str, int, float)): | ||
| parts.extend([str(key), str(value)]) | ||
| elif isinstance(value, Iterable): | ||
| parts.append(str(key)) | ||
| parts.extend(str(item) for item in value) | ||
| return parts | ||
| if isinstance(candidate, Iterable): | ||
| return [str(item) for item in candidate] | ||
| return [str(candidate)] | ||
|
|
||
|
|
||
def _best_approximate_signal(tokens: Iterable[str]) -> tuple[str | None, float]:
    """Find the token that is textually closest to any Goose signal.

    Closeness is ``difflib.SequenceMatcher(...).ratio()`` between a token and a
    signal. Returns ``(token, ratio rounded to 3 places)`` for the earliest
    token reaching the highest ratio, or ``(None, 0.0)`` when no token scores
    above zero.
    """
    top_token: str | None = None
    top_ratio = 0.0
    for token in tokens:
        # Best ratio this token achieves against any signal.
        token_ratio = max(
            (SequenceMatcher(None, token, signal).ratio() for signal in GOOSE_SIGNALS),
            default=0.0,
        )
        # Strict comparison keeps the earliest token on ties.
        if token_ratio > top_ratio:
            top_token, top_ratio = token, token_ratio
    return top_token, round(top_ratio, 3)
|
|
||
|
|
||
def _first_approximate_value_signal(tokens: Iterable[str]) -> str | None:
    """Return the first token found in GOOSE_APPROXIMATE_VALUE_SIGNALS, else None."""
    return next(
        (token for token in tokens if token in GOOSE_APPROXIMATE_VALUE_SIGNALS),
        None,
    )
|
|
||
|
|
||
def _has_value_capacity_context(tokens: Iterable[str]) -> bool:
    """Report whether at least one token is in GOOSE_VALUE_CAPACITY_SIGNALS."""
    for token in tokens:
        if token in GOOSE_VALUE_CAPACITY_SIGNALS:
            return True
    return False
|
|
||
|
|
||
def _signal_vector(signal: str) -> tuple[float, ...]:
    """Map a token to a fixed-length vector of GOOSE_MATRIX_DIMENSIONS floats.

    Each character adds a signed weight to one component. The component and
    the sign come from a SHA-256 digest of the character's position, the
    character, and the whole lower-cased signal, so the mapping does not
    change between runs. Fixed bonuses are then added to components 0-3 for
    goose-related substrings and for approximate bird tokens. An empty signal
    yields the zero vector.
    """
    text = signal.lower()
    components = [0.0] * GOOSE_MATRIX_DIMENSIONS
    if not text:
        return tuple(components)

    for position, char in enumerate(text):
        digest = hashlib.sha256(f"{position}:{char}:{text}".encode()).digest()
        slot = int.from_bytes(digest[:2], "big") % GOOSE_MATRIX_DIMENSIONS
        sign = -1.0 if digest[2] % 2 else 1.0
        weight = 1.0 + (ord(char) % 7) / 10.0
        components[slot] += sign * weight

    # (applies, component, bonus), added in this order after the hashed weights.
    bonuses = (
        ("goose" in text or "geese" in text, 0, 4.0),
        ("goos" in text, 1, 2.0),
        (text in GOOSE_APPROXIMATE_VALUE_SIGNALS, 1, 1.75),
        # "stakeholder" contains "holder", so one substring test covers both.
        ("holder" in text, 2, 1.5),
        ("fist" in text, 3, 1.25),
    )
    for applies, slot, bonus in bonuses:
        if applies:
            components[slot] += bonus

    return tuple(components)
|
|
||
|
|
||
def _combine_vectors(vectors: Iterable[tuple[float, ...]]) -> tuple[float, ...]:
    """Add vectors component-wise into one GOOSE_MATRIX_DIMENSIONS-long tuple.

    An empty ``vectors`` iterable produces the zero vector.
    """
    totals = [0.0] * GOOSE_MATRIX_DIMENSIONS
    for vector in vectors:
        for position, component in enumerate(vector):
            totals[position] = totals[position] + component
    return tuple(totals)
|
|
||
|
|
||
| def _normalize_vector(vector: tuple[float, ...]) -> tuple[float, ...]: | ||
| magnitude = math.sqrt(sum(value * value for value in vector)) | ||
| if magnitude == 0: | ||
| return vector | ||
| return tuple(value / magnitude for value in vector) | ||
|
|
||
|
|
||
| def _cosine_similarity(left: tuple[float, ...], right: tuple[float, ...]) -> float: | ||
| score = sum(left_value * right_value for left_value, right_value in zip(left, right)) | ||
| return round(max(0.0, min(1.0, score)), 3) | ||
|
|
||
|
|
||
def _best_matrix_signal(
    tokens: Iterable[str],
    goose_matrix: tuple[float, ...] | None = None,
) -> tuple[str | None, float]:
    """Find the single token whose vector is closest to the Goose matrix.

    Args:
        tokens: Candidate tokens to score individually.
        goose_matrix: Optional precomputed, normalized reference matrix. When
            omitted it is built from ``GOOSE_SIGNALS``; callers that already
            hold the matrix can pass it to skip that work.

    Returns:
        ``(token, score)`` for the earliest token with the highest similarity,
        or ``(None, 0.0)`` when no token scores above zero.
    """
    if goose_matrix is None:
        # Sorted order keeps the float sum independent of string hash
        # randomization, which affects frozenset iteration order.
        goose_matrix = _normalize_vector(
            _combine_vectors(_signal_vector(signal) for signal in sorted(GOOSE_SIGNALS))
        )

    best_match: str | None = None
    best_confidence = 0.0
    for token in tokens:
        confidence = _cosine_similarity(_normalize_vector(_signal_vector(token)), goose_matrix)
        # Strict comparison keeps the earliest token on ties.
        if confidence > best_confidence:
            best_match = token
            best_confidence = confidence
    return best_match, best_confidence
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| from goose_value_recognition import ( | ||
| GOOSE_MATRIX_DIMENSIONS, | ||
| GOOSE_VALUE, | ||
| GooseValueRecognizer, | ||
| recognize_goose_value, | ||
| recognize_goose_values, | ||
| ) | ||
|
|
||
|
|
||
def test_exact_goose_value_is_recognized():
    """Text containing the token "goose" is accepted by the exact-match stage."""
    outcome = recognize_goose_value("true Goose value")

    assert outcome.recognized is True
    assert outcome.reason == "exact-goose-signal"
    assert outcome.matched_signal == "goose"
    assert outcome.normalized_value == GOOSE_VALUE
    assert outcome.confidence == 1.0
    assert outcome.representation_dimension == GOOSE_MATRIX_DIMENSIONS
    assert outcome.matrix_score > 0.0
|
|
||
|
|
||
def test_approximate_goose_value_is_recognized():
    """The misspelling "gooze" is accepted by the fuzzy-match stage."""
    outcome = recognize_goose_value("automatic gooze value recognision")

    assert outcome.recognized is True
    assert outcome.reason == "approximate-goose-signal"
    assert outcome.matched_signal == "gooze"
    assert outcome.normalized_value == GOOSE_VALUE
    assert outcome.confidence >= 0.78
|
Comment on lines
+25
to
+29
This comment was marked as off-topic.
Sorry, something went wrong. |
||
|
|
||
|
|
||
def test_nearby_bird_value_candidates_are_recognized():
    """Duck, pigeon and bird candidates are accepted as approximate Goose value."""
    candidates = [
        "ducks with golden egg factory capacities",
        "pigeons sold as value-bearing egg generators",
        {"description": "other birds might implement egg factory capacity"},
    ]

    outcomes = recognize_goose_values(candidates)

    assert [outcome.recognized for outcome in outcomes] == [True] * 3
    assert [outcome.reason for outcome in outcomes] == ["approximate-goose-value-signal"] * 3
    assert [outcome.matched_signal for outcome in outcomes] == ["ducks", "pigeons", "birds"]
    # The first candidate carries capacity context, so it gets the higher score.
    assert outcomes[0].confidence >= 0.88
|
|
||
|
|
||
def test_structured_candidate_fields_are_scanned():
    """String and list values inside a mapping candidate are both tokenized."""
    packet = dict(
        name="Stakeholder packet",
        description="Preserve value for short Gooseholders",
        tags=["pipeline", "value"],
    )

    outcome = GooseValueRecognizer().recognize(packet)

    assert outcome.recognized is True
    assert outcome.matched_signal == "gooseholders"
|
|
||
|
|
||
def test_waterfowl_matrix_representation_is_deterministic():
    """Recognizing equal candidates twice yields the same matrix score."""
    payload = {
        "paper": "Compact Geese Representation",
        "claim": "short Gooseholders preserve matrix value",
    }
    recognizer = GooseValueRecognizer()

    # Each call receives its own copy, so the two runs share no input object.
    first = recognizer.recognize(dict(payload))
    second = recognizer.recognize(dict(payload))

    assert first.recognized is True
    assert first.representation_dimension == 71
    assert first.matrix_score > 0.0
    assert first.matrix_score == second.matrix_score
|
|
||
|
|
||
def test_batch_pipeline_preserves_candidate_order():
    """Batch results line up one-to-one with the input candidates."""
    candidates = [
        "ordinary value",
        {"label": "goose-fist"},
        ["goos", "approximation"],
    ]

    outcomes = recognize_goose_values(candidates)

    assert [outcome.recognized for outcome in outcomes] == [False, True, True]
    assert outcomes[1].reason == "exact-goose-signal"
    assert outcomes[2].matched_signal == "goos"
|
|
||
|
|
||
def test_non_goose_candidate_is_rejected():
    """Text with no goose-like token is rejected with a sub-threshold score."""
    outcome = recognize_goose_value("banana pudding futures")

    assert outcome.recognized is False
    assert outcome.reason == "below-goose-threshold"
    assert outcome.normalized_value is None
    assert outcome.confidence < 0.78
This comment was marked as off-topic.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.