diff --git a/application/tests/harvester_test/__init__.py b/application/tests/harvester_test/__init__.py new file mode 100644 index 000000000..06b0f7440 --- /dev/null +++ b/application/tests/harvester_test/__init__.py @@ -0,0 +1,3 @@ +""" +tests for Module A configuration layer (empty for now) +""" diff --git a/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml b/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml new file mode 100644 index 000000000..7b8cfef04 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml @@ -0,0 +1,22 @@ +repositories: + - id: duplicate-includes + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml b/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml new file mode 100644 index 000000000..b9f76c2e5 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml @@ -0,0 +1,34 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 + + - id: asvs + type: github + owner: OWASP + repo: CheatSheetSeries + + paths: + include: + - "cheatsheets/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/duplicate_repositories.yaml b/application/tests/harvester_test/fixtures/duplicate_repositories.yaml new file mode 100644 index 000000000..62f019db8 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_repositories.yaml @@ -0,0 +1,34 @@ +repositories: 
+ - id: asvs-1 + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 + + - id: asvs-2 + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/empty_include_paths.yaml b/application/tests/harvester_test/fixtures/empty_include_paths.yaml new file mode 100644 index 000000000..afe23361a --- /dev/null +++ b/application/tests/harvester_test/fixtures/empty_include_paths.yaml @@ -0,0 +1,16 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: [] + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/empty_owner.yaml b/application/tests/harvester_test/fixtures/empty_owner.yaml new file mode 100644 index 000000000..a32fe69da --- /dev/null +++ b/application/tests/harvester_test/fixtures/empty_owner.yaml @@ -0,0 +1,21 @@ +repositories: + - id: empty-owner + type: github + enabled: true + + owner: "" + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml new file mode 100644 index 000000000..b06fdc0ad --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml @@ -0,0 +1,18 @@ +repositories: + - id: owasp-asvs + + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 0 + + polling: + mode: incremental + 
interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml b/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml new file mode 100644 index 000000000..c4cb10905 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml @@ -0,0 +1,21 @@ +repositories: + - id: invalid-strategy + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: invalid_strategy + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_missing_id.yaml b/application/tests/harvester_test/fixtures/invalid_missing_id.yaml new file mode 100644 index 000000000..4c81538f4 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_missing_id.yaml @@ -0,0 +1,17 @@ +repositories: + - type: github + + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml b/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml new file mode 100644 index 000000000..94923046c --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml @@ -0,0 +1,17 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 0 diff --git a/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml b/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml new file mode 100644 index 000000000..d21b643f4 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml @@ -0,0 +1,21 @@ 
+repositories: + - id: invalid-polling + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: realtime + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_yaml.yaml b/application/tests/harvester_test/fixtures/invalid_yaml.yaml new file mode 100644 index 000000000..c60d3e23b --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_yaml.yaml @@ -0,0 +1,4 @@ +repositories: + - id: broken + paths: + include: [unclosed diff --git a/application/tests/harvester_test/fixtures/valid_repos.yaml b/application/tests/harvester_test/fixtures/valid_repos.yaml new file mode 100644 index 000000000..98ddf7775 --- /dev/null +++ b/application/tests/harvester_test/fixtures/valid_repos.yaml @@ -0,0 +1,16 @@ +repositories: + - id: owasp-asvs + type: github + owner: OWASP + repo: ASVS + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/test_config_loader.py b/application/tests/harvester_test/test_config_loader.py new file mode 100644 index 000000000..b6ca0e51a --- /dev/null +++ b/application/tests/harvester_test/test_config_loader.py @@ -0,0 +1,75 @@ +from pathlib import Path +import pytest +from application.utils.harvester.config_loader import ( + ConfigLoaderError, + load_repo_config, +) + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def test_load_valid_config(): + config_path = FIXTURES_DIR / "valid_repos.yaml" + + config = load_repo_config(config_path) + + assert len(config.repositories) == 1 + + repo = config.repositories[0] + + assert repo.id == "owasp-asvs" + assert repo.owner == "OWASP" + assert repo.repo == "ASVS" + + +def test_missing_repository_id(): + config_path = FIXTURES_DIR / "invalid_missing_id.yaml" + with 
def test_empty_owner():
    """An empty ``owner`` string must be rejected by the schema.

    The pattern targets the pydantic error location instead of the bare word
    "owner": the loader embeds the fixture path in its message, and the file
    name ``empty_owner.yaml`` alone would satisfy ``match="owner"`` for any
    schema failure.
    """
    config_path = FIXTURES_DIR / "empty_owner.yaml"

    with pytest.raises(
        ConfigLoaderError,
        match=r"repositories\.0\.owner",
    ):
        load_repo_config(config_path)


def test_invalid_chunking_strategy():
    """A chunking strategy outside the allowed literals must be rejected.

    Matches the error location; the fixture file name already contains
    "strategy", so the bare word would match regardless of the failure cause.
    """
    config_path = FIXTURES_DIR / "invalid_chunking_strategy.yaml"

    with pytest.raises(
        ConfigLoaderError,
        match=r"repositories\.0\.chunking\.strategy",
    ):
        load_repo_config(config_path)


def test_invalid_polling_mode():
    """A polling mode outside the allowed literals must be rejected.

    Matches the error location; the fixture file name already contains
    "mode", so the bare word would match regardless of the failure cause.
    """
    config_path = FIXTURES_DIR / "invalid_polling_mode.yaml"

    with pytest.raises(
        ConfigLoaderError,
        match=r"repositories\.0\.polling\.mode",
    ):
        load_repo_config(config_path)
class ConfigLoaderError(Exception):
    """Raised when a repository configuration file cannot be loaded.

    ``load_repo_config`` raises it for invalid YAML syntax and for content
    that fails schema validation; the underlying exception is chained as
    ``__cause__``.
    """
class RepositoryValidationError(Exception):
    """Raised when repository configuration fails semantic validation."""


def _repeated_values(values: list[str]) -> list[str]:
    """Return the values that occur more than once, in order of first repeat."""
    seen: set[str] = set()
    repeated: list[str] = []
    for value in values:
        if value in seen and value not in repeated:
            repeated.append(value)
        seen.add(value)
    return repeated


def validate_repositories(config: "ReposFile") -> None:
    """Check cross-repository constraints on a parsed configuration.

    For each repository, in file order, the following are checked and the
    first violation is raised:

    * the ``id`` is unique (compared case-insensitively via ``casefold``);
    * the ``owner``/``repo`` pair is unique (also case-insensitive);
    * ``paths.include`` contains no exactly repeated pattern.

    Args:
        config: Parsed configuration. Only ``repositories`` and each entry's
            ``id``, ``owner``, ``repo`` and ``paths.include`` are read.

    Raises:
        RepositoryValidationError: On the first violated constraint. For
            repeated include patterns the message lists the offending
            patterns so they can be located in the file.
    """
    seen_ids: set[str] = set()
    seen_repositories: set[tuple[str, str]] = set()

    for repository in config.repositories:
        repo_id_key = repository.id.casefold()
        if repo_id_key in seen_ids:
            raise RepositoryValidationError(
                f"Duplicate repository id found: {repository.id}"
            )
        seen_ids.add(repo_id_key)

        repository_key = (
            repository.owner.casefold(),
            repository.repo.casefold(),
        )
        if repository_key in seen_repositories:
            raise RepositoryValidationError(
                f"Duplicate repository detected: {repository.owner}/{repository.repo}"
            )
        seen_repositories.add(repository_key)

        repeated_patterns = _repeated_values(repository.paths.include)
        if repeated_patterns:
            # Keep the original message prefix so existing matchers still hold,
            # then name the patterns that are actually repeated.
            listed = ", ".join(repeated_patterns)
            raise RepositoryValidationError(
                f"Repository '{repository.id}' has duplicate include paths: {listed}"
            )
# Polling settings for one repository: the sync mode and the polling interval.
class PollingConfig(BaseModel):
    # extra="forbid": keys not declared on the model are a validation error.
    model_config = ConfigDict(extra="forbid")

    # Required; only the literals "full" and "incremental" are accepted.
    mode: Literal["full", "incremental"] = Field(
        ..., description="repository sync mode"
    )

    # Required; gt=0 rejects zero and negative intervals.
    interval_minutes: int = Field(..., gt=0, description="polling interval in minutes")


# One entry of the ``repositories`` list: where the repository lives, whether it
# is enabled, and its path, chunking and polling settings.
class RepositoryConfig(BaseModel):
    # extra="forbid": keys not declared on the model are a validation error.
    model_config = ConfigDict(extra="forbid")

    # Required, non-empty.
    id: str = Field(
        ...,
        min_length=1,
        description="unique repository identifier.",
    )

    # Required; "github" is the only accepted source type.
    type: Literal["github"] = Field(
        ...,
        description="repository source type.",
    )
    # Optional; defaults to True when omitted.
    enabled: bool = Field(
        default=True,
        description="whether ingestion is enabled for this repository.",
    )
    # Required, non-empty.
    owner: str = Field(
        ...,
        min_length=1,
        description="repository organization.",
    )
    # Required, non-empty.
    repo: str = Field(
        ...,
        min_length=1,
        description="repository name.",
    )
    # Optional; defaults to "main" when omitted, and must be non-empty if given.
    branch: str = Field(
        default="main",
        min_length=1,
        description="Repository branch to ingest.",
    )

    # Nested sections; all three are required because no default is declared.
    paths: PathRules
    chunking: ChunkingConfig
    polling: PollingConfig


# Root configuration object loaded from repos.yaml.
class ReposFile(BaseModel):
    # extra="forbid": keys not declared on the model are a validation error.
    model_config = ConfigDict(extra="forbid")

    # Required; min_length=1 rejects an empty repository list.
    repositories: list[RepositoryConfig] = Field(
        ...,
        min_length=1,
        description="List of repositories configured for ingestion.",
    )