From aed4fe6fe4c8beb4a54b9534fae74db2418c06a8 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Wed, 27 May 2026 20:26:17 +0530 Subject: [PATCH 01/10] feat(harvester): implement repository configuration loader --- application/tests/harvester_test/__init__.py | 3 + .../fixtures/invalid_chunk_size.yaml | 17 +++ .../fixtures/invalid_missing_id.yaml | 17 +++ .../harvester_test/fixtures/invalid_yaml.yaml | 4 + .../harvester_test/fixtures/valid_repos.yaml | 16 +++ .../harvester_test/test_config_loader.py | 45 ++++++++ application/utils/harvester/__init__.py | 22 ++++ application/utils/harvester/config_loader.py | 26 +++++ application/utils/harvester/repos.yaml | 43 ++++++++ application/utils/harvester/schemas.py | 102 ++++++++++++++++++ 10 files changed, 295 insertions(+) create mode 100644 application/tests/harvester_test/__init__.py create mode 100644 application/tests/harvester_test/fixtures/invalid_chunk_size.yaml create mode 100644 application/tests/harvester_test/fixtures/invalid_missing_id.yaml create mode 100644 application/tests/harvester_test/fixtures/invalid_yaml.yaml create mode 100644 application/tests/harvester_test/fixtures/valid_repos.yaml create mode 100644 application/tests/harvester_test/test_config_loader.py create mode 100644 application/utils/harvester/__init__.py create mode 100644 application/utils/harvester/config_loader.py create mode 100644 application/utils/harvester/repos.yaml create mode 100644 application/utils/harvester/schemas.py diff --git a/application/tests/harvester_test/__init__.py b/application/tests/harvester_test/__init__.py new file mode 100644 index 000000000..06b0f7440 --- /dev/null +++ b/application/tests/harvester_test/__init__.py @@ -0,0 +1,3 @@ +""" +tests for Module A configuration layer (empty for now) +""" diff --git a/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml new file mode 100644 index 000000000..4c81538f4 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml @@ -0,0 +1,17 @@ +repositories: + - type: github + + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_missing_id.yaml b/application/tests/harvester_test/fixtures/invalid_missing_id.yaml new file mode 100644 index 000000000..4c81538f4 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_missing_id.yaml @@ -0,0 +1,17 @@ +repositories: + - type: github + + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_yaml.yaml b/application/tests/harvester_test/fixtures/invalid_yaml.yaml new file mode 100644 index 000000000..3f74ab21c --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_yaml.yaml @@ -0,0 +1,4 @@ +repositories: + - repository_id: owasp-top10 + source: + type github diff --git a/application/tests/harvester_test/fixtures/valid_repos.yaml b/application/tests/harvester_test/fixtures/valid_repos.yaml new file mode 100644 index 000000000..98ddf7775 --- /dev/null +++ b/application/tests/harvester_test/fixtures/valid_repos.yaml @@ -0,0 +1,16 @@ +repositories: + - id: owasp-asvs + type: github + owner: OWASP + repo: ASVS + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/test_config_loader.py b/application/tests/harvester_test/test_config_loader.py new file mode 100644 index 000000000..0a000cf0a --- /dev/null +++ b/application/tests/harvester_test/test_config_loader.py @@ -0,0 +1,45 @@ +from pathlib import Path +import pytest +from application.utils.harvester.config_loader import ( + ConfigLoaderError, + load_repo_config, +) + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def test_load_valid_config(): + config_path = FIXTURES_DIR / "valid_repos.yaml" + + config = load_repo_config(config_path) + + assert len(config.repositories) == 1 + + repo = config.repositories[0] + + assert repo.id == "owasp-asvs" + assert repo.owner == "OWASP" + assert repo.repo == "ASVS" + + +def test_missing_repository_id(): + config_path = FIXTURES_DIR / "invalid_missing_id.yaml" + with pytest.raises(ConfigLoaderError): + load_repo_config(config_path) + + +def test_invalid_chunk_size(): + config_path = FIXTURES_DIR / "invalid_chunk_size.yaml" + with pytest.raises(ConfigLoaderError): + load_repo_config(config_path) + + +def test_invalid_yaml_syntax(): + config_path = FIXTURES_DIR / "invalid_yaml.yaml" + with pytest.raises(ConfigLoaderError): + load_repo_config(config_path) + + +def test_missing_config_file(): + with pytest.raises(FileNotFoundError): + load_repo_config("does_not_exist.yaml") diff --git a/application/utils/harvester/__init__.py b/application/utils/harvester/__init__.py new file mode 100644 index 000000000..928af5c96 --- /dev/null +++ b/application/utils/harvester/__init__.py @@ -0,0 +1,22 @@ +from .config_loader import ( + ConfigLoaderError, + load_repo_config, +) + +from .schemas import ( + ChunkingConfig, + PathRules, + PollingConfig, + RepositoryConfig, + ReposFile, +) + +__all__ = [ + "ChunkingConfig", + "ConfigLoaderError", + "PathRules", + "PollingConfig", + "RepositoryConfig", + "ReposFile", + "load_repo_config", +] diff --git a/application/utils/harvester/config_loader.py b/application/utils/harvester/config_loader.py new file mode 100644 index 000000000..bba76026c --- /dev/null +++ b/application/utils/harvester/config_loader.py @@ -0,0 +1,26 @@ +from pathlib import Path +import yaml +from pydantic import ValidationError +from .schemas import ReposFile + + +class ConfigLoaderError(Exception): + # when repo config loading fails + pass + + +def load_repo_config(path: str | Path) -> ReposFile: + config_path = Path(path) + + if not config_path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + try: + with config_path.open("r", encoding="utf-8") as file: + raw_config = yaml.safe_load(file) + except yaml.YAMLError as exc: + raise ConfigLoaderError(f"Invalid YAML syntax in {config_path}") from exc + + try: + return ReposFile.model_validate(raw_config) + except ValidationError as exc: + raise ConfigLoaderError(f"schema validation failed for {config_path}") from exc diff --git a/application/utils/harvester/repos.yaml b/application/utils/harvester/repos.yaml new file mode 100644 index 000000000..f39d07257 --- /dev/null +++ b/application/utils/harvester/repos.yaml @@ -0,0 +1,43 @@ +repositories: + - id: owasp-asvs + type: github + enabled: true + owner: OWASP + repo: ASVS + branch: master + paths: + include: + - "4.0/en/**/*.md" + + exclude: + - "**/archive/**" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 + + - id: owasp-cheatsheets + type: github + enabled: true + + owner: OWASP + repo: CheatSheetSeries + branch: master + + paths: + include: + - "cheatsheets/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1000 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 120 diff --git a/application/utils/harvester/schemas.py b/application/utils/harvester/schemas.py new file mode 100644 index 000000000..d1599e737 --- /dev/null +++ b/application/utils/harvester/schemas.py @@ -0,0 +1,102 @@ +# core routes: +# PathRules +# ChunkingConfig +# ollingConfig +# RepositoryConfig +# ReposFile + +from typing import Literal +from pydantic import BaseModel, Field, ConfigDict + + +# this will control which repo paths are included and excluded during ingestions +class PathRules(BaseModel): + model_config = ConfigDict(extra="forbid") + + include: list[str] = Field( + ..., + min_length=1, + description="Glob patterns to include during ingestions", + ) + exclude: list[str] = Field( + default_factory=list, description="Glob patterns to exclude during ingestions" + ) + + +# this will define how the harvested data should be chunked before downstream +class ChunkingConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + strategy: Literal["markdown", "plaintext"] = Field( + ..., + description="Chunking startergy used for text segmentation", + ) + + max_tokens: int = Field(..., gt=0, description="max toekn size per chunk") + + overlap_tokens: int = Field( # a bit concerned about this + ge=0, # this can also be = 0 i suppose + default=100, + description="token overlap between adjacent chunks", + ) + + +# this one defines repository synchronize behaviour +class PollingConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + mode: Literal["full", "incremental"] = Field( + ..., description="repository sync mode" + ) + + interval_minutes: int = Field(..., gt=0, description="polling interval in minutes") + + +# top level repository ingestion configuration +class RepositoryConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + id: str = Field( + ..., + min_length=1, + description="unique repository identifier.", + ) + + type: Literal["github"] = Field( + ..., + description="repository source type.", + ) + enabled: bool = Field( + default=True, + description="whether ingestion is enabled for this repository.", + ) + owner: str = Field( + ..., + min_length=1, + description="repository organization.", + ) + repo: str = Field( + ..., + min_length=1, + description="repository name.", + ) + branch: str = Field( + default="main", + min_length=1, + description="Repository branch to ingest.", + ) + + paths: PathRules + chunking: ChunkingConfig + polling: PollingConfig + + +# Root configuration object loaded from repos.yaml. +class ReposFile(BaseModel): + model_config = ConfigDict(extra="forbid") + + repositories: list[RepositoryConfig] = Field( + ..., + min_length=1, + description="List of repositories configured for ingestion.", + ) From f370f730c8058deb0fea54f5cad2ad4bae2c0eee Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Fri, 29 May 2026 11:49:22 +0530 Subject: [PATCH 02/10] fix(harvester): address CodeRabbit validation feedback --- .../harvester_test/fixtures/invalid_chunk_size.yaml | 5 +++-- .../tests/harvester_test/test_config_loader.py | 2 +- application/utils/harvester/config_loader.py | 6 ++++-- application/utils/harvester/schemas.py | 11 ++--------- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml index 4c81538f4..b06fdc0ad 100644 --- a/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml +++ b/application/tests/harvester_test/fixtures/invalid_chunk_size.yaml @@ -1,6 +1,7 @@ repositories: - - type: github + - id: owasp-asvs + type: github owner: OWASP repo: ASVS @@ -10,7 +11,7 @@ repositories: chunking: strategy: markdown - max_tokens: 1200 + max_tokens: 0 polling: mode: incremental diff --git a/application/tests/harvester_test/test_config_loader.py b/application/tests/harvester_test/test_config_loader.py index 0a000cf0a..4a02ff2d0 100644 --- a/application/tests/harvester_test/test_config_loader.py +++ b/application/tests/harvester_test/test_config_loader.py @@ -30,7 +30,7 @@ def test_missing_repository_id(): def test_invalid_chunk_size(): config_path = FIXTURES_DIR / "invalid_chunk_size.yaml" - with pytest.raises(ConfigLoaderError): + with pytest.raises(ConfigLoaderError, match="max_tokens"): load_repo_config(config_path) diff --git a/application/utils/harvester/config_loader.py b/application/utils/harvester/config_loader.py index bba76026c..77dba72bb 100644 --- a/application/utils/harvester/config_loader.py +++ b/application/utils/harvester/config_loader.py @@ -12,7 +12,7 @@ class ConfigLoaderError(Exception): def load_repo_config(path: str | Path) -> ReposFile: config_path = Path(path) - if not config_path.exists(): + if not config_path.is_file(): raise FileNotFoundError(f"Configuration file not found: {config_path}") try: with config_path.open("r", encoding="utf-8") as file: @@ -23,4 +23,6 @@ def load_repo_config(path: str | Path) -> ReposFile: try: return ReposFile.model_validate(raw_config) except ValidationError as exc: - raise ConfigLoaderError(f"schema validation failed for {config_path}") from exc + raise ConfigLoaderError( + f"schema validation failed for {config_path} : {exc}" + ) from exc diff --git a/application/utils/harvester/schemas.py b/application/utils/harvester/schemas.py index d1599e737..9a94f3285 100644 --- a/application/utils/harvester/schemas.py +++ b/application/utils/harvester/schemas.py @@ -1,10 +1,3 @@ -# core routes: -# PathRules -# ChunkingConfig -# ollingConfig -# RepositoryConfig -# ReposFile - from typing import Literal from pydantic import BaseModel, Field, ConfigDict @@ -29,10 +22,10 @@ class ChunkingConfig(BaseModel): strategy: Literal["markdown", "plaintext"] = Field( ..., - description="Chunking startergy used for text segmentation", + description="Chunking strategy used for text segmentation", ) - max_tokens: int = Field(..., gt=0, description="max toekn size per chunk") + max_tokens: int = Field(..., gt=0, description="max token size per chunk") overlap_tokens: int = Field( # a bit concerned about this ge=0, # this can also be = 0 i suppose From 9478e54856ae737e79b5eefb95991dacb2d9c9f0 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 31 May 2026 14:37:17 +0530 Subject: [PATCH 03/10] feat(harvester): add repository config loader and validation --- .../fixtures/duplicate_repo_ids.yaml | 34 +++++++++++++++++++ .../fixtures/duplicate_repositories.yaml | 34 +++++++++++++++++++ .../fixtures/empty_include_paths.yaml | 16 +++++++++ .../fixtures/invalid_polling_interval.yaml | 17 ++++++++++ .../harvester_test/test_repos_validator.py | 34 +++++++++++++++++++ .../utils/harvester/repos_validator.py | 27 +++++++++++++++ 6 files changed, 162 insertions(+) create mode 100644 application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml create mode 100644 application/tests/harvester_test/fixtures/duplicate_repositories.yaml create mode 100644 application/tests/harvester_test/fixtures/empty_include_paths.yaml create mode 100644 application/tests/harvester_test/fixtures/invalid_polling_interval.yaml create mode 100644 application/tests/harvester_test/test_repos_validator.py create mode 100644 application/utils/harvester/repos_validator.py diff --git a/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml b/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml new file mode 100644 index 000000000..b9f76c2e5 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_repo_ids.yaml @@ -0,0 +1,34 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 + + - id: asvs + type: github + owner: OWASP + repo: CheatSheetSeries + + paths: + include: + - "cheatsheets/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/duplicate_repositories.yaml b/application/tests/harvester_test/fixtures/duplicate_repositories.yaml new file mode 100644 index 000000000..62f019db8 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_repositories.yaml @@ -0,0 +1,34 @@ +repositories: + - id: asvs-1 + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 + + - id: asvs-2 + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/empty_include_paths.yaml b/application/tests/harvester_test/fixtures/empty_include_paths.yaml new file mode 100644 index 000000000..afe23361a --- /dev/null +++ b/application/tests/harvester_test/fixtures/empty_include_paths.yaml @@ -0,0 +1,16 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: [] + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml b/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml new file mode 100644 index 000000000..94923046c --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_polling_interval.yaml @@ -0,0 +1,17 @@ +repositories: + - id: asvs + type: github + owner: OWASP + repo: ASVS + + paths: + include: + - "4.0/en/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + + polling: + mode: incremental + interval_minutes: 0 diff --git a/application/tests/harvester_test/test_repos_validator.py b/application/tests/harvester_test/test_repos_validator.py new file mode 100644 index 000000000..35bc4b8e4 --- /dev/null +++ b/application/tests/harvester_test/test_repos_validator.py @@ -0,0 +1,34 @@ +from pathlib import Path +import pytest +from application.utils.harvester.config_loader import ( + load_repo_config, +) +from application.utils.harvester.repos_validator import ( + RepositoryValidationError, + validate_repositories, +) + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def test_duplicate_repository_ids(): + config_path = FIXTURES_DIR / "duplicate_repo_ids.yaml" + + config = load_repo_config(config_path) + + with pytest.raises( + RepositoryValidationError, + match="Duplicate repository id", + ): + validate_repositories(config) + + +def test_duplicate_repositories(): + config_path = FIXTURES_DIR / "duplicate_repositories.yaml" + config = load_repo_config(config_path) + + with pytest.raises( + RepositoryValidationError, + match="Duplicate repository detected", + ): + validate_repositories(config) diff --git a/application/utils/harvester/repos_validator.py b/application/utils/harvester/repos_validator.py new file mode 100644 index 000000000..60eccdf0c --- /dev/null +++ b/application/utils/harvester/repos_validator.py @@ -0,0 +1,27 @@ +# as the name suggests +from application.utils.harvester.schemas import ReposFile + + +class RepositoryValidationError(Exception): + """Raised when repository configuration fails semantic validation.""" + + +def validate_repositories(config: ReposFile) -> None: + seen_ids: set[str] = set() + seen_repositories: set[tuple[str, str]] = set() + + for repository in config.repositories: + if repository.id in seen_ids: + raise RepositoryValidationError( + f"Duplicate repository id found: {repository.id}" + ) + seen_ids.add(repository.id) + repository_key = ( + repository.owner, + repository.repo, + ) + if repository_key in seen_repositories: + raise RepositoryValidationError( + f"Duplicate repository detected: {repository.owner}/{repository.repo}" + ) + seen_repositories.add(repository_key) From 43fb6cb6e2740d67447d153c43e6cc5335bdcf32 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 31 May 2026 14:55:00 +0530 Subject: [PATCH 04/10] addressing coderabbit --- application/utils/harvester/repos_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/application/utils/harvester/repos_validator.py b/application/utils/harvester/repos_validator.py index 60eccdf0c..74da14df3 100644 --- a/application/utils/harvester/repos_validator.py +++ b/application/utils/harvester/repos_validator.py @@ -17,8 +17,8 @@ def validate_repositories(config: ReposFile) -> None: ) seen_ids.add(repository.id) repository_key = ( - repository.owner, - repository.repo, + repository.owner.casefold(), + repository.repo.casefold(), ) if repository_key in seen_repositories: raise RepositoryValidationError( From b9622463fef447a5c82f32b2dde0f71b125c41b9 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 31 May 2026 15:38:36 +0530 Subject: [PATCH 05/10] addressing coderabbit again --- application/utils/harvester/schemas.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/application/utils/harvester/schemas.py b/application/utils/harvester/schemas.py index 9a94f3285..1c48a0c43 100644 --- a/application/utils/harvester/schemas.py +++ b/application/utils/harvester/schemas.py @@ -1,5 +1,5 @@ from typing import Literal -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field, ConfigDict, model_validator # this will control which repo paths are included and excluded during ingestions @@ -33,6 +33,15 @@ class ChunkingConfig(BaseModel): description="token overlap between adjacent chunks", ) + @model_validator(mode="after") + def overlap_must_be_less_than_max(self) -> "ChunkingConfig": + if self.overlap_tokens >= self.max_tokens: + raise ValueError( + f"overlap_tokens ({self.overlap_tokens}) must be less than " + f"max_tokens ({self.max_tokens})" + ) + return self + # this one defines repository synchronize behaviour class PollingConfig(BaseModel): From b556308ef5ebf1ca055c084378a061065d615041 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 31 May 2026 17:44:34 +0530 Subject: [PATCH 06/10] Add repository validation rules and fixtures --- .../fixtures/duplicate_include_paths.yaml | 22 ++++++++++++++ .../harvester_test/fixtures/empty_owner.yaml | 21 +++++++++++++ .../fixtures/invalid_chunking_strategy.yaml | 21 +++++++++++++ .../fixtures/invalid_polling_mode.yaml | 21 +++++++++++++ .../harvester_test/test_config_loader.py | 30 +++++++++++++++++++ .../harvester_test/test_repos_validator.py | 11 +++++++ .../utils/harvester/exclude_patterns.txt | 10 +++++++ .../utils/harvester/repos_validator.py | 8 ++++- 8 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 application/tests/harvester_test/fixtures/duplicate_include_paths.yaml create mode 100644 application/tests/harvester_test/fixtures/empty_owner.yaml create mode 100644 application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml create mode 100644 application/tests/harvester_test/fixtures/invalid_polling_mode.yaml create mode 100644 application/utils/harvester/exclude_patterns.txt diff --git a/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml b/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml new file mode 100644 index 000000000..7b8cfef04 --- /dev/null +++ b/application/tests/harvester_test/fixtures/duplicate_include_paths.yaml @@ -0,0 +1,22 @@ +repositories: + - id: duplicate-includes + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/empty_owner.yaml b/application/tests/harvester_test/fixtures/empty_owner.yaml new file mode 100644 index 000000000..a32fe69da --- /dev/null +++ b/application/tests/harvester_test/fixtures/empty_owner.yaml @@ -0,0 +1,21 @@ +repositories: + - id: empty-owner + type: github + enabled: true + + owner: "" + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml b/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml new file mode 100644 index 000000000..c4cb10905 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_chunking_strategy.yaml @@ -0,0 +1,21 @@ +repositories: + - id: invalid-strategy + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: invalid_strategy + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: incremental + interval_minutes: 60 diff --git a/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml b/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml new file mode 100644 index 000000000..d21b643f4 --- /dev/null +++ b/application/tests/harvester_test/fixtures/invalid_polling_mode.yaml @@ -0,0 +1,21 @@ +repositories: + - id: invalid-polling + type: github + enabled: true + + owner: OWASP + repo: ASVS + branch: master + + paths: + include: + - "docs/**/*.md" + + chunking: + strategy: markdown + max_tokens: 1200 + overlap_tokens: 100 + + polling: + mode: realtime + interval_minutes: 60 diff --git a/application/tests/harvester_test/test_config_loader.py b/application/tests/harvester_test/test_config_loader.py index 4a02ff2d0..b6ca0e51a 100644 --- a/application/tests/harvester_test/test_config_loader.py +++ b/application/tests/harvester_test/test_config_loader.py @@ -43,3 +43,33 @@ def test_invalid_yaml_syntax(): def test_missing_config_file(): with pytest.raises(FileNotFoundError): load_repo_config("does_not_exist.yaml") + + +def test_empty_owner(): + config_path = FIXTURES_DIR / "empty_owner.yaml" + + with pytest.raises( + ConfigLoaderError, + match="owner", + ): + load_repo_config(config_path) + + +def test_invalid_chunking_strategy(): + config_path = FIXTURES_DIR / "invalid_chunking_strategy.yaml" + + with pytest.raises( + ConfigLoaderError, + match="strategy", + ): + load_repo_config(config_path) + + +def test_invalid_polling_mode(): + config_path = FIXTURES_DIR / "invalid_polling_mode.yaml" + + with pytest.raises( + ConfigLoaderError, + match="mode", + ): + load_repo_config(config_path) diff --git a/application/tests/harvester_test/test_repos_validator.py b/application/tests/harvester_test/test_repos_validator.py index 35bc4b8e4..517a4bbc1 100644 --- a/application/tests/harvester_test/test_repos_validator.py +++ b/application/tests/harvester_test/test_repos_validator.py @@ -32,3 +32,14 @@ def test_duplicate_repositories(): match="Duplicate repository detected", ): validate_repositories(config) + + +def test_duplicate_include_paths(): + config_path = FIXTURES_DIR / "duplicate_include_paths.yaml" + config = load_repo_config(config_path) + + with pytest.raises( + RepositoryValidationError, + match="duplicate include paths", + ): + validate_repositories(config) diff --git a/application/utils/harvester/exclude_patterns.txt b/application/utils/harvester/exclude_patterns.txt new file mode 100644 index 000000000..cd7f642f8 --- /dev/null +++ b/application/utils/harvester/exclude_patterns.txt @@ -0,0 +1,10 @@ +**/.git/** +**/node_modules/** +**/__pycache__/** +**/*.png +**/*.jpg +**/*.jpeg +**/*.svg +**/*.gif +**/*.pdf +**/archive/** diff --git a/application/utils/harvester/repos_validator.py b/application/utils/harvester/repos_validator.py index 74da14df3..2d1f21340 100644 --- a/application/utils/harvester/repos_validator.py +++ b/application/utils/harvester/repos_validator.py @@ -22,6 +22,12 @@ def validate_repositories(config: ReposFile) -> None: ) if repository_key in seen_repositories: raise RepositoryValidationError( - f"Duplicate repository detected: {repository.owner}/{repository.repo}" + f"Duplicate repository detected: " + f"{repository.owner}/{repository.repo}" ) seen_repositories.add(repository_key) + include_patterns = set(repository.paths.include) + if len(include_patterns) != len(repository.paths.include): + raise RepositoryValidationError( + f"Repository '{repository.id}' " f"has duplicate include paths" + ) From fc4b58323b22342da169f50796389dd787542b77 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 31 May 2026 18:17:43 +0530 Subject: [PATCH 07/10] normalize repository ids during validation --- application/utils/harvester/repos_validator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/application/utils/harvester/repos_validator.py b/application/utils/harvester/repos_validator.py index 2d1f21340..7f4858771 100644 --- a/application/utils/harvester/repos_validator.py +++ b/application/utils/harvester/repos_validator.py @@ -11,23 +11,23 @@ def validate_repositories(config: ReposFile) -> None: seen_repositories: set[tuple[str, str]] = set() for repository in config.repositories: - if repository.id in seen_ids: + repo_id_key = repository.id.casefold() + if repo_id_key in seen_ids: raise RepositoryValidationError( f"Duplicate repository id found: {repository.id}" ) - seen_ids.add(repository.id) + seen_ids.add(repo_id_key) repository_key = ( repository.owner.casefold(), repository.repo.casefold(), ) if repository_key in seen_repositories: raise RepositoryValidationError( - f"Duplicate repository detected: " - f"{repository.owner}/{repository.repo}" + f"Duplicate repository detected: {repository.owner}/{repository.repo}" ) seen_repositories.add(repository_key) include_patterns = set(repository.paths.include) if len(include_patterns) != len(repository.paths.include): raise RepositoryValidationError( - f"Repository '{repository.id}' " f"has duplicate include paths" + f"Repository '{repository.id}' has duplicate include paths" ) From 713f99d75106b1a121c79c710aebfc4c2301db74 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Tue, 2 Jun 2026 09:51:15 +0530 Subject: [PATCH 08/10] Refine harvester validation exports and YAML fixtures --- .../tests/harvester_test/fixtures/invalid_yaml.yaml | 6 +++--- application/utils/harvester/__init__.py | 7 ++++++- application/utils/harvester/repos_validator.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/application/tests/harvester_test/fixtures/invalid_yaml.yaml b/application/tests/harvester_test/fixtures/invalid_yaml.yaml index 3f74ab21c..c60d3e23b 100644 --- a/application/tests/harvester_test/fixtures/invalid_yaml.yaml +++ b/application/tests/harvester_test/fixtures/invalid_yaml.yaml @@ -1,4 +1,4 @@ repositories: - - repository_id: owasp-top10 - source: - type github + - id: broken + paths: + include: [unclosed diff --git a/application/utils/harvester/__init__.py b/application/utils/harvester/__init__.py index 928af5c96..0be31a8a4 100644 --- a/application/utils/harvester/__init__.py +++ b/application/utils/harvester/__init__.py @@ -2,7 +2,10 @@ ConfigLoaderError, load_repo_config, ) - +from .repos_validator import ( + RepositoryValidationError, + validate_repositories, +) from .schemas import ( ChunkingConfig, PathRules, @@ -19,4 +22,6 @@ "RepositoryConfig", "ReposFile", "load_repo_config", + "RepositoryValidationError", + "validate_repositories", ] diff --git a/application/utils/harvester/repos_validator.py b/application/utils/harvester/repos_validator.py index 7f4858771..81cdc8273 100644 --- a/application/utils/harvester/repos_validator.py +++ b/application/utils/harvester/repos_validator.py @@ -1,5 +1,5 @@ # as the name suggests -from application.utils.harvester.schemas import ReposFile +from .schemas import ReposFile class RepositoryValidationError(Exception): From c7ecbf08cff5e4c1c42049eeae7e244575b59d77 Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 7 Jun 2026 12:14:07 +0530 Subject: [PATCH 09/10] removed stable dev comments --- application/utils/harvester/schemas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/application/utils/harvester/schemas.py b/application/utils/harvester/schemas.py index 1c48a0c43..fd9d7d720 100644 --- a/application/utils/harvester/schemas.py +++ b/application/utils/harvester/schemas.py @@ -27,8 +27,8 @@ class ChunkingConfig(BaseModel): max_tokens: int = Field(..., gt=0, description="max token size per chunk") - overlap_tokens: int = Field( # a bit concerned about this - ge=0, # this can also be = 0 i suppose + overlap_tokens: int = Field( + ge=0, default=100, description="token overlap between adjacent chunks", ) From 0be6df87c7642908c76dc915c48b54c30227acff Mon Sep 17 00:00:00 2001 From: ParthAggarwal16 Date: Sun, 7 Jun 2026 12:44:40 +0530 Subject: [PATCH 10/10] Add validator success test and clean exports --- application/tests/harvester_test/test_repos_validator.py | 8 ++++++++ application/utils/harvester/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/application/tests/harvester_test/test_repos_validator.py b/application/tests/harvester_test/test_repos_validator.py index 517a4bbc1..8c03a9de8 100644 --- a/application/tests/harvester_test/test_repos_validator.py +++ b/application/tests/harvester_test/test_repos_validator.py @@ -43,3 +43,11 @@ def test_duplicate_include_paths(): match="duplicate include paths", ): validate_repositories(config) + + +def test_validate_valid_repositories(): + config_path = FIXTURES_DIR / "valid_repos.yaml" + + config = load_repo_config(config_path) + + validate_repositories(config) diff --git a/application/utils/harvester/__init__.py b/application/utils/harvester/__init__.py index 0be31a8a4..9c74939c7 100644 --- a/application/utils/harvester/__init__.py +++ b/application/utils/harvester/__init__.py @@ -20,8 +20,8 @@ "PathRules", "PollingConfig", "RepositoryConfig", + "RepositoryValidationError", "ReposFile", "load_repo_config", - "RepositoryValidationError", "validate_repositories", ]