From d9581c12b4b5f5d0337d70f349dacd5a742ded2b Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Sun, 4 Jan 2026 16:07:58 +0000 Subject: [PATCH 1/3] feat: add OSA_DATA_DIR support for container deployments Enable single environment variable to control all data paths for container deployments while preserving XDG defaults for local dev. Key changes: - Add PathsConfig to read OSA_DATA_DIR with Pydantic Settings - Refactor OSAPaths to support unified (/data/*) and XDG modes - Derive database URL from paths via model_validator when not set - Derive vector persist_dir from OSAPaths in DI provider - Move file I/O from OSAPaths to dedicated utility modules Design decisions: - Nested configs (DatabaseConfig, etc.) changed from BaseSettings to BaseModel to enable env_nested_delimiter for OSA_DATABASE__URL - PathsConfig uses default_factory to avoid class-definition-time evaluation of environment variables - Empty string sentinel for database.url to detect "derive from paths" Closes #18 --- osa/application/di.py | 6 +- osa/cli/commands/admin.py | 4 +- osa/cli/commands/search.py | 4 +- osa/cli/commands/show.py | 4 +- osa/cli/util/__init__.py | 15 +- osa/cli/util/daemon.py | 21 ++- osa/cli/util/paths.py | 137 +++++------------- osa/cli/util/search_cache.py | 60 ++++++++ osa/cli/util/server_state.py | 65 +++++++++ osa/config.py | 80 ++++++++-- osa/infrastructure/index/di.py | 20 ++- osa/infrastructure/index/vector/backend.py | 4 + osa/infrastructure/index/vector/config.py | 8 +- tests/unit/cli/__init__.py | 0 tests/unit/cli/util/__init__.py | 0 tests/unit/cli/util/test_paths.py | 125 ++++++++++++++++ tests/unit/config/__init__.py | 0 tests/unit/config/test_paths_config.py | 96 ++++++++++++ .../index/test_vector_config.py | 54 +++++++ 19 files changed, 572 insertions(+), 131 deletions(-) create mode 100644 osa/cli/util/search_cache.py create mode 100644 osa/cli/util/server_state.py create mode 100644 tests/unit/cli/__init__.py create mode 100644 tests/unit/cli/util/__init__.py create mode 100644 tests/unit/cli/util/test_paths.py create mode 100644 tests/unit/config/__init__.py create mode 100644 tests/unit/config/test_paths_config.py create mode 100644 tests/unit/infrastructure/index/test_vector_config.py diff --git a/osa/application/di.py b/osa/application/di.py index 4fb5eec..44b51be 100644 --- a/osa/application/di.py +++ b/osa/application/di.py @@ -1,5 +1,6 @@ from dishka import AsyncContainer, make_async_container +from osa.cli.util.paths import OSAPaths from osa.config import Config from osa.domain.deposition.util.di import DepositionProvider from osa.domain.validation.util.di import ValidationProvider @@ -14,6 +15,9 @@ def create_container() -> AsyncContainer: config = Config() + # Create OSAPaths from config, supporting both unified and XDG modes + paths = OSAPaths(unified_data_dir=config.paths.data_dir) + return make_async_container( PersistenceProvider(), OciProvider(), @@ -22,6 +26,6 @@ def create_container() -> AsyncContainer: EventProvider(), DepositionProvider(), ValidationProvider(), - context={Config: config}, + context={Config: config, OSAPaths: paths}, scopes=Scope, # type: ignore[arg-type] # Custom scope class ) diff --git a/osa/cli/commands/admin.py b/osa/cli/commands/admin.py index e9c5d60..e2b4a8a 100644 --- a/osa/cli/commands/admin.py +++ b/osa/cli/commands/admin.py @@ -7,7 +7,7 @@ import cyclopts from osa.cli.console import get_console -from osa.cli.util import DaemonManager, OSAPaths, ServerStatus +from osa.cli.util import DaemonManager, OSAPaths, ServerStatus, read_server_state app = cyclopts.App(name="admin", help="Administrative commands") @@ -147,7 +147,7 @@ def exists_marker(path: Path) -> str: # Server status console.print() - state = paths.read_server_state() + state = read_server_state(paths.server_state_file) if state: from osa.cli.console import relative_time diff --git a/osa/cli/commands/search.py b/osa/cli/commands/search.py index fc962a1..85dd7c3 100644 --- a/osa/cli/commands/search.py +++ b/osa/cli/commands/search.py @@ -11,7 +11,7 @@ from osa.cli.console import get_console from osa.cli.models import RecordMetadata, SearchHit -from osa.cli.util import OSAPaths +from osa.cli.util import OSAPaths, write_search_cache app = cyclopts.App(name="search", help="Search commands") @@ -97,7 +97,7 @@ def search( # Save to cache for `osa show ` lookup if hits: paths = OSAPaths() - paths.write_search_cache(index=index, query=query, results=hits) + write_search_cache(paths.search_cache_file, index=index, query=query, results=hits) console.search_hint() except httpx.ConnectError: diff --git a/osa/cli/commands/show.py b/osa/cli/commands/show.py index a956824..7f064fb 100644 --- a/osa/cli/commands/show.py +++ b/osa/cli/commands/show.py @@ -6,7 +6,7 @@ from osa.cli.console import get_console from osa.cli.models import SearchHit -from osa.cli.util import OSAPaths +from osa.cli.util import OSAPaths, read_search_cache app = cyclopts.App(name="show", help="Show record details from last search") @@ -20,7 +20,7 @@ def show(ref: str, /) -> None: """ console = get_console() paths = OSAPaths() - cache = paths.read_search_cache() + cache = read_search_cache(paths.search_cache_file) if cache is None: console.error( diff --git a/osa/cli/util/__init__.py b/osa/cli/util/__init__.py index 07856d6..dc1c1cc 100644 --- a/osa/cli/util/__init__.py +++ b/osa/cli/util/__init__.py @@ -1,7 +1,13 @@ -"""CLI utilities (~/.osa directory, daemon management, caching).""" +"""CLI utilities (paths, daemon management, caching).""" from osa.cli.util.daemon import ConfigError, DaemonManager, ServerInfo, ServerStatus from osa.cli.util.paths import OSAPaths, ServerState +from osa.cli.util.search_cache import read_search_cache, write_search_cache +from osa.cli.util.server_state import ( + read_server_state, + remove_server_state, + write_server_state, +) __all__ = [ "ConfigError", @@ -10,4 +16,11 @@ "ServerInfo", "ServerState", "ServerStatus", + # Server state I/O + "read_server_state", + "write_server_state", + "remove_server_state", + # Search cache I/O + "read_search_cache", + "write_search_cache", ] diff --git a/osa/cli/util/daemon.py b/osa/cli/util/daemon.py index 42509e9..e7d99e2 100644 --- a/osa/cli/util/daemon.py +++ b/osa/cli/util/daemon.py @@ -11,6 +11,11 @@ from pydantic import ValidationError from osa.cli.util.paths import OSAPaths +from osa.cli.util.server_state import ( + read_server_state, + remove_server_state, + write_server_state, +) class ServerStatus(Enum): @@ -54,7 +59,7 @@ def paths(self) -> OSAPaths: def status(self) -> ServerInfo: """Get server status.""" - state = self._paths.read_server_state() + state = read_server_state(self._paths.server_state_file) if state is None: return ServerInfo(status=ServerStatus.STOPPED) @@ -105,7 +110,7 @@ def start( if current.status == ServerStatus.STALE: # Clean up stale state file - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) self._paths.ensure_directories() @@ -165,13 +170,13 @@ def start( ) # Write server state - state = self._paths.write_server_state(process.pid, host, port) + state = write_server_state(self._paths.server_state_file, process.pid, host, port) # Wait briefly and verify it started time.sleep(0.5) if process.poll() is not None: # Process exited immediately - error - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) raise RuntimeError(f"Server failed to start. Check logs at {self._paths.server_log}") return ServerInfo( @@ -201,7 +206,7 @@ def stop(self, timeout: float = 10.0) -> ServerInfo: if current.status == ServerStatus.STALE: # Just clean up the stale state file - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) return ServerInfo(status=ServerStatus.STOPPED) pid = current.pid @@ -212,14 +217,14 @@ def stop(self, timeout: float = 10.0) -> ServerInfo: os.kill(pid, signal.SIGTERM) except ProcessLookupError: # Process already dead - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) return ServerInfo(status=ServerStatus.STOPPED) # Wait for process to exit start_time = time.time() while time.time() - start_time < timeout: if not self._is_process_running(pid): - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) return ServerInfo(status=ServerStatus.STOPPED) time.sleep(0.1) @@ -230,7 +235,7 @@ def stop(self, timeout: float = 10.0) -> ServerInfo: except ProcessLookupError: pass - self._paths.remove_server_state() + remove_server_state(self._paths.server_state_file) return ServerInfo(status=ServerStatus.STOPPED) def _is_process_running(self, pid: int) -> bool: diff --git a/osa/cli/util/paths.py b/osa/cli/util/paths.py index 1914f02..646e5d7 100644 --- a/osa/cli/util/paths.py +++ b/osa/cli/util/paths.py @@ -1,31 +1,28 @@ -"""Manages OSA directory structure following XDG Base Directory spec. +"""Manages OSA directory structure. -Directory layout: - ~/.config/osa/ - config.yaml # User configuration +Supports two modes: - ~/.local/share/osa/ - osa.db # SQLite database - vectors/ # Vector index (ChromaDB) +1. **Unified mode** (OSA_DATA_DIR set): All data under a single directory + for container deployments with single volume mount. - ~/.local/state/osa/ - server.json # Daemon state (PID, host, port) - logs/ - server.log # Server logs + OSA_DATA_DIR=/data + /data/config/ → config files + /data/data/ → database, vectors + /data/state/ → logs, server state + /data/cache/ → CLI cache - ~/.cache/osa/ - last_search.json # CLI search cache +2. **XDG mode** (default): Follows XDG Base Directory specification + for local development. + + ~/.config/osa/ → config files + ~/.local/share/osa/ → database, vectors + ~/.local/state/osa/ → logs, server state + ~/.cache/osa/ → CLI cache """ -import json -from dataclasses import asdict, dataclass -from datetime import UTC, datetime +from dataclasses import dataclass from pathlib import Path -from pydantic import ValidationError - -from osa.cli.models import SearchCache, SearchHit - @dataclass class ServerState: @@ -38,32 +35,32 @@ class ServerState: class OSAPaths: - """Manages OSA paths following XDG Base Directory specification. + """Computes OSA paths for unified or XDG mode. - Supports overriding individual directories for testing. + This class is purely for path computation - no file I/O operations. + Use server_state.py and search_cache.py for reading/writing files. """ - def __init__( - self, - *, - config_dir: Path | None = None, - data_dir: Path | None = None, - state_dir: Path | None = None, - cache_dir: Path | None = None, - ) -> None: + def __init__(self, *, unified_data_dir: Path | None = None) -> None: """Initialize paths. Args: - config_dir: Override config directory (default: ~/.config/osa). - data_dir: Override data directory (default: ~/.local/share/osa). - state_dir: Override state directory (default: ~/.local/state/osa). - cache_dir: Override cache directory (default: ~/.cache/osa). + unified_data_dir: If set, derive all paths from this directory + (container mode). If None, use XDG defaults (local mode). """ - home = Path.home() - self._config_dir = config_dir or home / ".config" / "osa" - self._data_dir = data_dir or home / ".local" / "share" / "osa" - self._state_dir = state_dir or home / ".local" / "state" / "osa" - self._cache_dir = cache_dir or home / ".cache" / "osa" + if unified_data_dir is not None: + # Unified mode: all paths under single directory + self._config_dir = unified_data_dir / "config" + self._data_dir = unified_data_dir / "data" + self._state_dir = unified_data_dir / "state" + self._cache_dir = unified_data_dir / "cache" + else: + # XDG mode: standard paths for local development + home = Path.home() + self._config_dir = home / ".config" / "osa" + self._data_dir = home / ".local" / "share" / "osa" + self._state_dir = home / ".local" / "state" / "osa" + self._cache_dir = home / ".cache" / "osa" # ------------------------------------------------------------------------- # Base directories @@ -156,65 +153,3 @@ def ensure_directories(self) -> None: def is_initialized(self) -> bool: """Check if OSA has been initialized (config file exists).""" return self.config_file.exists() - - # ------------------------------------------------------------------------- - # Server state - # ------------------------------------------------------------------------- - - def read_server_state(self) -> ServerState | None: - """Read server state from file.""" - if not self.server_state_file.exists(): - return None - try: - data = json.loads(self.server_state_file.read_text()) - return ServerState(**data) - except (json.JSONDecodeError, TypeError, KeyError, OSError): - return None - - def write_server_state(self, pid: int, host: str, port: int) -> ServerState: - """Write server state to file.""" - self._state_dir.mkdir(parents=True, exist_ok=True) - state = ServerState( - pid=pid, - host=host, - port=port, - started_at=datetime.now(UTC).isoformat(), - ) - self.server_state_file.write_text(json.dumps(asdict(state), indent=2)) - return state - - def remove_server_state(self) -> None: - """Remove server state file.""" - if self.server_state_file.exists(): - self.server_state_file.unlink() - - # ------------------------------------------------------------------------- - # Search cache - # ------------------------------------------------------------------------- - - def read_search_cache(self) -> SearchCache | None: - """Read cached search results.""" - if not self.search_cache_file.exists(): - return None - try: - data = json.loads(self.search_cache_file.read_text()) - return SearchCache.model_validate(data) - except (json.JSONDecodeError, ValidationError, OSError): - return None - - def write_search_cache( - self, - index: str, - query: str, - results: list[SearchHit], - ) -> SearchCache: - """Write search results to cache for numbered lookup.""" - self._cache_dir.mkdir(parents=True, exist_ok=True) - cache = SearchCache( - index=index, - query=query, - searched_at=datetime.now(UTC).isoformat(), - results=results, - ) - self.search_cache_file.write_text(cache.model_dump_json(indent=2)) - return cache diff --git a/osa/cli/util/search_cache.py b/osa/cli/util/search_cache.py new file mode 100644 index 0000000..ebde485 --- /dev/null +++ b/osa/cli/util/search_cache.py @@ -0,0 +1,60 @@ +"""Search cache file I/O utilities. + +Handles reading and writing the search results cache file that enables +numbered result lookup in CLI (e.g., `osa show 1`). +""" + +from datetime import UTC, datetime +from pathlib import Path + +from pydantic import ValidationError + +from osa.cli.models import SearchCache, SearchHit + + +def read_search_cache(cache_file: Path) -> SearchCache | None: + """Read cached search results. + + Args: + cache_file: Path to the cache file (e.g., last_search.json). + + Returns: + SearchCache if file exists and is valid, None otherwise. + """ + if not cache_file.exists(): + return None + try: + data = cache_file.read_text() + return SearchCache.model_validate_json(data) + except (ValidationError, OSError): + return None + + +def write_search_cache( + cache_file: Path, + index: str, + query: str, + results: list[SearchHit], +) -> SearchCache: + """Write search results to cache for numbered lookup. + + Creates parent directories if they don't exist. + + Args: + cache_file: Path to the cache file. + index: Name of the index that was searched. + query: The search query string. + results: List of search result hits. + + Returns: + The created SearchCache. + """ + cache_file.parent.mkdir(parents=True, exist_ok=True) + cache = SearchCache( + index=index, + query=query, + searched_at=datetime.now(UTC).isoformat(), + results=results, + ) + cache_file.write_text(cache.model_dump_json(indent=2)) + return cache diff --git a/osa/cli/util/server_state.py b/osa/cli/util/server_state.py new file mode 100644 index 0000000..8688bc0 --- /dev/null +++ b/osa/cli/util/server_state.py @@ -0,0 +1,65 @@ +"""Server state file I/O utilities. + +Handles reading and writing the server.json state file that tracks +the running daemon's PID, host, port, and start time. +""" + +import json +from dataclasses import asdict +from datetime import UTC, datetime +from pathlib import Path + +from osa.cli.util.paths import ServerState + + +def read_server_state(state_file: Path) -> ServerState | None: + """Read server state from file. + + Args: + state_file: Path to the server.json file. + + Returns: + ServerState if file exists and is valid, None otherwise. + """ + if not state_file.exists(): + return None + try: + data = json.loads(state_file.read_text()) + return ServerState(**data) + except (json.JSONDecodeError, TypeError, KeyError, OSError): + return None + + +def write_server_state(state_file: Path, pid: int, host: str, port: int) -> ServerState: + """Write server state to file. + + Creates parent directories if they don't exist. + + Args: + state_file: Path to the server.json file. + pid: Process ID of the running server. + host: Host the server is listening on. + port: Port the server is listening on. + + Returns: + The created ServerState. + """ + state_file.parent.mkdir(parents=True, exist_ok=True) + state = ServerState( + pid=pid, + host=host, + port=port, + started_at=datetime.now(UTC).isoformat(), + ) + state_file.write_text(json.dumps(asdict(state), indent=2)) + return state + + +def remove_server_state(state_file: Path) -> None: + """Remove server state file. + + Args: + state_file: Path to the server.json file. + """ + if state_file.exists(): + state_file.unlink() diff --git a/osa/config.py b/osa/config.py index 96b3375..e0ae028 100644 --- a/osa/config.py +++ b/osa/config.py @@ -6,12 +6,35 @@ from typing import Annotated, Any, Union import yaml -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from pydantic_settings import BaseSettings, PydanticBaseSettingsSource +from typing_extensions import Self +from osa.cli.util.paths import OSAPaths from osa.infrastructure.index.vector.config import VectorBackendConfig +# ============================================================================= +# Paths Configuration +# ============================================================================= + + +class PathsConfig(BaseSettings): + """Configuration for OSA data paths. + + When data_dir is set (e.g., OSA_DATA_DIR=/data), all paths are derived + from this single directory, enabling simple container deployments with + a single volume mount. + + When data_dir is None (default), XDG Base Directory paths are used + (~/.config/osa, ~/.local/share/osa, etc.) for local development. + """ + + data_dir: Path | None = None # OSA_DATA_DIR - unified data directory for containers + + model_config = {"env_prefix": "OSA_"} + + # ============================================================================= # Index Configuration # ============================================================================= @@ -97,24 +120,37 @@ def _load_yaml_config(self) -> dict[str, Any]: return {} -class Frontend(BaseSettings): +class Frontend(BaseModel): + """Frontend configuration (nested in Config, uses env_nested_delimiter).""" + url: str = "http://localhost:3000" -class Server(BaseSettings): +class Server(BaseModel): + """Server configuration (nested in Config, uses env_nested_delimiter).""" + name: str = "Open Science Archive" version: str = "0.0.1" # TODO: better type? description: str = "An open platform for depositing scientific data" domain: str = "localhost" # Node domain for SRN construction -class DatabaseConfig(BaseSettings): - url: str = "sqlite+aiosqlite:///~/.local/share/osa/osa.db" # XDG data directory +class DatabaseConfig(BaseModel): + """Database configuration (nested in Config, uses env_nested_delimiter). + + The url field uses empty string as sentinel to indicate "derive from OSAPaths". + When user doesn't override via OSA_DATABASE__URL, we compute the actual path + in Config's model_validator. + """ + + url: str = "" # Empty string = derive from paths; explicit value = use as-is echo: bool = False auto_migrate: bool = True # Auto-migrate for SQLite, manual for PostgreSQL -class LoggingConfig(BaseSettings): +class LoggingConfig(BaseModel): + """Logging configuration (nested in Config, uses env_nested_delimiter).""" + level: str = "DEBUG" # Root log level (DEBUG for development) format: str = "%(asctime)s %(levelname)-8s [%(name)s] %(message)s" date_format: str = "%Y-%m-%d %H:%M:%S" @@ -126,6 +162,10 @@ def file(self) -> str | None: class Config(BaseSettings): + # Use default_factory for PathsConfig (BaseSettings) to ensure env vars are read + # at instantiation time, not class definition time (avoids Pydantic footgun) + paths: PathsConfig = Field(default_factory=PathsConfig) + # These are BaseModel, so env_nested_delimiter handles their env vars server: Server = Server() frontend: Frontend = Frontend() database: DatabaseConfig = DatabaseConfig() @@ -133,10 +173,30 @@ class Config(BaseSettings): indexes: list[IndexConfig] = [] # list of index configs ingestors: list[IngestConfig] = [] # list of ingestor configs - class Config: - env_file = ".env" - env_file_encoding = "utf-8" - env_nested_delimiter = "__" # Allows OSA_DATABASE__URL override + model_config = { + "env_prefix": "OSA_", + "env_file": ".env", + "env_file_encoding": "utf-8", + "env_nested_delimiter": "__", # Allows OSA_DATABASE__URL override + } + + @model_validator(mode="after") + def derive_database_url(self) -> Self: + """Derive database URL from paths if not explicitly set. + + When database.url is empty (sentinel value), compute the path from OSAPaths. + This allows OSA_DATA_DIR to control the database location while still + allowing explicit OSA_DATABASE__URL override. + """ + if not self.database.url: + # URL is sentinel (empty), derive from paths + osa_paths = OSAPaths(unified_data_dir=self.paths.data_dir) + self.database = DatabaseConfig( + url=f"sqlite+aiosqlite:///{osa_paths.database_file}", + echo=self.database.echo, + auto_migrate=self.database.auto_migrate, + ) + return self @classmethod def settings_customise_sources( diff --git a/osa/infrastructure/index/di.py b/osa/infrastructure/index/di.py index b0b02ee..8e5ee3a 100644 --- a/osa/infrastructure/index/di.py +++ b/osa/infrastructure/index/di.py @@ -2,10 +2,12 @@ from dishka import Provider, provide +from osa.cli.util.paths import OSAPaths from osa.config import Config from osa.domain.index.model.registry import IndexRegistry from osa.domain.index.service import IndexService from osa.infrastructure.index.vector.backend import VectorStorageBackend +from osa.infrastructure.index.vector.config import VectorBackendConfig from osa.sdk.index.backend import StorageBackend from osa.util.di.scope import Scope @@ -14,9 +16,13 @@ class IndexProvider(Provider): """Provides configured index backends.""" @provide(scope=Scope.APP) - def get_backends(self, config: Config) -> IndexRegistry: + def get_backends(self, config: Config, paths: OSAPaths) -> IndexRegistry: """Build all configured index backends. + Args: + config: Application configuration with index configs. + paths: OSAPaths for deriving default persist_dir. + Returns: Registry of index backends. """ @@ -24,7 +30,17 @@ def get_backends(self, config: Config) -> IndexRegistry: for idx_config in config.indexes: if idx_config.backend == "vector": - backends[idx_config.name] = VectorStorageBackend(idx_config.name, idx_config.config) + vector_config = idx_config.config + # Derive persist_dir from OSAPaths if not explicitly set + if ( + isinstance(vector_config, VectorBackendConfig) + and vector_config.persist_dir is None + ): + vector_config = VectorBackendConfig( + persist_dir=paths.vectors_dir, + embedding=vector_config.embedding, + ) + backends[idx_config.name] = VectorStorageBackend(idx_config.name, vector_config) # Add more backend types here as they're implemented # elif idx_config.backend == "keyword": # backends[idx_config.name] = KeywordStorageBackend( diff --git a/osa/infrastructure/index/vector/backend.py b/osa/infrastructure/index/vector/backend.py index 46c95ed..b3a853f 100644 --- a/osa/infrastructure/index/vector/backend.py +++ b/osa/infrastructure/index/vector/backend.py @@ -22,6 +22,10 @@ def __init__(self, name: str, config: VectorBackendConfig) -> None: self._config = config self._model = SentenceTransformer(config.embedding.model.value) + # persist_dir must be set by DI (derived from OSAPaths if not explicit) + if config.persist_dir is None: + raise ValueError("VectorBackendConfig.persist_dir must be set") + # Expand ~ to home directory and ensure persist directory exists persist_dir = config.persist_dir.expanduser() persist_dir.mkdir(parents=True, exist_ok=True) diff --git a/osa/infrastructure/index/vector/config.py b/osa/infrastructure/index/vector/config.py index 495fda5..6c9cf2b 100644 --- a/osa/infrastructure/index/vector/config.py +++ b/osa/infrastructure/index/vector/config.py @@ -31,7 +31,11 @@ class EmbeddingConfig(BaseModel): class VectorBackendConfig(BackendConfig): - """ChromaDB + sentence-transformers specific configuration.""" + """ChromaDB + sentence-transformers specific configuration. - persist_dir: Path + If persist_dir is not specified (None), it will be derived from OSAPaths.vectors_dir + in the DI provider, respecting OSA_DATA_DIR for container deployments. + """ + + persist_dir: Path | None = None # None = derive from OSAPaths embedding: EmbeddingConfig = EmbeddingConfig() diff --git a/tests/unit/cli/__init__.py b/tests/unit/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/cli/util/__init__.py b/tests/unit/cli/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/cli/util/test_paths.py b/tests/unit/cli/util/test_paths.py new file mode 100644 index 0000000..d64902e --- /dev/null +++ b/tests/unit/cli/util/test_paths.py @@ -0,0 +1,125 @@ +"""Tests for OSAPaths path computation.""" + +from pathlib import Path + +import pytest + +from osa.cli.util.paths import OSAPaths + + +class TestOSAPathsXDGMode: + """Tests for XDG mode (default, when unified_data_dir is None).""" + + def test_xdg_mode_uses_home_directory(self, monkeypatch: pytest.MonkeyPatch) -> None: + """XDG mode should derive paths from home directory.""" + # Patch Path.home() to use a test path + test_home = Path("/test/home") + monkeypatch.setattr(Path, "home", lambda: test_home) + + paths = OSAPaths() + + assert paths.config_dir == test_home / ".config" / "osa" + assert paths.data_dir == test_home / ".local" / "share" / "osa" + assert paths.state_dir == test_home / ".local" / "state" / "osa" + assert paths.cache_dir == test_home / ".cache" / "osa" + + def test_xdg_mode_file_paths(self, monkeypatch: pytest.MonkeyPatch) -> None: + """XDG mode should derive file paths correctly.""" + test_home = Path("/test/home") + monkeypatch.setattr(Path, "home", lambda: test_home) + + paths = OSAPaths() + + assert paths.config_file == test_home / ".config" / "osa" / "config.yaml" + assert paths.database_file == test_home / ".local" / "share" / "osa" / "osa.db" + assert paths.vectors_dir == test_home / ".local" / "share" / "osa" / "vectors" + assert paths.server_state_file == test_home / ".local" / "state" / "osa" / "server.json" + assert paths.logs_dir == test_home / ".local" / "state" / "osa" / "logs" + assert paths.server_log == test_home / ".local" / "state" / "osa" / "logs" / "server.log" + assert paths.search_cache_file == test_home / ".cache" / "osa" / "last_search.json" + + +class TestOSAPathsUnifiedMode: + """Tests for unified mode (when unified_data_dir is set).""" + + def test_unified_mode_derives_all_paths_from_data_dir(self) -> None: + """Unified mode should derive all paths from the single data directory.""" + data_dir = Path("/data") + paths = OSAPaths(unified_data_dir=data_dir) + + assert paths.config_dir == data_dir / "config" + assert paths.data_dir == data_dir / "data" + assert paths.state_dir == data_dir / "state" + assert paths.cache_dir == data_dir / "cache" + + def test_unified_mode_file_paths(self) -> None: + """Unified mode should derive file paths correctly.""" + data_dir = Path("/data") + paths = OSAPaths(unified_data_dir=data_dir) + + assert paths.config_file == data_dir / "config" / "config.yaml" + assert paths.database_file == data_dir / "data" / "osa.db" + assert paths.vectors_dir == data_dir / "data" / "vectors" + assert paths.server_state_file == data_dir / "state" / "server.json" + assert paths.logs_dir == data_dir / "state" / "logs" + assert paths.server_log == data_dir / "state" / "logs" / "server.log" + assert paths.search_cache_file == data_dir / "cache" / "last_search.json" + + def test_unified_mode_with_absolute_path(self) -> None: + """Unified mode should work with absolute paths.""" + data_dir = Path("/var/lib/osa") + paths = OSAPaths(unified_data_dir=data_dir) + + assert paths.data_dir == Path("/var/lib/osa/data") + assert paths.database_file == Path("/var/lib/osa/data/osa.db") + + def test_unified_mode_subdirectory_structure(self) -> None: + """Unified mode should create expected subdirectory structure.""" + data_dir = Path("/data") + paths = OSAPaths(unified_data_dir=data_dir) + + # Verify the structure matches the spec + assert paths.config_dir.parent == data_dir + assert paths.data_dir.parent == data_dir + assert paths.state_dir.parent == data_dir + assert paths.cache_dir.parent == data_dir + + +class TestOSAPathsEnsureDirectories: + """Tests for ensure_directories method.""" + + def test_ensure_directories_creates_all_dirs(self, tmp_path: Path) -> None: + """ensure_directories should create all required directories.""" + paths = OSAPaths(unified_data_dir=tmp_path) + paths.ensure_directories() + + assert paths.config_dir.exists() + assert paths.data_dir.exists() + assert paths.state_dir.exists() + assert paths.cache_dir.exists() + assert paths.logs_dir.exists() + assert paths.vectors_dir.exists() + + def test_ensure_directories_is_idempotent(self, tmp_path: Path) -> None: + """ensure_directories should be safe to call multiple times.""" + paths = OSAPaths(unified_data_dir=tmp_path) + paths.ensure_directories() + paths.ensure_directories() # Should not raise + + assert paths.config_dir.exists() + + +class TestOSAPathsIsInitialized: + """Tests for is_initialized method.""" + + def test_is_initialized_returns_false_when_no_config(self, tmp_path: Path) -> None: + """is_initialized should return False when config file doesn't exist.""" + paths = OSAPaths(unified_data_dir=tmp_path) + assert not paths.is_initialized() + + def test_is_initialized_returns_true_when_config_exists(self, tmp_path: Path) -> None: + """is_initialized should return True when config file exists.""" + paths = OSAPaths(unified_data_dir=tmp_path) + paths.ensure_directories() + paths.config_file.write_text("server:\n name: test") + assert paths.is_initialized() diff --git a/tests/unit/config/__init__.py b/tests/unit/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/config/test_paths_config.py b/tests/unit/config/test_paths_config.py new file mode 100644 index 0000000..3e8053f --- /dev/null +++ b/tests/unit/config/test_paths_config.py @@ -0,0 +1,96 @@ +"""Tests for PathsConfig Pydantic Settings.""" + +from pathlib import Path + +import pytest + +from osa.config import Config, PathsConfig + + +class TestPathsConfig: + """Tests for PathsConfig Pydantic Settings class.""" + + def test_data_dir_defaults_to_none(self) -> None: + """data_dir should default to None (XDG mode).""" + config = PathsConfig() + assert config.data_dir is None + + def test_data_dir_from_env_var(self, monkeypatch: pytest.MonkeyPatch) -> None: + """data_dir should be set from OSA_DATA_DIR environment variable.""" + monkeypatch.setenv("OSA_DATA_DIR", "/data") + config = PathsConfig() + assert config.data_dir == Path("/data") + + def test_data_dir_with_absolute_path(self, monkeypatch: pytest.MonkeyPatch) -> None: + """data_dir should accept absolute paths.""" + monkeypatch.setenv("OSA_DATA_DIR", "/var/lib/osa") + config = PathsConfig() + assert config.data_dir == Path("/var/lib/osa") + + def test_data_dir_with_relative_path(self, monkeypatch: pytest.MonkeyPatch) -> None: + """data_dir should accept relative paths (though not recommended).""" + monkeypatch.setenv("OSA_DATA_DIR", "./data") + config = PathsConfig() + assert config.data_dir == Path("./data") + + def test_env_prefix_is_osa(self) -> None: + """PathsConfig should use OSA_ prefix for env vars.""" + # Verify the model config + assert PathsConfig.model_config.get("env_prefix") == "OSA_" + + +class TestDatabaseUrlDerivation: + """Tests for database URL derivation from paths (T029-T031).""" + + def test_database_url_derived_from_osa_data_dir(self, monkeypatch: pytest.MonkeyPatch) -> None: + """T029: Database URL should derive from OSA_DATA_DIR when set.""" + monkeypatch.setenv("OSA_DATA_DIR", "/data") + # Clear any existing database URL override + monkeypatch.delenv("OSA_DATABASE__URL", raising=False) + + config = Config() + + # Should derive database path from /data/data/osa.db + assert config.database.url == "sqlite+aiosqlite:////data/data/osa.db" + + def test_database_url_derived_from_xdg_defaults(self, monkeypatch: pytest.MonkeyPatch) -> None: + """T030: Database URL should derive from XDG defaults when OSA_DATA_DIR not set.""" + # Clear OSA_DATA_DIR to use XDG mode + monkeypatch.delenv("OSA_DATA_DIR", raising=False) + monkeypatch.delenv("OSA_DATABASE__URL", raising=False) + + # Patch Path.home() to use a test path + test_home = Path("/test/home") + monkeypatch.setattr(Path, "home", lambda: test_home) + + config = Config() + + # Should derive database path from XDG data directory + expected_path = test_home / ".local" / "share" / "osa" / "osa.db" + assert config.database.url == f"sqlite+aiosqlite:///{expected_path}" + + def test_explicit_database_url_override(self, monkeypatch: pytest.MonkeyPatch) -> None: + """T031: Explicit OSA_DATABASE__URL should override derived path.""" + # Set both OSA_DATA_DIR and explicit database URL + monkeypatch.setenv("OSA_DATA_DIR", "/data") + monkeypatch.setenv("OSA_DATABASE__URL", "postgresql+asyncpg://user:pass@db:5432/osa") + + config = Config() + + # Should use explicit URL, not derived + assert config.database.url == "postgresql+asyncpg://user:pass@db:5432/osa" + + def test_database_url_preserves_other_settings(self, monkeypatch: pytest.MonkeyPatch) -> None: + """Database URL derivation should preserve echo and auto_migrate settings.""" + monkeypatch.setenv("OSA_DATA_DIR", "/data") + monkeypatch.delenv("OSA_DATABASE__URL", raising=False) + monkeypatch.setenv("OSA_DATABASE__ECHO", "true") + monkeypatch.setenv("OSA_DATABASE__AUTO_MIGRATE", "false") + + config = Config() + + # URL should be derived + assert "sqlite+aiosqlite" in config.database.url + # But other settings should be preserved from env vars + assert config.database.echo is True + assert config.database.auto_migrate is False diff --git a/tests/unit/infrastructure/index/test_vector_config.py b/tests/unit/infrastructure/index/test_vector_config.py new file mode 100644 index 0000000..a6f9cb2 --- /dev/null +++ b/tests/unit/infrastructure/index/test_vector_config.py @@ -0,0 +1,54 @@ +"""Tests for vector index configuration with OSA_DATA_DIR support.""" + +from pathlib import Path + +import pytest + +from osa.cli.util.paths import OSAPaths +from osa.infrastructure.index.vector.config import VectorBackendConfig + + +class TestVectorBackendConfig: + """Tests for VectorBackendConfig persist_dir handling.""" + + def test_persist_dir_defaults_to_none(self) -> None: + """persist_dir should default to None (derive from OSAPaths).""" + config = VectorBackendConfig() + assert config.persist_dir is None + + def test_persist_dir_accepts_explicit_path(self) -> None: + """persist_dir should accept explicit path values.""" + config = VectorBackendConfig(persist_dir=Path("/custom/vectors")) + assert config.persist_dir == Path("/custom/vectors") + + +class TestVectorPersistDirDerivation: + """Tests for vector persist_dir derivation from OSAPaths.""" + + def test_persist_dir_derived_from_osa_data_dir(self) -> None: + """T034: Vector persist_dir should derive from OSA_DATA_DIR when set.""" + paths = OSAPaths(unified_data_dir=Path("/data")) + + # Verify paths derives vectors_dir correctly + assert paths.vectors_dir == Path("/data/data/vectors") + + def test_persist_dir_derived_from_xdg_defaults(self, monkeypatch: pytest.MonkeyPatch) -> None: + """T035: Vector persist_dir should derive from XDG defaults when OSA_DATA_DIR not set.""" + # Patch Path.home() to use a test path + test_home = Path("/test/home") + monkeypatch.setattr(Path, "home", lambda: test_home) + + paths = OSAPaths(unified_data_dir=None) + + # Should derive vectors path from XDG data directory + expected_path = test_home / ".local" / "share" / "osa" / "vectors" + assert paths.vectors_dir == expected_path + + def test_persist_dir_with_tmp_path(self, tmp_path: Path) -> None: + """Vector persist_dir should work with actual tmp directories.""" + paths = OSAPaths(unified_data_dir=tmp_path) + paths.ensure_directories() + + # Verify the vectors directory exists + assert paths.vectors_dir.exists() + assert paths.vectors_dir == tmp_path / "data" / "vectors" From 46443fd601a999908df62eb467fde0f20aac5362 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Mon, 5 Jan 2026 16:00:17 +0000 Subject: [PATCH 2/3] refactor: restructure CLI for zero-setup local development - Rename `osa server` to `osa local` (start/stop/logs/status) - Move `osa init` to `osa config init` with template selection - Make `osa local start` self-sufficient: auto-creates directories and works without config file (uses sensible defaults) - Config init writes to ./osa.yaml by default (cwd, not XDG) - Simplify OSAPaths to read OSA_DATA_DIR directly from environment New user journey: osa local start # Just works, no setup required osa config init geo # Optional: generate config template vim osa.yaml && osa local start # Customize and restart Config resolution order: 1. ./osa.yaml (current directory) 2. $OSA_DATA_DIR/config/config.yaml 3. ~/.config/osa/config.yaml (XDG fallback) --- osa/application/di.py | 4 +- osa/cli/commands/config.py | 187 ++++++++++++++++++ osa/cli/commands/init.py | 143 -------------- osa/cli/commands/{server.py => local.py} | 74 +++---- osa/cli/main.py | 6 +- osa/cli/util/paths.py | 17 +- osa/config.py | 31 +-- tests/unit/cli/util/test_paths.py | 105 ++++++---- tests/unit/config/test_paths_config.py | 44 +---- .../index/test_vector_config.py | 21 +- 10 files changed, 320 insertions(+), 312 deletions(-) create mode 100644 osa/cli/commands/config.py delete mode 100644 osa/cli/commands/init.py rename osa/cli/commands/{server.py => local.py} (72%) diff --git a/osa/application/di.py b/osa/application/di.py index 44b51be..cdd63b3 100644 --- a/osa/application/di.py +++ b/osa/application/di.py @@ -15,8 +15,8 @@ def create_container() -> AsyncContainer: config = Config() - # Create OSAPaths from config, supporting both unified and XDG modes - paths = OSAPaths(unified_data_dir=config.paths.data_dir) + # OSAPaths reads OSA_DATA_DIR from environment automatically + paths = OSAPaths() return make_async_container( PersistenceProvider(), diff --git a/osa/cli/commands/config.py b/osa/cli/commands/config.py new file mode 100644 index 0000000..48bd112 --- /dev/null +++ b/osa/cli/commands/config.py @@ -0,0 +1,187 @@ +"""Configuration management commands.""" + +import sys +from pathlib import Path +from typing import Literal + +import cyclopts + +from osa.cli.console import get_console +from osa.cli.util import OSAPaths + +app = cyclopts.App(name="config", help="Configuration management") + +# Available templates +Template = Literal["geo", "minimal"] +TEMPLATES: list[Template] = ["geo", "minimal"] + +# Default output file in current working directory +DEFAULT_OUTPUT = Path("osa.yaml") + + +def _get_template_content(template: Template, paths: OSAPaths) -> str: + """Get template content with paths substituted.""" + if template == "geo": + return GEO_TEMPLATE.format(vectors_dir=paths.vectors_dir) + else: + return MINIMAL_TEMPLATE.format(vectors_dir=paths.vectors_dir) + + +GEO_TEMPLATE = """\ +# OSA Configuration - GEO Template + +server: + name: "My OSA Node" + domain: "localhost" + +# Database (SQLite by default, stored in $OSA_DATA_DIR or ~/.local/share/osa/) +# database: +# auto_migrate: true + +# Logging +# logging: +# level: "INFO" + +# GEO Ingestor - pulls from NCBI Gene Expression Omnibus via Entrez API +ingestors: + - ingestor: geo-entrez + config: + record_type: gse # gse (~250k all) or gds (~5k curated) + email: your@email.com # Required by NCBI - please update this + # api_key: null # Optional: NCBI API key for higher rate limits (https://account.ncbi.nlm.nih.gov/settings/) + initial_run: + enabled: true + limit: 50 + # schedule: + # cron: "0 * * * *" # Hourly + # limit: 100 + +# Vector search index +indexes: + - name: vector + backend: vector + config: + persist_dir: {vectors_dir} + embedding: + model: all-MiniLM-L6-v2 + fields: [title, summary, organism, platform, entry_type] +""" + +MINIMAL_TEMPLATE = """\ +# OSA Configuration + +server: + name: "My OSA Node" + domain: "localhost" + +# Database (SQLite by default, stored in $OSA_DATA_DIR or ~/.local/share/osa/) +# database: +# auto_migrate: true + +# Logging +# logging: +# level: "INFO" + +# Add your ingestors here: +# ingestors: +# - ingestor: geo-entrez +# config: +# email: your@email.com + +# Add your indexes here: +# indexes: +# - name: my-index +# backend: vector +# config: +# persist_dir: {vectors_dir} +""" + + +@app.command +def init( + template: Template | None = None, + /, + output: Path | None = None, + stdout: bool = False, + force: bool = False, +) -> None: + """Generate a configuration file from a template. + + Creates an osa.yaml configuration file in the current directory (or specified path). + Edit this file to customize your OSA instance, then run 'osa local start'. + + Args: + template: Template to use (geo, minimal). + output: Output path. Defaults to ./osa.yaml. + stdout: Print to stdout instead of writing to file. + force: Overwrite existing file. + """ + console = get_console() + paths = OSAPaths() + + # If no template specified, show available options + if template is None: + console.print("[bold]Available templates:[/bold]\n") + console.print(" [cyan]geo[/cyan] NCBI GEO integration with vector search") + console.print(" [cyan]minimal[/cyan] Blank configuration to customize") + console.print() + console.print("Usage: [bold]osa config init