diff --git a/mboxer-current-config.yaml b/mboxer-current-config.yaml deleted file mode 100644 index 02e73ed..0000000 --- a/mboxer-current-config.yaml +++ /dev/null @@ -1,213 +0,0 @@ -# Agent/reference copy. Canonical template: config/mboxer.example.yaml -project: - name: mboxer - repo: uscient/mboxer - default_database: var/mboxer.sqlite - -# Account settings. -# Use: mboxer account add --email
-# Default behavior operates on one account at a time. -# Cross-account operations must be explicit via --accounts key1,key2. -accounts: - # NOTE: account resolution is currently built-in, not driven by these keys. - # One account -> auto-selected; multiple accounts -> --account is required. - # The keys below are reserved placeholders and are not read by the code yet. - default_account: null # reserved (not yet wired) - require_account: true # reserved (not yet wired) - -paths: - database: var/mboxer.sqlite - mbox_dir: data/mboxes - attachments_dir: data/attachments - exports_dir: exports - notebooklm_dir: exports/notebooklm - rag_dir: exports/rag - manifests_dir: exports/manifests - logs_dir: var/log - -ingest: - batch_commit_size: 500 - resume: true - extract_attachments: true - store_body_text: true - store_body_html: false - max_body_chars: 50000 - dedupe_by: - - message_id - - body_hash - -classification: - provider: rules # rules | ollama - level: thread # message | thread - prompt_version: classifier-v1 - confidence_review_threshold: 0.70 - allow_category_proposals: true - require_review_for_new_categories: true - ollama: - host: http://localhost:11434 - default_model: llama3.1:8b - timeout_seconds: 60 - temperature: 0 - num_ctx: 8192 - models: - classifier: llama3.1:8b - summarizer: llama3.1:8b - taxonomy_manager: llama3.1:8b - security_reviewer: llama3.1:8b - -# Category paths are slash-delimited and later become directories. -taxonomy: - locked_categories: - - medical - - medical/hospital-billing - - medical/pharmacy - - medical/insurance - - legal - - legal/law-firm-correspondence - - finance - - household - - household/utilities - - postal - - postal/usps-informed-delivery - - family - - family/recipient-family - - personal-correspondence - - government - - shopping - - technology - - noise - - noise/marketing - - noise/newsletters - - noise/spam - -rules: - - name: usps-informed-delivery - description: USPS daily digest / mail preview emails. - match: - from_contains: - - usps - subject_contains: - - informed delivery - - daily digest - assign: - category_path: postal/usps-informed-delivery - notebooklm_priority: include_metadata_only - sensitivity: medium - export_profile: metadata-only - - - name: common-law-firm - description: Replace example domain with known law firm domain(s). - match: - from_domain: - - examplelawfirm.com - assign: - category_path: legal/law-firm-correspondence - notebooklm_priority: include - sensitivity: high - export_profile: scrubbed - - - name: obvious-utility-bill - description: Catch obvious utility/payment statement emails before LLM classification. - match: - subject_contains: - - bill - - statement - - payment due - - autopay - - utility - assign_hint: - category_path: household/utilities - sensitivity: medium - export_profile: scrubbed - -security: - default_export_profile: scrubbed - scan_enabled: true - scrub_enabled: true - on_residual_findings: warn # allow | warn | block - scan_attachments: true - quarantine_unsafe_attachments: true - redact_email_addresses: true - redact_phone_numbers: true - redact_ssn_like_numbers: true - redact_credit_card_like_numbers: true - # reserved/planned detectors (not yet implemented): physical_addresses - -exports: - notebooklm: - # Default export limit profile. This is conservative for Google AI Ultra workflows. - profile: ultra_safe - - profiles: - standard: - max_sources: 50 - reserved_sources: 10 - target_sources: 40 - max_words_per_source: 450000 - target_words_per_source: 300000 - max_bytes_per_source: 190000000 - target_bytes_per_source: 100000000 - max_messages_per_source: 5000 - - plus: - max_sources: 100 - reserved_sources: 20 - target_sources: 80 - max_words_per_source: 450000 - target_words_per_source: 300000 - max_bytes_per_source: 190000000 - target_bytes_per_source: 100000000 - max_messages_per_source: 5000 - - pro: - max_sources: 300 - reserved_sources: 50 - target_sources: 250 - max_words_per_source: 450000 - target_words_per_source: 300000 - max_bytes_per_source: 190000000 - target_bytes_per_source: 100000000 - max_messages_per_source: 5000 - - ultra: - max_sources: 600 - reserved_sources: 75 - target_sources: 525 - max_words_per_source: 450000 - target_words_per_source: 300000 - max_bytes_per_source: 190000000 - target_bytes_per_source: 100000000 - max_messages_per_source: 5000 - - ultra_safe: - max_sources: 600 - reserved_sources: 100 - target_sources: 450 - max_words_per_source: 350000 - target_words_per_source: 225000 - max_bytes_per_source: 125000000 - target_bytes_per_source: 75000000 - max_messages_per_source: 5000 - - format: - extension: md - category_directories: true - include_manifest: true - include_source_header: true - include_message_metadata: true - include_attachment_references: true - include_security_notes: true - - split_strategy: - close_source_when_any_limit_reached: true - prefer_thread_integrity: true - split_by_category: true - split_by_year: true - split_oversized_threads: true - - jsonl: - enabled: true - output_file: exports/rag/messages.jsonl - include_clean_body: true - include_raw_metadata: true - include_classification: true diff --git a/prompts/claude-code-mboxer-current.md b/prompts/claude-code-mboxer-current.md deleted file mode 100644 index 5557509..0000000 --- a/prompts/claude-code-mboxer-current.md +++ /dev/null @@ -1,372 +0,0 @@ -# Claude Code Prompt: Build `uscient/mboxer` - -> **Note:** Historical bootstrap prompt from early scaffold work. The core pipeline is implemented; use `AGENTS.md` and `config/mboxer.example.yaml` for current behavior unless a task explicitly asks to extend missing pieces. - -You are working in the `uscient/mboxer` repository. - -Project identity: - -```text -GitHub repo: uscient/mboxer -Python package: mboxer -CLI command: mboxer -Default database: var/mboxer.sqlite -``` - -## Goal - -Turn this scaffold into a working local-first MBOX archive processor for knowledge management, RAG, and NotebookLM source-pack exports. - -The project must support: - -- ingesting one or more `.mbox` files -- storing email metadata and normalized body text in SQLite -- tracking attachments in a normalized `attachments` table -- tracking ingest runs and checkpoints so interrupted imports can resume -- deterministic rules before LLM classification -- optional local LLM classification through Ollama -- LLM-managed category proposals with review/approval -- category paths as filesystem directories -- future security scan/scrubbing stages -- NotebookLM Markdown export with config-driven file/source limits -- JSONL export for RAG - -## Important design constraints - -1. Raw email stays local by default. -2. SQLite is durable project state, not a disposable cache. -3. Ingest must be resumable. -4. Multiple MBOX files may be imported into the same SQLite database. -5. Attachments are first-class database records. -6. Category paths are slash-delimited filesystem paths. -7. NotebookLM export limits are config-driven, not hardcoded. -8. The default cloud-oriented export profile should be `scrubbed`, not `raw`. -9. Keep the code boring, modular, and testable. -10. Do not introduce a web app yet. - -## Current files to respect - -- `config/mboxer.example.yaml` -- `docs/architecture.md` -- `docs/naming-conventions.md` -- `docs/notebooklm-limits.md` -- `docs/security-roadmap.md` -- `src/mboxer/config.py` -- `src/mboxer/limits.py` -- `src/mboxer/naming.py` -- `src/mboxer/db/schema.sql` -- `src/mboxer/cli.py` - -Keep naming consistent with these files. - -## Implementation order - -### 1. Database initialization - -Implement: - -```bash -mboxer init-db --config config/mboxer.yaml -``` - -Behavior: - -- create parent directory for SQLite DB -- execute `src/mboxer/db/schema.sql` -- be idempotent -- print the DB path - -### 2. MBOX ingest - -Implement: - -```bash -mboxer ingest data/mboxes/archive.mbox \ - --config config/mboxer.yaml \ - --source-name "Main Archive" \ - --extract-attachments \ - --resume -``` - -Behavior: - -- create or find `mbox_sources` row using canonical file path -- create `ingest_runs` row -- parse messages with Python stdlib `mailbox` -- store metadata in `messages` -- store normalized body text when enabled -- compute body hash -- extract attachments if enabled -- store one row per attachment in `attachments` -- update `ingest_runs.last_mbox_key` and counters regularly -- commit in batches using `ingest.batch_commit_size` -- log message-level errors in `ingest_errors` -- mark ingest run `completed` or `failed` - -Minimum message fields: - -- `source_id` -- `mbox_key` -- `message_id` -- `thread_key` -- `subject` -- `sender` -- `recipients_json` -- `cc_json` -- `date_header` -- `date_utc` -- `body_text` -- `body_hash` -- `body_chars` -- `attachment_count` -- `raw_headers_json` - -Resume behavior: - -- if `--resume`, find latest incomplete run for that source -- use `last_mbox_key` as a conservative checkpoint -- iterate from beginning until checkpoint key is found, then continue -- always skip duplicate messages already present unless `--force` is provided - -### 3. Header/body normalization - -Implement safe helpers in `normalize.py`: - -- decode MIME headers with `email.header.decode_header` -- parse dates with `email.utils.parsedate_to_datetime` -- decode plain text body parts with content charset fallback -- optionally strip HTML to text only if needed; keep this minimal for v0 -- produce stable `thread_key` from `References`, `In-Reply-To`, `Message-ID`, or normalized subject fallback - -### 4. Attachment tracking - -Implement `attachments.py`: - -- sanitize attachment filenames -- store under `data/attachments///` -- avoid overwrite by adding sequence suffix when needed -- compute sha256 -- store original filename, safe filename, content type, size, sha256, path, status -- do not execute or inspect attachment contents beyond safe metadata for now - -### 5. Config-driven NotebookLM limits - -Use `config/mboxer.example.yaml` as source of truth. - -Implement in `limits.py`: - -- profile resolution -- CLI override precedence -- byte/MB conversion helpers -- effective source budget: `max_sources - reserved_sources` -- validation warnings/errors - -CLI flags for `mboxer export notebooklm`: - -```text ---profile ---max-sources ---reserved-sources ---target-sources ---target-words ---max-words ---target-mb ---max-mb ---allow-full-source-budget ---force -``` - -Validation: - -- fail if max bytes > 200 MB unless `--force` -- warn if max words > 500,000 -- warn if target sources > effective source budget -- never exceed effective source budget unless `--allow-full-source-budget` - -### 6. Rules and classification - -Implement deterministic rule matching from config first. - -Then implement optional local LLM classification through Ollama. - -Classifier output should be structured JSON: - -```json -{ - "category_path": "medical/hospital-billing", - "secondary_paths": ["finance/bills"], - "specific_topic": "Hospital billing correspondence", - "summary": "Short factual summary.", - "people": [], - "organizations": [], - "sensitivity": "high", - "notebooklm_priority": "include", - "export_profile": "scrubbed", - "confidence": 0.86, - "needs_review": false, - "proposed_category_path": null, - "proposal_reason": null -} -``` - -The LLM may propose categories, but do not silently create many new categories. Store proposals in `category_proposals` with `pending` status. - -### 7. Category review - -Implement: - -```bash -mboxer review-categories --config config/mboxer.yaml -mboxer approve-category --config config/mboxer.yaml -mboxer reject-category --config config/mboxer.yaml -``` - -Review should show: - -- category counts -- pending proposals -- suggested high-volume categories -- low-confidence classifications - -### 8. Security scan placeholder - -Implement a basic `security-scan` command. - -For v0, detect and store findings for: - -- phone-number-like values -- SSN-like values -- credit-card-like values -- credential/password hints -- sensitive category markers for medical/legal/financial content - -Store findings in `security_findings`. - -Do not overpromise perfect redaction. - -### 9. NotebookLM Markdown export - -Implement: - -```bash -mboxer export notebooklm \ - --config config/mboxer.yaml \ - --profile ultra_safe \ - --out exports/notebooklm -``` - -Behavior: - -- use category directories -- split by category and date band/year -- preserve thread integrity when possible -- obey profile limits -- close files at target limits when practical -- never exceed hard limits -- write source headers containing category, date range, profile, counts, and scrub status -- include message metadata -- include attachment references, not attachment contents -- write `manifest.csv` -- record export run in `exports` and `export_items` - -Markdown source format: - -```markdown -# Medical / Hospital Billing — 2024 — Part 001 - -Source Pack Metadata: -- Category: medical/hospital-billing -- Date Range: 2024-01-01 to 2024-12-31 -- Export Profile: scrubbed -- Message Count: 123 -- Attachment Count: 12 - ---- - -## Email: Subject here - -- Date: ... -- From: ... -- To: ... -- Message-ID: ... -- Attachments: ... - -Body text... -``` - -### 10. JSONL export - -Implement: - -```bash -mboxer export jsonl --config config/mboxer.yaml --out exports/rag/messages.jsonl -``` - -Each line should include: - -- message id -- source info -- metadata -- clean body -- classification -- attachment metadata -- security flags - -## Tests - -Add lightweight tests for: - -- slug generation -- category path normalization -- config loading -- NotebookLM profile resolution -- override precedence -- source-budget calculation -- DB initialization idempotency - -Do not require a huge sample MBOX for tests. - -## Style - -Use stdlib where practical. - -Avoid heavy dependencies unless clearly needed. - -Keep modules small: - -```text -src/mboxer/ - cli.py - config.py - limits.py - naming.py - db/schema.py - ingest.py - normalize.py - attachments.py - taxonomy.py - classify.py - security/scan.py - security/scrub.py - exporters/notebooklm.py - exporters/jsonl.py - exporters/manifest.py -``` - -## Acceptance criteria - -After implementation, these should work: - -```bash -pip install -e . -mboxer --help -mboxer init-db --config config/mboxer.example.yaml -mboxer export notebooklm --config config/mboxer.example.yaml --dry-run -pytest -``` - -A real ingest should work with: - -```bash -mboxer ingest data/mboxes/archive.mbox --config config/mboxer.yaml --resume --extract-attachments -``` diff --git a/schema_ingest_tracking.sql b/schema_ingest_tracking.sql deleted file mode 100644 index 508a79b..0000000 --- a/schema_ingest_tracking.sql +++ /dev/null @@ -1,98 +0,0 @@ --- Schema extension for resumable multi-MBOX ingest and normalized attachment tracking. --- Intended for use with the existing mbox_manager_v0.py SQLite schema. - -CREATE TABLE IF NOT EXISTS mbox_sources ( - id INTEGER PRIMARY KEY, - path TEXT UNIQUE NOT NULL, - filename TEXT, - path_sha256 TEXT UNIQUE NOT NULL, - size_bytes INTEGER DEFAULT 0, - mtime_ns INTEGER DEFAULT 0, - first_seen_at TEXT NOT NULL, - last_seen_at TEXT, - last_ingested_at TEXT, - last_status TEXT, - last_run_id INTEGER, - message_count INTEGER DEFAULT 0, - inserted_count INTEGER DEFAULT 0, - skipped_count INTEGER DEFAULT 0, - failed_count INTEGER DEFAULT 0 -); - -CREATE INDEX IF NOT EXISTS idx_mbox_sources_path_sha ON mbox_sources(path_sha256); -CREATE INDEX IF NOT EXISTS idx_mbox_sources_last_status ON mbox_sources(last_status); - -CREATE TABLE IF NOT EXISTS ingest_runs ( - id INTEGER PRIMARY KEY, - mbox_source_id INTEGER NOT NULL, - mbox_path TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'running', - started_at TEXT NOT NULL, - updated_at TEXT, - finished_at TEXT, - command_json TEXT DEFAULT '{}', - total_scanned INTEGER DEFAULT 0, - inserted_count INTEGER DEFAULT 0, - skipped_count INTEGER DEFAULT 0, - failed_count INTEGER DEFAULT 0, - last_mbox_key TEXT, - last_error TEXT, - process_id INTEGER, - host TEXT, - extract_attachments INTEGER DEFAULT 0, - attachments_dir TEXT, - FOREIGN KEY(mbox_source_id) REFERENCES mbox_sources(id) -); - -CREATE INDEX IF NOT EXISTS idx_ingest_runs_source ON ingest_runs(mbox_source_id); -CREATE INDEX IF NOT EXISTS idx_ingest_runs_status ON ingest_runs(status); -CREATE INDEX IF NOT EXISTS idx_ingest_runs_started ON ingest_runs(started_at); - -CREATE TABLE IF NOT EXISTS ingest_errors ( - id INTEGER PRIMARY KEY, - ingest_run_id INTEGER, - mbox_source_id INTEGER, - mbox_key TEXT, - error_type TEXT, - error_message TEXT, - traceback TEXT, - created_at TEXT NOT NULL, - FOREIGN KEY(ingest_run_id) REFERENCES ingest_runs(id), - FOREIGN KEY(mbox_source_id) REFERENCES mbox_sources(id) -); - -CREATE INDEX IF NOT EXISTS idx_ingest_errors_run ON ingest_errors(ingest_run_id); -CREATE INDEX IF NOT EXISTS idx_ingest_errors_source ON ingest_errors(mbox_source_id); -CREATE INDEX IF NOT EXISTS idx_ingest_errors_key ON ingest_errors(mbox_key); - -CREATE TABLE IF NOT EXISTS attachments ( - id INTEGER PRIMARY KEY, - message_db_id INTEGER NOT NULL, - mbox_source_id INTEGER, - ingest_run_id INTEGER, - filename_original TEXT, - filename_safe TEXT, - content_type TEXT, - size_bytes INTEGER DEFAULT 0, - sha256 TEXT, - storage_path TEXT DEFAULT '', - extraction_status TEXT DEFAULT 'metadata_only', - created_at TEXT NOT NULL, - FOREIGN KEY(message_db_id) REFERENCES messages(id), - FOREIGN KEY(mbox_source_id) REFERENCES mbox_sources(id), - FOREIGN KEY(ingest_run_id) REFERENCES ingest_runs(id), - UNIQUE(message_db_id, sha256, filename_safe) -); - -CREATE INDEX IF NOT EXISTS idx_attachments_message ON attachments(message_db_id); -CREATE INDEX IF NOT EXISTS idx_attachments_source ON attachments(mbox_source_id); -CREATE INDEX IF NOT EXISTS idx_attachments_run ON attachments(ingest_run_id); -CREATE INDEX IF NOT EXISTS idx_attachments_sha ON attachments(sha256); -CREATE INDEX IF NOT EXISTS idx_attachments_status ON attachments(extraction_status); -CREATE INDEX IF NOT EXISTS idx_attachments_content_type ON attachments(content_type); - --- Existing messages table should be migrated by Python helper functions: --- ALTER TABLE messages ADD COLUMN mbox_source_id INTEGER; --- ALTER TABLE messages ADD COLUMN ingest_run_id INTEGER; --- CREATE INDEX IF NOT EXISTS idx_messages_source ON messages(mbox_source_id); --- CREATE INDEX IF NOT EXISTS idx_messages_source_key ON messages(mbox_source_id, mbox_key);