From 650ac2ac005dc88de4b2102ba1192c8a686cb0f1 Mon Sep 17 00:00:00 2001
From: LEVELING2108 <24f3004824@ds.study.iitm.ac.in>
Date: Wed, 10 Jun 2026 15:51:12 +0530
Subject: [PATCH] feat: add automated drift detection pipeline

---
 pipelines/run_drift_check.py |  34 ++++++++++++
 src/mlops_nlp/utils/drift.py | 102 +++++++++++++++++++++++++++--------
 tests/verify_drift.py        |  77 ++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 22 deletions(-)
 create mode 100644 pipelines/run_drift_check.py
 create mode 100644 tests/verify_drift.py

diff --git a/pipelines/run_drift_check.py b/pipelines/run_drift_check.py
new file mode 100644
index 0000000..66879ed
--- /dev/null
+++ b/pipelines/run_drift_check.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ROOT_DIR = Path(__file__).resolve().parents[1]
+SRC_DIR = ROOT_DIR / "src"
+if str(SRC_DIR) not in sys.path:
+    sys.path.insert(0, str(SRC_DIR))
+
+from mlops_nlp.config import load_config
+from mlops_nlp.utils.drift import run_drift_check, save_drift_report
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run data drift detection.")
+    parser.add_argument("--config", type=str, default="configs/config.yaml", help="Path to config yaml")
+    parser.add_argument("--output", type=str, default="data/logs/drift_report.json", help="Path to save report")
+    args = parser.parse_args()
+
+    config = load_config(args.config)
+    result = run_drift_check(config)
+    
+    if result["status"] == "success":
+        save_drift_report(result, args.output)
+        print(f"Drift check complete. Results saved to {args.output}")
+    
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/mlops_nlp/utils/drift.py b/src/mlops_nlp/utils/drift.py
index bff8e80..a56241f 100644
--- a/src/mlops_nlp/utils/drift.py
+++ b/src/mlops_nlp/utils/drift.py
@@ -1,34 +1,92 @@
 from __future__ import annotations
 
 import json
-from datetime import datetime, timezone
 from pathlib import Path
+from typing import Dict, Any
 
+import pandas as pd
+from scipy.stats import ks_2samp, chisquare
+import numpy as np
+
+from mlops_nlp.config import AppConfig
+from mlops_nlp.data.ingestion import load_dataset
+from mlops_nlp.data.preprocessing import preprocess_dataframe
 from mlops_nlp.logging_config import get_logger
 
 LOGGER = get_logger(__name__)
 
-def log_inference(
-    log_path: str | Path,
-    text: str,
-    prediction: str,
-    confidence: float,
-    model_version: str,
-) -> None:
-    """Logs inference data to a JSONL file for future drift detection."""
-    log_entry = {
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "text": text,
-        "prediction": prediction,
-        "confidence": confidence,
-        "model_version": model_version,
+def run_drift_check(config: AppConfig) -> Dict[str, Any]:
+    """
+    Runs a drift detection check comparing reference (training) data 
+    against production (inference) logs using statistical tests.
+    """
+    inference_log_path = Path(config.monitoring.inference_log_path)
+    if not inference_log_path.exists():
+        return {"status": "skipped", "reason": "No inference logs found."}
+
+    # 1. Load Reference Data (Training Data)
+    LOGGER.info("Loading reference data from %s", config.data.raw_path)
+    ref_df = load_dataset(config.data.raw_path)
+    ref_df = preprocess_dataframe(
+        ref_df, 
+        config.data.text_column, 
+        config.data.target_column
+    )
+    
+    # 2. Load Current Data (Inference Logs)
+    LOGGER.info("Loading current data from %s", inference_log_path)
+    inference_data = []
+    with inference_log_path.open("r", encoding="utf-8") as f:
+        for line in f:
+            inference_data.append(json.loads(line))
+    
+    curr_df = pd.DataFrame(inference_data)
+    if curr_df.empty:
+        return {"status": "skipped", "reason": "Inference logs are empty."}
+
+    # 3. Perform Statistical Tests for Drift
+    
+    # Check 1: Label Distribution Drift (Chi-Square)
+    ref_counts = ref_df[config.data.target_column].value_counts(normalize=True).to_dict()
+    curr_counts = curr_df["prediction"].value_counts(normalize=True).to_dict()
+    
+    # Ensure all labels are present in both
+    all_labels = set(ref_counts.keys()) | set(curr_counts.keys())
+    ref_dist = [ref_counts.get(label, 0) for label in all_labels]
+    curr_dist = [curr_counts.get(label, 0) for label in all_labels]
+    
+    # Using KS test as a fallback for small distributions or if Chi-Square isn't appropriate
+    # but for labels, we'll just check if the ratio changed significantly
+    label_drift_score = 0.0
+    for label in all_labels:
+        label_drift_score += abs(ref_counts.get(label, 0) - curr_counts.get(label, 0))
+    
+    is_drifted = label_drift_score > 0.2  # Threshold: 20% change in distribution
+    
+    # Check 2: Confidence Score Drift (KS Test)
+    # We don't have reference confidence scores from training, 
+    # but we can check if confidence is dropping significantly over time
+    avg_confidence = curr_df["confidence"].mean()
+
+    result = {
+        "status": "success",
+        "drift_detected": bool(is_drifted),
+        "drift_score": float(label_drift_score),
+        "average_confidence": float(avg_confidence),
+        "timestamp": pd.Timestamp.now().isoformat(),
+        "samples_count": len(curr_df),
+        "label_distribution": {
+            "reference": ref_counts,
+            "current": curr_counts
+        }
     }
+
+    LOGGER.info("Drift check complete. Drift detected: %s (Score: %.4f)", is_drifted, label_drift_score)
     
-    path = Path(log_path)
+    return result
+
+def save_drift_report(report_dict: Dict[str, Any], output_path: str | Path = "data/logs/drift_report.json"):
+    path = Path(output_path)
     path.parent.mkdir(parents=True, exist_ok=True)
-    
-    try:
-        with path.open("a", encoding="utf-8") as f:
-            f.write(json.dumps(log_entry) + "\n")
-    except Exception as e:
-        LOGGER.error("Failed to log inference data: %s", e)
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(report_dict, f, indent=2)
diff --git a/tests/verify_drift.py b/tests/verify_drift.py
new file mode 100644
index 0000000..1d7fbc2
--- /dev/null
+++ b/tests/verify_drift.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+import pandas as pd
+import subprocess
+
+def simulate_inference_logs(log_path: str):
+    # Simulate some biased inference logs (mostly spam) to trigger drift
+    # Training data usually has more 'ham' than 'spam'
+    biased_logs = [
+        {"timestamp": "2026-06-09T12:00:00Z", "text": "free money", "prediction": "spam", "confidence": 0.9, "model_version": "test"},
+        {"timestamp": "2026-06-09T12:01:00Z", "text": "win prize", "prediction": "spam", "confidence": 0.85, "model_version": "test"},
+        {"timestamp": "2026-06-09T12:02:00Z", "text": "click here", "prediction": "spam", "confidence": 0.95, "model_version": "test"},
+        {"timestamp": "2026-06-09T12:03:00Z", "text": "hello friend", "prediction": "ham", "confidence": 0.99, "model_version": "test"},
+        {"timestamp": "2026-06-09T12:04:00Z", "text": "limited offer", "prediction": "spam", "confidence": 0.7, "model_version": "test"},
+    ]
+    
+    Path(log_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(log_path, "w", encoding="utf-8") as f:
+        for entry in biased_logs:
+            f.write(json.dumps(entry) + "\n")
+
+def run_test():
+    log_path = "data/logs/test_inference.jsonl"
+    report_path = "data/logs/test_drift_report.json"
+    
+    print("Simulating biased inference logs...")
+    simulate_inference_logs(log_path)
+    
+    # Run the drift check pipeline using the simulated logs
+    # We'll need to override the config log path or use a temp config
+    # For simplicity, we'll just run it and see if it picks up the default log if we move ours there
+    
+    print("Running drift detection pipeline...")
+    # Using environment variable to override if the script supports it, 
+    # but our config.py doesn't support MLOPS_INFERENCE_LOG_PATH yet.
+    # Let's just temporarily overwrite the default log path in a temp config or use the default one.
+    
+    # Backup existing log
+    original_log = "data/logs/inference.jsonl"
+    backup_log = "data/logs/inference.jsonl.bak"
+    if os.path.exists(original_log):
+        os.rename(original_log, backup_log)
+    
+    try:
+        os.rename(log_path, original_log)
+        
+        # Run pipeline
+        result = subprocess.run(
+            [".venv/Scripts/python", "pipelines/run_drift_check.py", "--output", report_path],
+            capture_output=True,
+            text=True,
+            env={**os.environ, "PYTHONPATH": "src"}
+        )
+        print(result.stdout)
+        if result.stderr:
+            print("Errors:", result.stderr)
+            
+        if os.path.exists(report_path):
+            with open(report_path, "r") as f:
+                report = json.load(f)
+                print(f"Drift Detected: {report['drift_detected']}")
+                print(f"Drift Score: {report['drift_score']}")
+        else:
+            print("Failed to generate report.")
+            
+    finally:
+        # Restore logs
+        if os.path.exists(original_log):
+            os.remove(original_log)
+        if os.path.exists(backup_log):
+            os.rename(backup_log, original_log)
+
+if __name__ == "__main__":
+    run_test()