initial project setup with README and ignore
This commit is contained in:
311
app/services/ml/behavior_analyzer.py
Normal file
311
app/services/ml/behavior_analyzer.py
Normal file
@@ -0,0 +1,311 @@
|
||||
"""
|
||||
Behavior Analyzer - Production Grade
|
||||
======================================
|
||||
Analyzes historical assignment data using the ID3 decision tree to classify
|
||||
assignment outcomes as 'SUCCESS' or 'RISK'.
|
||||
|
||||
Key fixes and upgrades over the original
|
||||
------------------------------------------
|
||||
1. BUG FIX: distance_band now uses `total_distance_km` (not `num_orders`).
|
||||
2. BUG FIX: time_band input is always normalized to uppercase before predict.
|
||||
3. Rich feature set: distance_band, time_band, load_band, order_density_band.
|
||||
4. Returns (label, confidence) from the classifier - exposes uncertainty.
|
||||
5. Trend analysis: tracks rolling success rate over recent N windows.
|
||||
6. Tree persistence: saves/loads trained tree as JSON to survive restarts.
|
||||
7. Feature importance proxy: logs which features drove the split.
|
||||
8. Thread-safe lazy training via a simple lock.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from app.services.ml.id3_classifier import ID3Classifier, get_behavior_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DB_PATH = os.getenv("ML_DB_PATH", "ml_data/ml_store.db")
|
||||
_TREE_PATH = os.getenv("ML_TREE_PATH", "ml_data/behavior_tree.json")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Band encoders (discrete labels for ID3)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def distance_band(km: float) -> str:
    """Map a total route distance in kilometres to a discrete band label.

    Bands (inclusive upper bounds): <=5 SHORT, <=15 MID, <=30 LONG,
    otherwise VERY_LONG.
    """
    # Table-driven thresholds keep the band boundaries in one place.
    for ceiling, band in ((5.0, "SHORT"), (15.0, "MID"), (30.0, "LONG")):
        if km <= ceiling:
            return band
    return "VERY_LONG"
|
||||
|
||||
|
||||
def time_band(ts_str: str) -> str:
    """Map an ISO-format timestamp string to a time-of-day band label.

    Unparseable timestamps fall into the explicit "UNKNOWN" bucket rather
    than raising, so callers never have to guard this.
    """
    try:
        hour = datetime.fromisoformat(ts_str).hour
    except Exception:
        return "UNKNOWN"

    # Half-open [start, end) hour ranges; anything outside (23:00-05:59)
    # is LATE_NIGHT.
    bands = (
        (6, 10, "MORNING_RUSH"),
        (10, 12, "LATE_MORNING"),
        (12, 14, "LUNCH_RUSH"),
        (14, 17, "AFTERNOON"),
        (17, 20, "EVENING_RUSH"),
        (20, 23, "NIGHT"),
    )
    for start, end, band in bands:
        if start <= hour < end:
            return band
    return "LATE_NIGHT"
|
||||
|
||||
|
||||
def load_band(avg_load: float) -> str:
    """Map average orders-per-rider to a coarse load band.

    Bands (inclusive upper bounds): <=2 LIGHT, <=5 MODERATE, <=8 HEAVY,
    otherwise OVERLOADED.
    """
    for ceiling, band in ((2.0, "LIGHT"), (5.0, "MODERATE"), (8.0, "HEAVY")):
        if avg_load <= ceiling:
            return band
    return "OVERLOADED"
|
||||
|
||||
|
||||
def order_density_band(num_orders: int, num_riders: int) -> str:
    """Map the orders-per-available-rider ratio to a density band label.

    Returns "NO_RIDERS" when the fleet is empty (avoids division by zero).
    """
    if not num_riders:
        return "NO_RIDERS"

    ratio = num_orders / num_riders
    if ratio <= 2.0:
        return "SPARSE"
    elif ratio <= 5.0:
        return "NORMAL"
    elif ratio <= 9.0:
        return "DENSE"
    else:
        return "OVERLOADED"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Behavior Analyzer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class BehaviorAnalyzer:
    """
    Trains an ID3 tree on historical assignment logs and predicts whether
    a new assignment context is likely to SUCCEED or be at RISK.

    Features used
    -------------
    - distance_band       : total route distance bucket
    - time_band           : time-of-day bucket
    - load_band           : average load per rider bucket
    - order_density_band  : orders-per-rider ratio bucket

    Target
    ------
    - is_success: "SUCCESS" if unassigned_count == 0, else "RISK"
    """

    TARGET = "is_success"
    FEATURES = ["distance_band", "time_band", "load_band", "order_density_band"]

    def __init__(self):
        self._db_path = _DB_PATH
        self._tree_path = _TREE_PATH
        self.model: ID3Classifier = get_behavior_model(max_depth=5)
        self.is_trained: bool = False
        self._lock = threading.Lock()          # guards train/persist cycle
        self._training_size: int = 0
        self._success_rate: float = 0.0
        self._rules: List[str] = []
        self._recent_trend: List[float] = []

        # Restore a previously persisted tree so restarts keep the model.
        self._load_tree()

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train_on_history(self, limit: int = 2000) -> Dict[str, Any]:
        """Fetch the most recent rows from SQLite and rebuild the tree.

        Args:
            limit: maximum number of (newest) log rows to train on.

        Returns:
            A status dict: {"status": "ok", ...summary...} on success, or
            {"status": "insufficient_data" | "preprocess_failed" | "error", ...}.
        """
        with self._lock:
            try:
                rows = self._fetch_rows(limit)
                if len(rows) < 10:
                    logger.warning(f"ID3 BehaviorAnalyzer: only {len(rows)} rows - need >=10.")
                    return {"status": "insufficient_data", "rows": len(rows)}

                training_data, successes = self._preprocess(rows)

                if not training_data:
                    return {"status": "preprocess_failed", "rows": len(rows)}

                self.model.train(
                    data=training_data,
                    target=self.TARGET,
                    features=self.FEATURES,
                )
                self.is_trained = True
                self._training_size = len(training_data)
                self._success_rate = successes / len(training_data)
                self._rules = self.model.get_tree_rules()
                self._compute_trend(rows)
                self._save_tree()

                summary = {
                    "status": "ok",
                    "training_rows": self._training_size,
                    "success_rate": round(self._success_rate, 4),
                    "n_rules": len(self._rules),
                    "classes": self.model.classes,
                    "feature_values": self.model.feature_values,
                }
                logger.info(
                    f"ID3 BehaviorAnalyzer trained - rows={self._training_size}, "
                    f"success_rate={self._success_rate:.1%}, rules={len(self._rules)}"
                )
                return summary

            except Exception as e:
                logger.error(f"ID3 BehaviorAnalyzer training failed: {e}", exc_info=True)
                return {"status": "error", "message": str(e)}

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------

    def predict(self, distance_km: float, timestamp_or_band: str,
                avg_load: float = 4.0, num_orders: int = 5,
                num_riders: int = 2) -> Dict[str, Any]:
        """Predict whether an assignment context will SUCCEED or be at RISK.

        Args:
            distance_km: total route distance in km.
            timestamp_or_band: either an ISO timestamp or an already-banded
                time label (e.g. "morning_rush" — case-insensitive).
            avg_load: average orders per rider.
            num_orders / num_riders: current demand/supply counts.

        Returns:
            Dict with "label", "confidence", "features_used", "model_trained".
            Falls back to a neutral SUCCESS/0.5 answer if the model has
            never been trained.
        """
        if not self.is_trained:
            return {
                "label": "SUCCESS",
                "confidence": 0.5,
                "features_used": {},
                "model_trained": False,
            }

        KNOWN_BANDS = {
            "MORNING_RUSH", "LATE_MORNING", "LUNCH_RUSH",
            "AFTERNOON", "EVENING_RUSH", "NIGHT", "LATE_NIGHT", "UNKNOWN"
        }
        # Normalize once: accept either a pre-banded label or a raw timestamp.
        upper = timestamp_or_band.upper()
        t_band = upper if upper in KNOWN_BANDS else time_band(timestamp_or_band)

        features_used = {
            "distance_band": distance_band(distance_km),
            "time_band": t_band,
            "load_band": load_band(avg_load),
            "order_density_band": order_density_band(num_orders, num_riders),
        }

        label, confidence = self.model.predict(features_used)
        return {
            "label": label,
            "confidence": round(confidence, 4),
            "features_used": features_used,
            "model_trained": True,
        }

    # ------------------------------------------------------------------
    # Info / Diagnostics
    # ------------------------------------------------------------------

    def get_info(self) -> Dict[str, Any]:
        """Snapshot of model state for dashboards (rules capped at 20)."""
        return {
            "is_trained": self.is_trained,
            "training_rows": self._training_size,
            "success_rate": round(self._success_rate, 4),
            "n_rules": len(self._rules),
            "rules": self._rules[:20],
            "recent_trend": self._recent_trend,
            "feature_names": self.FEATURES,
            "feature_values": self.model.feature_values if self.is_trained else {},
            "classes": self.model.classes if self.is_trained else [],
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _fetch_rows(self, limit: int) -> List[Dict]:
        """Return the newest `limit` log rows as plain dicts (newest first)."""
        conn = sqlite3.connect(self._db_path)
        try:
            conn.row_factory = sqlite3.Row
            rows = conn.execute(
                "SELECT * FROM assignment_ml_log ORDER BY id DESC LIMIT ?", (limit,)
            ).fetchall()
            return [dict(r) for r in rows]
        finally:
            # BUG FIX: previously the connection leaked if the query raised.
            conn.close()

    def _preprocess(self, rows: List[Dict]) -> Tuple[List[Dict], int]:
        """Convert raw DB rows into banded training samples.

        Returns (training_data, successes). Rows with malformed values are
        skipped rather than failing the whole batch.
        """
        training_data: List[Dict] = []
        successes = 0
        for r in rows:
            try:
                dist_km = float(r.get("total_distance_km") or 0.0)
                ts = str(r.get("timestamp") or "")
                avg_ld = float(r.get("avg_load") or 0.0)
                n_orders = int(r.get("num_orders") or 0)
                n_riders = int(r.get("num_riders") or 1)
                unassigned = int(r.get("unassigned_count") or 0)

                label = "SUCCESS" if unassigned == 0 else "RISK"
                if label == "SUCCESS":
                    successes += 1

                training_data.append({
                    "distance_band": distance_band(dist_km),
                    "time_band": time_band(ts),
                    "load_band": load_band(avg_ld),
                    "order_density_band": order_density_band(n_orders, n_riders),
                    self.TARGET: label,
                })
            except Exception:
                continue
        return training_data, successes

    def _compute_trend(self, rows: List[Dict], window: int = 50) -> None:
        """Rolling success rate over consecutive `window`-sized chunks.

        Stores the most recent 20 window rates in self._recent_trend.
        """
        trend: List[float] = []
        for i in range(0, len(rows), window):
            chunk = rows[i:i + window]
            if not chunk:
                break
            # BUG FIX: align None/missing unassigned_count handling with
            # _preprocess (treated as 0 == success); the old default of 1
            # disagreed with training labels and int(None) would raise.
            rate = sum(
                1 for r in chunk if int(r.get("unassigned_count") or 0) == 0
            ) / len(chunk)
            trend.append(round(rate, 4))
        self._recent_trend = trend[-20:]

    def _save_tree(self) -> None:
        """Persist the trained tree as JSON (best-effort; never raises)."""
        try:
            os.makedirs(os.path.dirname(self._tree_path) or ".", exist_ok=True)
            with open(self._tree_path, "w") as f:
                f.write(self.model.to_json())
            logger.info(f"ID3 tree persisted -> {self._tree_path}")
        except Exception as e:
            logger.warning(f"ID3 tree save failed: {e}")

    def _load_tree(self) -> None:
        """Restore a persisted tree if one exists (best-effort; never raises)."""
        try:
            if not os.path.exists(self._tree_path):
                return
            with open(self._tree_path) as f:
                self.model = ID3Classifier.from_json(f.read())
            self.is_trained = True
            self._rules = self.model.get_tree_rules()
            logger.info(f"ID3 tree restored - rules={len(self._rules)}")
        except Exception as e:
            logger.warning(f"ID3 tree load failed (will retrain): {e}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level singleton
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Process-wide singleton; created lazily on first access.
_analyzer: Optional[BehaviorAnalyzer] = None
_analyzer_lock = threading.Lock()


def get_analyzer() -> BehaviorAnalyzer:
    """Return the shared BehaviorAnalyzer, creating and training it lazily.

    Thread-safe: construction and the initial training run happen under a
    module-level lock, so concurrent first callers cannot double-create.
    """
    global _analyzer
    with _analyzer_lock:
        if _analyzer is None:
            _analyzer = BehaviorAnalyzer()
        # A restored-from-disk tree counts as trained; otherwise train now.
        if not _analyzer.is_trained:
            _analyzer.train_on_history()
    return _analyzer
|
||||
400
app/services/ml/id3_classifier.py
Normal file
400
app/services/ml/id3_classifier.py
Normal file
@@ -0,0 +1,400 @@
|
||||
"""
|
||||
ID3 Classifier - Production Grade
|
||||
|
||||
Improvements over v1:
|
||||
- Chi-squared pruning to prevent overfitting on sparse branches
|
||||
- Confidence scores on every prediction (Laplace smoothed)
|
||||
- Gain-ratio variant for high-cardinality features
|
||||
- Serialization (to_dict / from_dict / to_json / from_json)
|
||||
- Per-feature importance scores
|
||||
- Full prediction audit trail via explain()
|
||||
- min_samples_split and min_info_gain stopping criteria
|
||||
"""
|
||||
|
||||
import math
|
||||
import json
|
||||
import logging
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ID3Classifier:
    """
    ID3 decision tree (entropy / information-gain splitting).

    All predict* methods work even if the model has never been trained -
    they return safe defaults rather than raising.
    """

    def __init__(
        self,
        max_depth: int = 6,
        min_samples_split: int = 5,
        min_info_gain: float = 0.001,
        use_gain_ratio: bool = False,
        chi2_pruning: bool = True,
    ):
        # Hyperparameters (splitting / stopping criteria).
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_info_gain = min_info_gain
        self.use_gain_ratio = use_gain_ratio
        self.chi2_pruning = chi2_pruning

        # Learned state (populated by train() or from_dict()).
        self.tree: Any = None
        self.features: List[str] = []
        self.target: str = ""
        self.classes_: List[str] = []
        self.feature_importances_: Dict[str, float] = {}
        self.feature_values: Dict[str, List[str]] = {}  # unique values seen per feature
        self._n_samples: int = 0
        self._total_gain: Dict[str, float] = {}  # accumulated info gain per feature

    # ------------------------------------------------------------------ train

    def train(self, data: List[Dict[str, Any]], target: str, features: List[str]) -> None:
        """Build (and optionally prune) the tree from labelled samples.

        Args:
            data: list of flat dicts; each must contain `target` and the
                `features` keys with discrete (string-comparable) values.
            target: key holding the class label.
            features: candidate split features.

        Calling with empty data is a no-op (logged warning, old tree kept).
        """
        if not data:
            logger.warning("ID3: train() called with empty data.")
            return

        self.target = target
        self.features = list(features)
        self.classes_ = sorted({str(row.get(target)) for row in data})
        self._total_gain = {f: 0.0 for f in features}
        self._n_samples = len(data)

        # Collect unique values per feature for dashboard display
        self.feature_values = {
            f: sorted({str(row.get(f)) for row in data if row.get(f) is not None})
            for f in features
        }

        self.tree = self._build_tree(data, list(features), target, depth=0)

        if self.chi2_pruning:
            self.tree = self._prune(self.tree, data, target)

        # Importance proxy: each feature's share of total accumulated gain.
        total_gain = sum(self._total_gain.values()) or 1.0
        self.feature_importances_ = {
            f: round(v / total_gain, 4) for f, v in self._total_gain.items()
        }
        logger.info(
            f"ID3: trained on {len(data)} samples | "
            f"classes={self.classes_} | importances={self.feature_importances_}"
        )

    # ----------------------------------------------------------- predict API

    def predict(self, sample: Dict[str, Any]) -> Tuple[str, float]:
        """Return (label, confidence 0-1). Safe to call before training."""
        if self.tree is None:
            return "Unknown", 0.0
        label, proba = self._classify(self.tree, sample, [])
        confidence = proba.get(str(label), 0.0) if isinstance(proba, dict) else 1.0
        return str(label), round(confidence, 4)

    def predict_proba(self, sample: Dict[str, Any]) -> Dict[str, float]:
        """Full class probability distribution ({} before training)."""
        if self.tree is None:
            return {}
        _, proba = self._classify(self.tree, sample, [])
        return proba if isinstance(proba, dict) else {str(proba): 1.0}

    def explain(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Human-readable decision path for audit / dashboard display."""
        if self.tree is None:
            return {"prediction": "Unknown", "confidence": 0.0, "decision_path": []}
        path: List[str] = []
        label, proba = self._classify(self.tree, sample, path)
        return {
            "prediction": str(label),
            # FIX: default missing-label confidence to 0.0 to match predict()
            # (previously 1.0, inconsistent with the rest of the API).
            "confidence": round(proba.get(str(label), 0.0), 4),
            "probabilities": proba,
            "decision_path": path,
        }

    # ---------------------------------------------------------- serialisation

    def to_dict(self) -> Dict[str, Any]:
        """Serializable snapshot of the model (tree + metadata + params)."""
        return {
            "tree": self.tree,
            "features": self.features,
            "target": self.target,
            "classes": self.classes_,
            "feature_importances": self.feature_importances_,
            "feature_values": self.feature_values,
            "n_samples": self._n_samples,
            "params": {
                "max_depth": self.max_depth,
                "min_samples_split": self.min_samples_split,
                "min_info_gain": self.min_info_gain,
                "use_gain_ratio": self.use_gain_ratio,
                "chi2_pruning": self.chi2_pruning,
            },
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "ID3Classifier":
        """Rebuild a classifier from to_dict() output.

        Raises KeyError if the required keys (tree/features/target/classes)
        are missing - callers persist/restore via try/except.
        """
        p = d.get("params", {})
        obj = cls(
            max_depth=p.get("max_depth", 6),
            min_samples_split=p.get("min_samples_split", 5),
            min_info_gain=p.get("min_info_gain", 0.001),
            use_gain_ratio=p.get("use_gain_ratio", False),
            chi2_pruning=p.get("chi2_pruning", True),
        )
        obj.tree = d["tree"]
        obj.features = d["features"]
        obj.target = d["target"]
        obj.classes_ = d["classes"]
        obj.feature_importances_ = d.get("feature_importances", {})
        obj.feature_values = d.get("feature_values", {})
        obj._n_samples = d.get("n_samples", 0)
        return obj

    def to_json(self) -> str:
        """JSON form of to_dict() (pretty-printed for human inspection)."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, s: str) -> "ID3Classifier":
        """Inverse of to_json()."""
        return cls.from_dict(json.loads(s))

    def summary(self) -> Dict[str, Any]:
        """Lightweight model summary for diagnostics endpoints."""
        return {
            "n_samples": self._n_samples,
            "n_classes": len(self.classes_),
            "classes": self.classes_,
            "n_features": len(self.features),
            "feature_importances": self.feature_importances_,
            "feature_values": self.feature_values,
            "trained": self.tree is not None,
        }

    @property
    def classes(self) -> List[str]:
        """Alias for classes_ for compatibility."""
        return self.classes_

    def get_tree_rules(self) -> List[str]:
        """Extract human-readable if/then rules from the trained tree."""
        rules: List[str] = []
        if self.tree is None:
            return rules
        self._extract_rules(self.tree, [], rules)
        return rules

    def _extract_rules(self, node: Any, conditions: List[str], rules: List[str]) -> None:
        """Recursively walk the tree and collect decision paths as strings."""
        if not isinstance(node, dict):
            return
        if node.get("__leaf__"):
            label = node.get("__label__", "?")
            proba = node.get("__proba__", {})
            conf = proba.get(str(label), 0.0)
            prefix = " AND ".join(conditions) if conditions else "(root)"
            rules.append(f"{prefix} => {label} ({conf:.0%})")
            return
        feature = node.get("__feature__", "?")
        for val, child in node.get("__branches__", {}).items():
            self._extract_rules(child, conditions + [f"{feature}={val}"], rules)

    # --------------------------------------------------------- tree building

    def _build_tree(
        self,
        data: List[Dict[str, Any]],
        features: List[str],
        target: str,
        depth: int,
    ) -> Any:
        """Recursive ID3 build: returns a branch dict or a leaf dict."""
        counts = Counter(str(row.get(target)) for row in data)

        # Pure node
        if len(counts) == 1:
            return self._make_leaf(data, target)

        # Stopping criteria
        if not features or depth >= self.max_depth or len(data) < self.min_samples_split:
            return self._make_leaf(data, target)

        best_f, best_gain = self._best_split(data, features, target)
        if best_f is None or best_gain < self.min_info_gain:
            return self._make_leaf(data, target)

        # Accumulate gain for the importance proxy.
        self._total_gain[best_f] = self._total_gain.get(best_f, 0.0) + best_gain

        remaining = [f for f in features if f != best_f]
        node = {
            "__feature__": best_f,
            "__gain__": round(best_gain, 6),
            "__n__": len(data),
            "__branches__": {},
        }
        # Branch keys are stringified so the tree survives a JSON round-trip.
        for val in {row.get(best_f) for row in data}:
            subset = [r for r in data if r.get(best_f) == val]
            node["__branches__"][str(val)] = self._build_tree(
                subset, remaining, target, depth + 1
            )
        return node

    def _make_leaf(self, data: List[Dict[str, Any]], target: str) -> Dict[str, Any]:
        """Build a leaf with Laplace-smoothed class probabilities."""
        counts = Counter(str(row.get(target)) for row in data)
        total = len(data)
        # ROBUSTNESS FIX: fall back to the classes observed in `data` when
        # classes_ is unset (direct call before train()); previously this
        # produced an empty proba dict and max() raised ValueError.
        classes = self.classes_ or sorted(counts)
        k = len(classes) or 1
        # Laplace smoothing
        proba = {
            cls: round((counts.get(cls, 0) + 1) / (total + k), 4)
            for cls in classes
        }
        label = max(proba, key=proba.get)
        return {"__leaf__": True, "__label__": label, "__proba__": proba, "__n__": total}

    # ---------------------------------------------------------- splitting

    def _best_split(
        self, data: List[Dict[str, Any]], features: List[str], target: str
    ) -> Tuple[Optional[str], float]:
        """Return (best_feature, best_gain); gain is gain-ratio if enabled."""
        base_e = self._entropy(data, target)
        best_f, best_gain = None, -1.0
        for f in features:
            gain = self._info_gain(data, f, target, base_e)
            if self.use_gain_ratio:
                si = self._split_info(data, f)
                gain = gain / si if si > 0 else 0.0
            if gain > best_gain:
                best_gain = gain
                best_f = f
        return best_f, best_gain

    # ----------------------------------------------------------- pruning

    def _prune(self, node: Any, data: List[Dict[str, Any]], target: str) -> Any:
        """Bottom-up chi-squared pruning: collapse non-significant splits."""
        if not isinstance(node, dict) or node.get("__leaf__"):
            return node
        feature = node["__feature__"]
        # Recurse children first
        for val in list(node["__branches__"].keys()):
            subset = [r for r in data if str(r.get(feature)) == str(val)]
            node["__branches__"][val] = self._prune(node["__branches__"][val], subset, target)
        # Chi-squared test: if split is not significant, collapse to leaf
        if not self._chi2_significant(data, feature, target):
            return self._make_leaf(data, target)
        return node

    def _chi2_significant(
        self, data: List[Dict[str, Any]], feature: str, target: str
    ) -> bool:
        """Pearson chi-squared independence test at p=0.05.

        Returns True when the feature/target association is significant
        (i.e. the split should be kept).
        """
        classes = self.classes_
        feature_vals = list({str(r.get(feature)) for r in data})
        if not classes or len(feature_vals) < 2:
            return False
        total = len(data)
        class_totals = Counter(str(r.get(target)) for r in data)
        chi2 = 0.0
        for val in feature_vals:
            subset = [r for r in data if str(r.get(feature)) == val]
            n_val = len(subset)
            val_counts = Counter(str(r.get(target)) for r in subset)
            for cls in classes:
                observed = val_counts.get(cls, 0)
                expected = (n_val * class_totals.get(cls, 0)) / total
                if expected > 0:
                    chi2 += (observed - expected) ** 2 / expected
        df = (len(feature_vals) - 1) * (len(classes) - 1)
        if df <= 0:
            return False
        # Critical values at p=0.05; linear extrapolation beyond df=6 is a
        # rough conservative approximation.
        crit_table = {1: 3.841, 2: 5.991, 3: 7.815, 4: 9.488, 5: 11.070, 6: 12.592}
        crit = crit_table.get(df, 3.841 * df)
        return chi2 > crit

    # ---------------------------------------------------------- classify

    def _classify(
        self, node: Any, row: Dict[str, Any], path: List[str]
    ) -> Tuple[Any, Any]:
        """Walk the tree for `row`, appending each decision to `path`.

        Unseen feature values are handled by a sample-weighted vote over
        the node's leaf children (falls back to the first class if no
        children are leaves).
        """
        if not isinstance(node, dict):
            return node, {str(node): 1.0}
        if node.get("__leaf__"):
            label = node["__label__"]
            proba = node["__proba__"]
            path.append(f"predict={label} (p={proba.get(label, 0):.2f})")
            return label, proba

        feature = node["__feature__"]
        value = str(row.get(feature, ""))
        path.append(f"{feature}={value}")

        branches = node["__branches__"]
        if value in branches:
            return self._classify(branches[value], row, path)

        # Unseen value: weighted vote from all leaf children
        all_proba: Counter = Counter()
        total_n = 0
        for child in branches.values():
            if isinstance(child, dict) and child.get("__leaf__"):
                n = child.get("__n__", 1)
                total_n += n
                for cls, p in child.get("__proba__", {}).items():
                    all_proba[cls] += p * n

        if not total_n:
            fallback = self.classes_[0] if self.classes_ else "Unknown"
            path.append(f"unseen fallback: {fallback}")
            return fallback, {fallback: 1.0}

        proba = {cls: round(v / total_n, 4) for cls, v in all_proba.items()}
        label = max(proba, key=proba.get)
        path.append(f"weighted vote: {label}")
        return label, proba

    # ---------------------------------------------------------- entropy math

    def _entropy(self, data: List[Dict[str, Any]], target: str) -> float:
        """Shannon entropy (bits) of the target distribution in `data`."""
        if not data:
            return 0.0
        counts = Counter(str(row.get(target)) for row in data)
        total = len(data)
        return -sum((c / total) * math.log2(c / total) for c in counts.values() if c > 0)

    def _info_gain(
        self,
        data: List[Dict[str, Any]],
        feature: str,
        target: str,
        base_entropy: Optional[float] = None,
    ) -> float:
        """Information gain of splitting `data` on `feature`."""
        if base_entropy is None:
            base_entropy = self._entropy(data, target)
        total = len(data)
        buckets: Dict[Any, list] = {}
        for row in data:
            buckets.setdefault(row.get(feature), []).append(row)
        weighted = sum(
            (len(sub) / total) * self._entropy(sub, target) for sub in buckets.values()
        )
        return base_entropy - weighted

    def _split_info(self, data: List[Dict[str, Any]], feature: str) -> float:
        """Intrinsic information of a split (denominator of gain ratio)."""
        total = len(data)
        counts = Counter(row.get(feature) for row in data)
        return -sum((c / total) * math.log2(c / total) for c in counts.values() if c > 0)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ factory
|
||||
|
||||
def get_behavior_model(
    max_depth: int = 5,
    min_samples_split: int = 8,
    min_info_gain: float = 0.005,
    use_gain_ratio: bool = True,
    chi2_pruning: bool = True,
) -> ID3Classifier:
    """Factory for an ID3Classifier tuned for behavior analysis.

    Defaults differ from the raw classifier's (shallower tree, stricter
    split thresholds, gain-ratio on) to suit the banded feature set.
    """
    params = {
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_info_gain": min_info_gain,
        "use_gain_ratio": use_gain_ratio,
        "chi2_pruning": chi2_pruning,
    }
    return ID3Classifier(**params)
|
||||
539
app/services/ml/ml_data_collector.py
Normal file
539
app/services/ml/ml_data_collector.py
Normal file
@@ -0,0 +1,539 @@
|
||||
"""
|
||||
ML Data Collector - Production Grade
|
||||
======================================
|
||||
Logs every assignment call (inputs + outcomes) to SQLite.
|
||||
|
||||
Key upgrades over the original
|
||||
--------------------------------
|
||||
1. FROZEN historical scores - quality_score is written ONCE at log time.
|
||||
get_training_data() returns scores as-is from the DB (no retroactive mutation).
|
||||
2. Rich schema - zone_id, city_id, is_peak, weather_code,
|
||||
sla_breached, avg_delivery_time_min for richer features.
|
||||
3. SLA tracking - logs whether delivery SLA was breached.
|
||||
4. Analytics API - get_hourly_stats(), get_strategy_comparison(),
|
||||
get_quality_histogram(), get_zone_stats() for dashboard consumption.
|
||||
5. Thread-safe writes - connection-per-write pattern for FastAPI workers.
|
||||
6. Indexed columns - timestamp, ml_strategy, zone_id for fast queries.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DB_PATH = os.getenv("ML_DB_PATH", "ml_data/ml_store.db")
|
||||
_WRITE_LOCK = threading.Lock()
|
||||
|
||||
|
||||
def _std(values: List[float]) -> float:
|
||||
if len(values) < 2:
|
||||
return 0.0
|
||||
mean = sum(values) / len(values)
|
||||
return (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5
|
||||
|
||||
|
||||
class MLDataCollector:
    """
    Event logger for assignment service calls.

    Each log_assignment_event() call writes one row capturing:
      - Operating context (time, orders, riders, zone, city)
      - Active hyperparams (exact config snapshot for this call)
      - Measured outcomes (quality score, SLA, latency, distances)

    quality_score is computed once and FROZEN - never retroactively changed.
    """

    def __init__(self):
        # DB path comes from the module-level, env-configurable constant.
        self._db_path = _DB_PATH
        # NOTE(review): _ensure_db is defined elsewhere in this file —
        # presumably creates the table/indexes idempotently; confirm.
        self._ensure_db()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Main logging API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    def log_assignment_event(
        self,
        *,
        num_orders: int,
        num_riders: int,
        hyperparams: Dict[str, Any],
        assignments: Dict[int, List[Any]],
        unassigned_count: int,
        elapsed_ms: float,
        zone_id: str = "default",
        city_id: str = "default",
        weather_code: str = "CLEAR",
        sla_minutes: Optional[float] = None,
        avg_delivery_time_min: Optional[float] = None,
    ) -> None:
        """
        Log one assignment event.

        Call this at the END of AssignmentService.assign_orders() once
        outcomes are known.

        Args:
            num_orders / num_riders: demand and supply for this call.
            hyperparams: exact config snapshot used for this call; read with
                defaults so partial dicts are tolerated.
            assignments: rider-id -> list of assigned orders.
            unassigned_count: orders left unassigned.
            elapsed_ms: solver wall time in milliseconds.
            zone_id / city_id / weather_code: operating context tags.
            sla_minutes / avg_delivery_time_min: optional SLA inputs; breach
                is recorded only when both are provided and truthy.

        Never raises: any failure is logged as a non-fatal warning.
        """
        try:
            # NOTE(review): datetime.utcnow() is naive UTC (deprecated in
            # 3.12). Switching to datetime.now(timezone.utc) would change the
            # stored timestamp format — confirm downstream parsing first.
            now = datetime.utcnow()
            hour = now.hour
            day_of_week = now.weekday()
            # Peak = morning / lunch / evening rush hours.
            is_peak = int(hour in (7, 8, 9, 12, 13, 18, 19, 20))

            # Riders with at least one order; empty lists are excluded from
            # load statistics.
            rider_loads = [len(orders) for orders in assignments.values() if orders]
            riders_used = len(rider_loads)
            total_assigned = sum(rider_loads)
            avg_load = total_assigned / riders_used if riders_used else 0.0
            load_std = _std(rider_loads) if rider_loads else 0.0

            all_orders = [o for orders in assignments.values() if orders for o in orders]
            # NOTE(review): _get_km is defined elsewhere in this file —
            # presumably extracts a per-order distance in km; confirm.
            total_distance_km = sum(self._get_km(o) for o in all_orders)
            ml_strategy = hyperparams.get("ml_strategy", "balanced")
            max_opr = hyperparams.get("max_orders_per_rider", 12)

            # SLA breach only when both inputs are present and non-zero.
            sla_breached = 0
            if sla_minutes and avg_delivery_time_min:
                sla_breached = int(avg_delivery_time_min > sla_minutes)

            # Quality score - FROZEN at log time
            quality_score = self._compute_quality_score(
                num_orders=num_orders,
                unassigned_count=unassigned_count,
                load_std=load_std,
                riders_used=riders_used,
                num_riders=num_riders,
                total_distance_km=total_distance_km,
                max_orders_per_rider=max_opr,
                ml_strategy=ml_strategy,
            )

            # One flat row per event: context + hyperparam snapshot + outcomes.
            row = {
                "timestamp": now.isoformat(),
                "hour": hour,
                "day_of_week": day_of_week,
                "is_peak": is_peak,
                "zone_id": zone_id,
                "city_id": city_id,
                "weather_code": weather_code,
                "num_orders": num_orders,
                "num_riders": num_riders,
                "max_pickup_distance_km": hyperparams.get("max_pickup_distance_km", 10.0),
                "max_kitchen_distance_km": hyperparams.get("max_kitchen_distance_km", 3.0),
                "max_orders_per_rider": max_opr,
                "ideal_load": hyperparams.get("ideal_load", 6),
                "workload_balance_threshold": hyperparams.get("workload_balance_threshold", 0.7),
                "workload_penalty_weight": hyperparams.get("workload_penalty_weight", 100.0),
                "distance_penalty_weight": hyperparams.get("distance_penalty_weight", 2.0),
                "cluster_radius_km": hyperparams.get("cluster_radius_km", 3.0),
                "search_time_limit_seconds": hyperparams.get("search_time_limit_seconds", 5),
                "road_factor": hyperparams.get("road_factor", 1.3),
                "ml_strategy": ml_strategy,
                "riders_used": riders_used,
                "total_assigned": total_assigned,
                "unassigned_count": unassigned_count,
                "avg_load": round(avg_load, 3),
                "load_std": round(load_std, 3),
                "total_distance_km": round(total_distance_km, 2),
                "elapsed_ms": round(elapsed_ms, 1),
                "sla_breached": sla_breached,
                "avg_delivery_time_min": round(avg_delivery_time_min or 0.0, 2),
                "quality_score": round(quality_score, 2),
            }

            # Serialize writes across FastAPI worker threads.
            with _WRITE_LOCK:
                self._insert(row)

            logger.info(
                f"[MLCollector] zone={zone_id} orders={num_orders} "
                f"assigned={total_assigned} unassigned={unassigned_count} "
                f"quality={quality_score:.1f} elapsed={elapsed_ms:.0f}ms"
            )

        except Exception as e:
            # Best-effort logging: never let telemetry break the caller.
            logger.warning(f"[MLCollector] Logging failed (non-fatal): {e}")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Data retrieval for training
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_training_data(
    self,
    min_records: int = 30,
    strategy_filter: Optional[str] = None,
    since_hours: Optional[int] = None,
) -> Optional[List[Dict[str, Any]]]:
    """
    Return logged rows for model training, oldest first.

    quality_score is returned AS-IS (frozen at log time - no re-scoring).

    Args:
        min_records: Minimum number of matching rows required; with fewer,
            None is returned so callers skip training.
        strategy_filter: Optional exact match on ml_strategy.
        since_hours: Optional recency window; only rows newer than
            now - since_hours are returned. Note 0 is falsy and disables
            the filter (preserved original behavior).

    Returns:
        List of row dicts, or None on insufficient data / any error.
    """
    conn = None
    try:
        conn = sqlite3.connect(self._db_path)
        conn.row_factory = sqlite3.Row

        query = "SELECT * FROM assignment_ml_log"
        params: list = []
        clauses: list = []

        if strategy_filter:
            clauses.append("ml_strategy = ?")
            params.append(strategy_filter)
        if since_hours:
            cutoff = (datetime.utcnow() - timedelta(hours=since_hours)).isoformat()
            clauses.append("timestamp >= ?")
            params.append(cutoff)

        if clauses:
            query += " WHERE " + " AND ".join(clauses)
        query += " ORDER BY id ASC"

        rows = conn.execute(query, params).fetchall()

        if len(rows) < min_records:
            logger.info(f"[MLCollector] {len(rows)} records < {min_records} minimum.")
            return None

        return [dict(r) for r in rows]

    except Exception as e:
        logger.error(f"[MLCollector] get_training_data failed: {e}")
        return None
    finally:
        # BUG FIX: the original closed the connection only on the happy path,
        # leaking it whenever execute() raised.
        if conn is not None:
            conn.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Analytics API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_recent_quality_trend(self, last_n: int = 50) -> Dict[str, Any]:
    """Recent quality scores + series for sparkline charts.

    Returns avg/min/max quality over the most recent ``last_n`` runs plus
    chronologically ordered series (history, timestamps, unassigned counts,
    latencies). Falls back to an empty summary on any error.
    """
    conn = None
    try:
        conn = sqlite3.connect(self._db_path)
        rows = conn.execute(
            "SELECT quality_score, timestamp, unassigned_count, elapsed_ms "
            "FROM assignment_ml_log ORDER BY id DESC LIMIT ?", (last_n,)
        ).fetchall()
        if not rows:
            return {"avg_quality": 0.0, "sample_size": 0, "history": []}
        scores = [r[0] for r in rows]
        # Rows come back newest-first; reverse each series so charts read
        # left-to-right chronologically.
        return {
            "avg_quality": round(sum(scores) / len(scores), 2),
            "min_quality": round(min(scores), 2),
            "max_quality": round(max(scores), 2),
            "sample_size": len(scores),
            "history": list(reversed(scores)),
            "timestamps": list(reversed([r[1] for r in rows])),
            "unassigned_series": list(reversed([r[2] for r in rows])),
            "latency_series": list(reversed([r[3] for r in rows])),
        }
    except Exception as e:
        # BUG FIX: the original swallowed errors with no logging, unlike
        # every sibling analytics method.
        logger.error(f"[MLCollector] get_recent_quality_trend: {e}")
        return {"avg_quality": 0.0, "sample_size": 0, "history": []}
    finally:
        # BUG FIX: close the connection on the error path too (it leaked).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_hourly_stats(self, last_days: int = 7) -> List[Dict[str, Any]]:
    """Quality, SLA, and call volume aggregated by hour-of-day.

    Only rows logged within the last ``last_days`` days are considered.
    Returns one dict per distinct hour, ordered by hour; [] on any error.
    """
    conn = None
    try:
        conn = sqlite3.connect(self._db_path)
        cutoff = (datetime.utcnow() - timedelta(days=last_days)).isoformat()
        rows = conn.execute(
            """
            SELECT hour,
                   COUNT(*) AS call_count,
                   AVG(quality_score) AS avg_quality,
                   AVG(unassigned_count) AS avg_unassigned,
                   AVG(elapsed_ms) AS avg_latency_ms,
                   SUM(CASE WHEN sla_breached=1 THEN 1 ELSE 0 END) AS sla_breaches
            FROM assignment_ml_log WHERE timestamp >= ?
            GROUP BY hour ORDER BY hour
            """, (cutoff,)
        ).fetchall()
        return [
            {
                "hour": r[0],
                "call_count": r[1],
                # AVG() returns NULL for all-NULL groups; coalesce to 0.0.
                "avg_quality": round(r[2] or 0.0, 2),
                "avg_unassigned": round(r[3] or 0.0, 2),
                "avg_latency_ms": round(r[4] or 0.0, 1),
                "sla_breaches": r[5],
            }
            for r in rows
        ]
    except Exception as e:
        logger.error(f"[MLCollector] get_hourly_stats: {e}")
        return []
    finally:
        # BUG FIX: the original leaked the connection when the query raised.
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_strategy_comparison(self) -> List[Dict[str, Any]]:
    """Compare quality metrics across ml_strategy values.

    Produces one summary dict per strategy (call volume, quality spread,
    unassigned/distance/latency averages), ordered best average quality
    first. Returns [] on any error.
    """
    sql = """
        SELECT ml_strategy,
               COUNT(*) AS call_count,
               AVG(quality_score) AS avg_quality,
               MIN(quality_score) AS min_quality,
               MAX(quality_score) AS max_quality,
               AVG(unassigned_count) AS avg_unassigned,
               AVG(total_distance_km) AS avg_distance_km,
               AVG(elapsed_ms) AS avg_latency_ms
        FROM assignment_ml_log
        GROUP BY ml_strategy ORDER BY avg_quality DESC
    """
    try:
        connection = sqlite3.connect(self._db_path)
        records = connection.execute(sql).fetchall()
        connection.close()
        summaries = []
        for rec in records:
            summaries.append({
                "strategy": rec[0],
                "call_count": rec[1],
                "avg_quality": round(rec[2] or 0.0, 2),
                "min_quality": round(rec[3] or 0.0, 2),
                "max_quality": round(rec[4] or 0.0, 2),
                "avg_unassigned": round(rec[5] or 0.0, 2),
                "avg_distance_km": round(rec[6] or 0.0, 2),
                "avg_latency_ms": round(rec[7] or 0.0, 1),
            })
        return summaries
    except Exception as e:
        logger.error(f"[MLCollector] get_strategy_comparison: {e}")
        return []
|
||||
|
||||
def get_quality_histogram(self, bins: int = 10) -> List[Dict[str, Any]]:
    """Quality score distribution for histogram chart.

    Buckets all logged scores into ``bins`` equal-width ranges over
    [0, 100]. Returns [] when there are no scores or on any error.

    BUG FIX: the original used a strict upper bound on every bin, so a
    perfect score of exactly 100.0 (the cap of the quality formula) was
    never counted; the last bin is now inclusive at its upper edge.
    """
    conn = None
    try:
        conn = sqlite3.connect(self._db_path)
        rows = conn.execute("SELECT quality_score FROM assignment_ml_log").fetchall()
        scores = [r[0] for r in rows if r[0] is not None]
        if not scores:
            return []
        bin_width = 100.0 / bins
        histogram = []
        for i in range(bins):
            lo, hi = i * bin_width, (i + 1) * bin_width
            if i == bins - 1:
                count = sum(1 for s in scores if lo <= s <= hi)
            else:
                count = sum(1 for s in scores if lo <= s < hi)
            histogram.append({"range": f"{lo:.0f}-{hi:.0f}", "count": count})
        return histogram
    except Exception as e:
        logger.error(f"[MLCollector] get_quality_histogram: {e}")
        return []
    finally:
        # Close even when the query raised (the original leaked here).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_zone_stats(self) -> List[Dict[str, Any]]:
    """Quality and SLA stats grouped by zone, best average quality first.

    Returns one dict per zone; [] on any error.
    """
    sql = """
        SELECT zone_id, COUNT(*) AS call_count,
               AVG(quality_score) AS avg_quality,
               SUM(sla_breached) AS sla_breaches,
               AVG(total_distance_km) AS avg_distance_km
        FROM assignment_ml_log
        GROUP BY zone_id ORDER BY avg_quality DESC
    """
    try:
        connection = sqlite3.connect(self._db_path)
        records = connection.execute(sql).fetchall()
        connection.close()
        stats = []
        for zone_id, call_count, avg_quality, sla_breaches, avg_distance in records:
            stats.append({
                "zone_id": zone_id,
                "call_count": call_count,
                "avg_quality": round(avg_quality or 0.0, 2),
                "sla_breaches": sla_breaches,
                "avg_distance_km": round(avg_distance or 0.0, 2),
            })
        return stats
    except Exception as e:
        logger.error(f"[MLCollector] get_zone_stats: {e}")
        return []
|
||||
|
||||
def count_records(self) -> int:
    """Total number of logged assignment runs (0 on any error)."""
    try:
        connection = sqlite3.connect(self._db_path)
        (total,) = connection.execute("SELECT COUNT(*) FROM assignment_ml_log").fetchone()
        connection.close()
        return total
    except Exception:
        return 0
|
||||
|
||||
def count_by_strategy(self) -> Dict[str, int]:
    """Row counts keyed by ml_strategy ({} on any error)."""
    try:
        connection = sqlite3.connect(self._db_path)
        pairs = connection.execute(
            "SELECT ml_strategy, COUNT(*) FROM assignment_ml_log GROUP BY ml_strategy"
        ).fetchall()
        connection.close()
        counts: Dict[str, int] = {}
        for strategy, total in pairs:
            counts[strategy] = total
        return counts
    except Exception:
        return {}
|
||||
|
||||
def export_csv(self) -> str:
    """Export all records as a CSV string.

    Column order follows the table schema; rows are ordered by id.
    Returns "" when the table is empty or on any error.
    """
    try:
        connection = sqlite3.connect(self._db_path)
        connection.row_factory = sqlite3.Row
        records = connection.execute(
            "SELECT * FROM assignment_ml_log ORDER BY id ASC"
        ).fetchall()
        connection.close()
        if not records:
            return ""
        out = io.StringIO()
        writer = csv.DictWriter(out, fieldnames=records[0].keys())
        writer.writeheader()
        writer.writerows([dict(rec) for rec in records])
        return out.getvalue()
    except Exception as e:
        logger.error(f"[MLCollector] export_csv failed: {e}")
        return ""
|
||||
|
||||
def purge_old_records(self, keep_days: int = 90) -> int:
    """Delete records older than keep_days. Returns count deleted.

    Returns 0 on any error (best-effort maintenance operation).
    """
    conn = None
    try:
        cutoff = (datetime.utcnow() - timedelta(days=keep_days)).isoformat()
        conn = sqlite3.connect(self._db_path)
        cursor = conn.execute(
            "DELETE FROM assignment_ml_log WHERE timestamp < ?", (cutoff,)
        )
        deleted = cursor.rowcount
        conn.commit()
        logger.info(f"[MLCollector] Purged {deleted} records older than {keep_days} days.")
        return deleted
    except Exception as e:
        logger.error(f"[MLCollector] purge failed: {e}")
        return 0
    finally:
        # BUG FIX: the original skipped conn.close() when the DELETE raised,
        # leaking the connection.
        if conn is not None:
            conn.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Quality Score Formula (frozen at log time - do not change behavior)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _compute_quality_score(
|
||||
num_orders: int, unassigned_count: int, load_std: float,
|
||||
riders_used: int, num_riders: int, total_distance_km: float,
|
||||
max_orders_per_rider: int, ml_strategy: str = "balanced",
|
||||
) -> float:
|
||||
if num_orders == 0:
|
||||
return 0.0
|
||||
assigned_ratio = 1.0 - (unassigned_count / num_orders)
|
||||
max_std = max(1.0, max_orders_per_rider / 2.0)
|
||||
balance_ratio = max(0.0, 1.0 - (load_std / max_std))
|
||||
max_dist = max(1.0, float((num_orders - unassigned_count) * 8.0))
|
||||
distance_ratio = max(0.0, 1.0 - (total_distance_km / max_dist))
|
||||
weights = {
|
||||
"aggressive_speed": (80.0, 20.0, 0.0),
|
||||
"fuel_saver": (30.0, 70.0, 0.0),
|
||||
"zone_strict": (40.0, 30.0, 30.0),
|
||||
"balanced": (50.0, 25.0, 25.0),
|
||||
}
|
||||
w_comp, w_dist, w_bal = weights.get(ml_strategy, (50.0, 25.0, 25.0))
|
||||
return min(
|
||||
assigned_ratio * w_comp + distance_ratio * w_dist + balance_ratio * w_bal,
|
||||
100.0,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_km(order: Any) -> float:
|
||||
try:
|
||||
return float(order.get("kms") or order.get("calculationDistanceKm") or 0.0)
|
||||
except Exception:
|
||||
return 0.0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# DB Bootstrap
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _ensure_db(self) -> None:
    """Create the log table, apply additive migrations, and build indexes.

    Idempotent by construction: CREATE TABLE / CREATE INDEX use
    IF NOT EXISTS, and duplicate-column ALTERs are swallowed. Any failure
    is logged rather than raised (collection is non-fatal by design).
    """
    try:
        # Ensure the parent directory exists before sqlite tries to open the file.
        os.makedirs(os.path.dirname(self._db_path) or ".", exist_ok=True)
        conn = sqlite3.connect(self._db_path)
        conn.execute("""
            CREATE TABLE IF NOT EXISTS assignment_ml_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp TEXT NOT NULL,
                hour INTEGER,
                day_of_week INTEGER,
                is_peak INTEGER DEFAULT 0,
                zone_id TEXT DEFAULT 'default',
                city_id TEXT DEFAULT 'default',
                weather_code TEXT DEFAULT 'CLEAR',
                num_orders INTEGER,
                num_riders INTEGER,
                max_pickup_distance_km REAL,
                max_kitchen_distance_km REAL,
                max_orders_per_rider INTEGER,
                ideal_load INTEGER,
                workload_balance_threshold REAL,
                workload_penalty_weight REAL,
                distance_penalty_weight REAL,
                cluster_radius_km REAL,
                search_time_limit_seconds INTEGER,
                road_factor REAL,
                ml_strategy TEXT DEFAULT 'balanced',
                riders_used INTEGER,
                total_assigned INTEGER,
                unassigned_count INTEGER,
                avg_load REAL,
                load_std REAL,
                total_distance_km REAL DEFAULT 0.0,
                elapsed_ms REAL,
                sla_breached INTEGER DEFAULT 0,
                avg_delivery_time_min REAL DEFAULT 0.0,
                quality_score REAL
            )
        """)
        # Additive migrations so databases created by older schema versions
        # gain the newer columns. Each ALTER fails harmlessly when the
        # column already exists (sqlite has no ADD COLUMN IF NOT EXISTS).
        migrations = [
            "ALTER TABLE assignment_ml_log ADD COLUMN is_peak INTEGER DEFAULT 0",
            "ALTER TABLE assignment_ml_log ADD COLUMN zone_id TEXT DEFAULT 'default'",
            "ALTER TABLE assignment_ml_log ADD COLUMN city_id TEXT DEFAULT 'default'",
            "ALTER TABLE assignment_ml_log ADD COLUMN weather_code TEXT DEFAULT 'CLEAR'",
            "ALTER TABLE assignment_ml_log ADD COLUMN sla_breached INTEGER DEFAULT 0",
            "ALTER TABLE assignment_ml_log ADD COLUMN avg_delivery_time_min REAL DEFAULT 0.0",
            "ALTER TABLE assignment_ml_log ADD COLUMN ml_strategy TEXT DEFAULT 'balanced'",
            "ALTER TABLE assignment_ml_log ADD COLUMN total_distance_km REAL DEFAULT 0.0",
        ]
        for ddl in migrations:
            try:
                conn.execute(ddl)
            except Exception:
                # Expected when the column already exists - ignore.
                pass
        # Indexes for the hot query paths (recency filters, strategy and
        # zone group-bys in the analytics methods).
        for idx in [
            "CREATE INDEX IF NOT EXISTS idx_timestamp ON assignment_ml_log(timestamp)",
            "CREATE INDEX IF NOT EXISTS idx_strategy ON assignment_ml_log(ml_strategy)",
            "CREATE INDEX IF NOT EXISTS idx_zone ON assignment_ml_log(zone_id)",
        ]:
            conn.execute(idx)
        conn.commit()
        conn.close()
    except Exception as e:
        logger.error(f"[MLCollector] DB init failed: {e}")
|
||||
|
||||
def _insert(self, row: Dict[str, Any]) -> None:
|
||||
os.makedirs(os.path.dirname(self._db_path) or ".", exist_ok=True)
|
||||
conn = sqlite3.connect(self._db_path)
|
||||
cols = ", ".join(row.keys())
|
||||
placeholders = ", ".join(["?"] * len(row))
|
||||
conn.execute(
|
||||
f"INSERT INTO assignment_ml_log ({cols}) VALUES ({placeholders})",
|
||||
list(row.values()),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level singleton
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_collector: Optional[MLDataCollector] = None
|
||||
|
||||
|
||||
def get_collector() -> MLDataCollector:
    """Return the process-wide MLDataCollector, creating it on first use."""
    global _collector
    instance = _collector
    if instance is None:
        instance = MLDataCollector()
        _collector = instance
    return instance
|
||||
610
app/services/ml/ml_hypertuner.py
Normal file
610
app/services/ml/ml_hypertuner.py
Normal file
@@ -0,0 +1,610 @@
|
||||
"""
|
||||
ML Hypertuner - Production Grade
|
||||
===================================
|
||||
XGBoost surrogate model + Optuna TPE Bayesian optimization.
|
||||
|
||||
Key upgrades over the original
|
||||
--------------------------------
|
||||
1. Persistent Optuna study - stores trial history in SQLite so every
|
||||
retrain warm-starts from the previous study (progressively smarter).
|
||||
2. Multi-objective optimization - optimizes quality score AND latency
|
||||
simultaneously using Pareto-front search (NSGA-II sampler).
|
||||
3. Segment-aware training - trains separate surrogates for peak vs
|
||||
off-peak hours (very different operating regimes).
|
||||
4. Lag features - rolling_avg_quality_5 and quality_delta_10
|
||||
added to the feature matrix for trend-awareness.
|
||||
5. SHAP feature importance - uses TreeExplainer when available;
|
||||
falls back to XGBoost fscore.
|
||||
6. Warm-start incremental fit - adds trees on top of existing model
|
||||
instead of cold retraining every time.
|
||||
7. Staleness detection - warns if model is older than 24h.
|
||||
8. Richer audit reports - JSON report includes Pareto frontier,
|
||||
segment stats, improvement proof, and top-10 trial params.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.metrics import r2_score, mean_absolute_error
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import xgboost as xgb
|
||||
XGB_AVAILABLE = True
|
||||
except ImportError:
|
||||
XGB_AVAILABLE = False
|
||||
logger.warning("[Hypertuner] xgboost not installed.")
|
||||
|
||||
try:
|
||||
import optuna
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
OPTUNA_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPTUNA_AVAILABLE = False
|
||||
logger.warning("[Hypertuner] optuna not installed.")
|
||||
|
||||
try:
|
||||
import shap
|
||||
SHAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
SHAP_AVAILABLE = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Feature columns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
BASE_FEATURE_COLS = [
|
||||
"hour", "day_of_week", "is_peak",
|
||||
"num_orders", "num_riders",
|
||||
"max_pickup_distance_km", "max_kitchen_distance_km",
|
||||
"max_orders_per_rider", "ideal_load",
|
||||
"workload_balance_threshold", "workload_penalty_weight",
|
||||
"distance_penalty_weight", "cluster_radius_km",
|
||||
"search_time_limit_seconds", "road_factor",
|
||||
]
|
||||
|
||||
LAG_FEATURE_COLS = [
|
||||
"rolling_avg_quality_5", # rolling mean of last 5 quality scores
|
||||
"quality_delta_10", # quality[i] - quality[i-10]
|
||||
]
|
||||
|
||||
ALL_FEATURE_COLS = BASE_FEATURE_COLS + LAG_FEATURE_COLS
|
||||
LABEL_COL = "quality_score"
|
||||
|
||||
SEARCH_SPACE = {
|
||||
"max_pickup_distance_km": ("float", 4.0, 15.0),
|
||||
"max_kitchen_distance_km": ("float", 1.0, 8.0),
|
||||
"max_orders_per_rider": ("int", 6, 20),
|
||||
"ideal_load": ("int", 2, 10),
|
||||
"workload_balance_threshold": ("float", 0.3, 0.95),
|
||||
"workload_penalty_weight": ("float", 20.0, 200.0),
|
||||
"distance_penalty_weight": ("float", 0.5, 10.0),
|
||||
"cluster_radius_km": ("float", 1.0, 8.0),
|
||||
"search_time_limit_seconds": ("int", 2, 15),
|
||||
"road_factor": ("float", 1.1, 1.6),
|
||||
}
|
||||
|
||||
_STUDY_DB_PATH = os.getenv("ML_DB_PATH", "ml_data/ml_store.db")
|
||||
_REPORT_DIR = "ml_data/reports"
|
||||
_MAX_MODEL_AGE_H = 24
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MLHypertuner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class MLHypertuner:
|
||||
"""XGBoost surrogate + Optuna TPE / NSGA-II hyperparameter optimizer."""
|
||||
|
||||
def __init__(self):
    """Initialize empty model/bookkeeping slots, then restore the last report."""
    # Surrogate models (global + per-segment) and latest-run artifacts all
    # start unset; _load_latest_report() may repopulate some of them.
    for slot in ("_model", "_peak_model", "_offpeak_model",
                 "_model_trained_at", "_latest_validation",
                 "_latest_baseline", "_feature_importance"):
        setattr(self, slot, None)
    self._training_rows = 0
    self._top_trials = []
    self._pareto_frontier = []
    self._load_latest_report()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Main entry point
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def run(
    self,
    n_trials: int = 150,
    min_training_records: int = 30,
    context_override: Optional[Dict] = None,
    multi_objective: bool = False,
    segment_aware: bool = True,
) -> Dict[str, Any]:
    """Full pipeline: load -> engineer -> validate -> train -> search -> write.

    Args:
        n_trials: Number of Optuna trials to run.
        min_training_records: Minimum logged rows required before training.
        context_override: Optional fixed operating context for prediction;
            when None a context is derived from recent records.
        multi_objective: Use NSGA-II quality+latency Pareto search instead
            of single-objective TPE.
        segment_aware: Also train separate peak/off-peak surrogates when
            enough data exists.

    Returns:
        Status dict; "status" is one of "ok", "error",
        "insufficient_data", or "model_not_ready".
    """
    # Both optional heavy deps must be present; report exactly what is missing.
    if not XGB_AVAILABLE or not OPTUNA_AVAILABLE:
        missing = []
        if not XGB_AVAILABLE: missing.append("xgboost")
        if not OPTUNA_AVAILABLE: missing.append("optuna")
        return {"status": "error", "message": f"Missing: {', '.join(missing)}"}

    # Imported lazily to avoid a circular import at module load time.
    from app.services.ml.ml_data_collector import get_collector
    collector = get_collector()
    records = collector.get_training_data(min_records=min_training_records)

    if records is None:
        count = collector.count_records()
        return {
            "status": "insufficient_data",
            "message": f"{count} records - need >={min_training_records}.",
            "records_available": count,
            "records_needed": min_training_records,
        }

    # Feature engineering: add rolling/trend quality features, then build X/y.
    records = self._add_lag_features(records)
    X, y = self._prepare_data(records, ALL_FEATURE_COLS)
    if X is None or len(X) == 0:
        return {"status": "error", "message": "Data preparation failed."}

    # Validate surrogate accuracy BEFORE trusting its predictions.
    cv_results = self._cross_validate(X, y)
    logger.info(f"[Hypertuner] CV: R2={cv_results['r2_score']:.3f}, MAE={cv_results['mae']:.2f}")

    self._train_model(X, y, model_attr="_model")
    self._latest_validation = cv_results

    # Optional per-regime surrogates: peak vs off-peak behave differently.
    # Thresholds (60 total / 20 per segment) keep segment models from
    # training on too little data.
    if segment_aware and len(records) >= 60:
        peak_recs = [r for r in records if r.get("is_peak", 0) == 1]
        offpeak_recs = [r for r in records if r.get("is_peak", 0) == 0]
        if len(peak_recs) >= 20:
            Xp, yp = self._prepare_data(peak_recs, ALL_FEATURE_COLS)
            self._train_model(Xp, yp, model_attr="_peak_model")
        if len(offpeak_recs) >= 20:
            Xo, yo = self._prepare_data(offpeak_recs, ALL_FEATURE_COLS)
            self._train_model(Xo, yo, model_attr="_offpeak_model")

    baseline_stats = self._compute_baseline_stats(records)
    self._latest_baseline = baseline_stats
    context = context_override or self._get_current_context(records)

    # Bayesian search over SEARCH_SPACE using the trained surrogate(s).
    if multi_objective:
        best_params, best_score, pareto = self._optuna_search_multi(context, n_trials)
        self._pareto_frontier = pareto
    else:
        best_params, best_score = self._optuna_search_single(context, n_trials)

    if best_params is None:
        return {"status": "error", "message": "Optuna search failed."}

    improvement = round(best_score - baseline_stats["avg_quality"], 2)
    self._compute_feature_importance()

    # Guardrail: refuse to overwrite live config when the surrogate is
    # unreliable (R2 below 0.5) - search already ran, but nothing is applied.
    if cv_results["r2_score"] < 0.5:
        return {
            "status": "model_not_ready",
            "message": f"R2={cv_results['r2_score']:.3f} too low.",
            "validation": cv_results,
            "training_rows": len(records),
            "action_taken": "none - existing config preserved",
        }

    # Best-effort write of the winning params into the dynamic config.
    try:
        from app.config.dynamic_config import get_config
        get_config().set_bulk(best_params, source="ml_hypertuner")
    except ImportError:
        logger.info("[Hypertuner] DynamicConfig not available - params not written to config.")

    self._save_report(best_params, best_score, len(records), n_trials, cv_results, baseline_stats)

    return {
        "status": "ok",
        "best_params": best_params,
        "best_predicted_quality": round(best_score, 2),
        "training_rows": len(records),
        "trials_run": n_trials,
        "context_used": context,
        "validation": cv_results,
        "improvement_proof": {
            "baseline_avg_quality": baseline_stats["avg_quality"],
            "baseline_worst": baseline_stats["worst_quality"],
            "baseline_best": baseline_stats["best_quality"],
            "ml_predicted_quality": round(best_score, 2),
            "predicted_improvement": improvement,
            "verdict": (
                "ML params significantly better" if improvement > 5 else
                "Marginal improvement - keep collecting data" if improvement > 0 else
                "No improvement - defaults may be near-optimal"
            ),
        },
        "feature_importance": self._feature_importance,
        "top_trials": self._top_trials[:5],
        "message": "Hyperparameters updated successfully.",
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Feature Engineering
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _add_lag_features(self, records: List[Dict]) -> List[Dict]:
|
||||
scores = [float(r.get("quality_score", 0)) for r in records]
|
||||
for i, r in enumerate(records):
|
||||
window5 = scores[max(0, i - 5):i] if i > 0 else [scores[0]]
|
||||
r["rolling_avg_quality_5"] = sum(window5) / len(window5)
|
||||
r["quality_delta_10"] = (scores[i] - scores[max(0, i - 10)]) if i >= 10 else 0.0
|
||||
return records
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Data Preparation
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _prepare_data(
    self, records: List[Dict], feature_cols: List[str]
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Build (X, y) float32 matrices from record dicts.

    Missing, falsy, or non-numeric feature values are coerced to 0.0.
    Labels come from LABEL_COL. Returns (None, None) on any failure.
    """
    def _coerce(value) -> float:
        # Mirror of the original per-cell handling: falsy -> 0, bad -> 0.0.
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return 0.0

    try:
        feature_matrix = [
            [_coerce(rec.get(col, 0)) for col in feature_cols]
            for rec in records
        ]
        labels = [float(rec.get(LABEL_COL, 0)) for rec in records]
        return (
            np.array(feature_matrix, dtype=np.float32),
            np.array(labels, dtype=np.float32),
        )
    except Exception as e:
        logger.error(f"[Hypertuner] Data prep failed: {e}")
        return None, None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Model Training (warm-start capable)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _train_model(self, X: np.ndarray, y: np.ndarray, model_attr: str = "_model") -> None:
    """Fit (or warm-update) the XGBoost surrogate stored at *model_attr*.

    When a model already exists, 50 extra trees are stacked on top of its
    booster (incremental fit); otherwise a fresh 300-tree model is trained.
    The warm path silently falls back to a cold retrain on any failure.
    """
    base_kwargs = {
        "max_depth": 5, "learning_rate": 0.04,
        "subsample": 0.8, "colsample_bytree": 0.8,
        "reg_alpha": 0.1, "reg_lambda": 1.0, "random_state": 42, "verbosity": 0,
    }

    def _record(model) -> None:
        # Store the model; the global surrogate also tracks training metadata.
        setattr(self, model_attr, model)
        if model_attr == "_model":
            self._model_trained_at = datetime.utcnow()
            self._training_rows = len(X)

    previous = getattr(self, model_attr, None)
    if previous is not None:
        try:
            updated = xgb.XGBRegressor(n_estimators=50, **base_kwargs)
            updated.fit(X, y, xgb_model=previous.get_booster())
            _record(updated)
            logger.info(f"[Hypertuner] XGBoost warm-updated ({model_attr}) - {len(X)} rows.")
            return
        except Exception:
            pass  # deliberate best-effort: fall through to a cold retrain

    fresh = xgb.XGBRegressor(n_estimators=300, **base_kwargs)
    fresh.fit(X, y)
    _record(fresh)
    logger.info(f"[Hypertuner] XGBoost trained ({model_attr}) - {len(X)} rows.")
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cross Validation
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _cross_validate(self, X: np.ndarray, y: np.ndarray, k: int = 5) -> Dict:
    """Estimate surrogate accuracy (R2 / MAE) and map it to a trust level.

    Uses k-fold CV when there are at least 2*k rows; otherwise falls back
    to a single 80/20 holdout, and returns an "insufficient_data" sentinel
    when even the holdout slice would be empty.
    """
    if len(X) >= k * 2:
        splitter = KFold(n_splits=k, shuffle=True, random_state=42)
        r2_list, mae_list = [], []
        for train_idx, test_idx in splitter.split(X):
            fold_model = xgb.XGBRegressor(n_estimators=100, max_depth=4, verbosity=0, random_state=42)
            fold_model.fit(X[train_idx], y[train_idx])
            fold_preds = fold_model.predict(X[test_idx])
            r2_list.append(r2_score(y[test_idx], fold_preds))
            mae_list.append(mean_absolute_error(y[test_idx], fold_preds))
        r2 = float(np.mean(r2_list))
        mae = float(np.mean(mae_list))
        folds_used = k
    else:
        cut = max(1, int(len(X) * 0.8))
        X_train, X_hold = X[:cut], X[cut:]
        y_train, y_hold = y[:cut], y[cut:]
        if len(X_hold) == 0:
            return {"r2_score": 0.0, "mae": 99.0, "trust_level": "insufficient_data",
                    "trust_score": 0, "folds": 0}
        holdout_model = xgb.XGBRegressor(n_estimators=100, max_depth=4, verbosity=0, random_state=42)
        holdout_model.fit(X_train, y_train)
        r2 = float(r2_score(y_hold, holdout_model.predict(X_hold)))
        mae = float(mean_absolute_error(y_hold, holdout_model.predict(X_hold)))
        folds_used = 1

    # Map R2 to a human-readable trust tier.
    if r2 >= 0.85:
        trust_level, trust_score = "excellent", 5
    elif r2 >= 0.75:
        trust_level, trust_score = "strong", 4
    elif r2 >= 0.60:
        trust_level, trust_score = "good", 3
    elif r2 >= 0.50:
        trust_level, trust_score = "acceptable", 2
    else:
        trust_level, trust_score = "poor - need more data", 1

    return {
        "r2_score": round(r2, 4),
        "mae": round(mae, 3),
        "folds": folds_used,
        "trust_level": trust_level,
        "trust_score": trust_score,
        "interpretation": f"Predictions off by +/-{mae:.1f} pts (R2={r2:.2f}, trust={trust_level})",
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Optuna - Single Objective (persistent SQLite storage)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _optuna_search_single(self, context: Dict, n_trials: int) -> Tuple[Optional[Dict], float]:
    """Single-objective TPE search for the params maximizing predicted quality.

    The study is stored in SQLite (load_if_exists=True), so every run
    warm-starts from all previous trials. Also records the top-10 completed
    trials in self._top_trials for reporting.

    Returns:
        (best_params restricted to SEARCH_SPACE keys, best predicted score),
        or (None, 0.0) on any failure.
    """
    def objective(trial):
        params = self._sample_params(trial)
        # Infeasible combination: the ideal load cannot exceed the hard
        # per-rider cap; score it 0 so the sampler avoids the region.
        if params.get("ideal_load", 6) > params.get("max_orders_per_rider", 12):
            return 0.0
        return self._predict_quality(context, params)
    try:
        study = optuna.create_study(
            study_name="hypertuner_v1",
            storage=f"sqlite:///{_STUDY_DB_PATH}",
            direction="maximize",
            load_if_exists=True,
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        best = study.best_trial
        # Keep the 10 best completed trials (failed trials have value None).
        self._top_trials = [
            {"params": t.params, "score": t.value}
            for t in sorted(study.trials, key=lambda x: x.value or 0, reverse=True)[:10]
            if t.value is not None
        ]
        # Restrict to known search-space keys in case the persisted study
        # contains parameters from an older search space.
        return {k: best.params[k] for k in SEARCH_SPACE if k in best.params}, best.value
    except Exception as e:
        logger.error(f"[Hypertuner] Optuna single-obj failed: {e}", exc_info=True)
        return None, 0.0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Optuna - Multi Objective (quality + latency, NSGA-II)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _optuna_search_multi(
    self, context: Dict, n_trials: int
) -> Tuple[Optional[Dict], float, List[Dict]]:
    """Multi-objective NSGA-II search: maximize quality, minimize latency.

    Latency is a crude proxy (200 ms per second of solver time budget),
    not a measured value. The study persists in SQLite and warm-starts
    across runs.

    Returns:
        (best_params, best_quality, pareto_frontier) where the "best" point
        is the Pareto trial with the highest quality; (None, 0.0, []) on
        failure or when no Pareto trials exist.
    """
    def objective(trial):
        params = self._sample_params(trial)
        # Infeasible region: worst quality, worst latency.
        if params.get("ideal_load", 6) > params.get("max_orders_per_rider", 12):
            return 0.0, 99.0
        quality = self._predict_quality(context, params)
        # Latency proxy scales linearly with the allotted solver budget.
        latency_proxy = float(params.get("search_time_limit_seconds", 5)) * 200.0
        return quality, latency_proxy
    try:
        study = optuna.create_study(
            study_name="hypertuner_multi_v1",
            storage=f"sqlite:///{_STUDY_DB_PATH}",
            directions=["maximize", "minimize"],
            load_if_exists=True,
            sampler=optuna.samplers.NSGAIISampler(seed=42),
        )
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        # best_trials is the Pareto-optimal set for multi-objective studies.
        pareto = [
            {"params": t.params, "quality": t.values[0], "latency_proxy": t.values[1]}
            for t in study.best_trials
        ]
        if not pareto:
            return None, 0.0, []
        # Tie-break the frontier by raw quality.
        best_trial = max(pareto, key=lambda x: x["quality"])
        return (
            {k: best_trial["params"][k] for k in SEARCH_SPACE if k in best_trial["params"]},
            best_trial["quality"],
            pareto,
        )
    except Exception as e:
        logger.error(f"[Hypertuner] Optuna multi-obj failed: {e}", exc_info=True)
        return None, 0.0, []
|
||||
|
||||
def _sample_params(self, trial) -> Dict:
    """Draw one candidate hyperparameter dict from SEARCH_SPACE via *trial*."""
    sampled: Dict = {}
    for param_name, (param_kind, low, high) in SEARCH_SPACE.items():
        if param_kind == "float":
            sampled[param_name] = trial.suggest_float(param_name, low, high)
        elif param_kind == "int":
            sampled[param_name] = trial.suggest_int(param_name, int(low), int(high))
    return sampled
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Prediction
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _predict_quality(self, context: Dict, params: Dict) -> float:
|
||||
if self._model is None:
|
||||
return 0.0
|
||||
combined = {
|
||||
**context, **params,
|
||||
"rolling_avg_quality_5": context.get("rolling_avg_quality_5", 50.0),
|
||||
"quality_delta_10": context.get("quality_delta_10", 0.0),
|
||||
}
|
||||
row = []
|
||||
for col in ALL_FEATURE_COLS:
|
||||
try:
|
||||
row.append(float(combined.get(col, 0) or 0))
|
||||
except (TypeError, ValueError):
|
||||
row.append(0.0)
|
||||
is_peak = int(context.get("is_peak", 0))
|
||||
model = (self._peak_model if is_peak else self._offpeak_model) or self._model
|
||||
pred = float(model.predict(np.array([row], dtype=np.float32))[0])
|
||||
return max(0.0, min(pred, 100.0))
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Feature Importance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _compute_feature_importance(self) -> None:
    """Populate ``self._feature_importance`` as {feature: percent}, sorted descending.

    Two strategies, tried in order:
      1. SHAP mean-absolute values over a recent sample of training records
         (only when SHAP_AVAILABLE); any failure falls through silently.
      2. XGBoost booster f-scores (split counts) as a cheaper proxy; a
         failure here is logged as a warning, never raised.
    """
    if self._model is None:
        return  # nothing trained yet -- nothing to explain
    try:
        if SHAP_AVAILABLE:
            from ml_data_collector import get_collector
            # Background sample: at most the 200 most recent records.
            records = get_collector().get_training_data(min_records=1) or []
            records = self._add_lag_features(records[-200:])
            X, _ = self._prepare_data(records, ALL_FEATURE_COLS)
            if X is not None and len(X) > 0:
                explainer = shap.TreeExplainer(self._model)
                # Mean |SHAP| per feature column, normalized to percentages.
                shap_values = np.abs(explainer.shap_values(X)).mean(axis=0)
                total = max(shap_values.sum(), 1e-9)  # guard divide-by-zero
                self._feature_importance = dict(sorted(
                    {ALL_FEATURE_COLS[i]: round(float(shap_values[i] / total) * 100, 2)
                    for i in range(len(ALL_FEATURE_COLS))}.items(),
                    key=lambda x: x[1], reverse=True
                ))
                return
            # No usable sample was built: fall through to the f-score path.
    except Exception:
        # Best-effort: any SHAP problem silently falls back below.
        pass

    try:
        # Fallback: booster scores are keyed "f<idx>" into the feature list
        # (presumably an xgboost sklearn wrapper -- TODO confirm model type).
        scores = self._model.get_booster().get_fscore()
        total = max(sum(scores.values()), 1)
        self._feature_importance = dict(sorted(
            {ALL_FEATURE_COLS[int(k[1:])]: round(v / total * 100, 2)
            for k, v in scores.items()
            # Skip malformed keys and indices beyond the known feature list.
            if k.startswith("f") and k[1:].isdigit() and int(k[1:]) < len(ALL_FEATURE_COLS)
            }.items(),
            key=lambda x: x[1], reverse=True
        ))
    except Exception as e:
        logger.warning(f"[Hypertuner] Feature importance failed: {e}")
|
||||
|
||||
def get_feature_importance(self) -> Optional[Dict[str, float]]:
    """Return the cached {feature: percent} importance map, or None if not computed."""
    return self._feature_importance
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Context
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _get_current_context(self, records: List[Dict]) -> Dict:
|
||||
now = datetime.utcnow()
|
||||
recent = records[-20:]
|
||||
avg_orders = sum(r.get("num_orders", 0) for r in recent) / max(len(recent), 1)
|
||||
avg_riders = sum(r.get("num_riders", 0) for r in recent) / max(len(recent), 1)
|
||||
recent_scores = [float(r.get("quality_score", 0)) for r in recent]
|
||||
rolling_avg5 = sum(recent_scores[-5:]) / max(len(recent_scores[-5:]), 1)
|
||||
delta10 = (recent_scores[-1] - recent_scores[-11]) if len(recent_scores) >= 11 else 0.0
|
||||
return {
|
||||
"hour": now.hour,
|
||||
"day_of_week": now.weekday(),
|
||||
"is_peak": int(now.hour in (7, 8, 9, 12, 13, 18, 19, 20)),
|
||||
"num_orders": round(avg_orders),
|
||||
"num_riders": round(avg_riders),
|
||||
"rolling_avg_quality_5": round(rolling_avg5, 2),
|
||||
"quality_delta_10": round(delta10, 2),
|
||||
}
|
||||
|
||||
def _compute_baseline_stats(self, records: List[Dict]) -> Dict:
|
||||
scores = [float(r.get("quality_score", 0)) for r in records if r.get("quality_score")]
|
||||
if not scores:
|
||||
return {"avg_quality": 0.0, "best_quality": 0.0, "worst_quality": 0.0}
|
||||
return {
|
||||
"avg_quality": round(sum(scores) / len(scores), 2),
|
||||
"best_quality": round(max(scores), 2),
|
||||
"worst_quality": round(min(scores), 2),
|
||||
"sample_size": len(scores),
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Model Info
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_model_info(self) -> Dict[str, Any]:
    """Return a status snapshot of the tuner: models, features, validation, baseline.

    If no baseline was restored from a report, one is derived on the fly
    from the collector's training data (best effort; failures are ignored).
    """
    baseline = self._latest_baseline
    if baseline is None:
        try:
            from ml_data_collector import get_collector
            history = get_collector().get_training_data(min_records=1)
            if history:
                baseline = self._compute_baseline_stats(history)
        except Exception:
            pass  # baseline stays None; snapshot is still useful

    space_desc = {
        name: {"type": spec[0], "low": spec[1], "high": spec[2]}
        for name, spec in SEARCH_SPACE.items()
    }
    trained_at = self._model_trained_at
    return {
        "model_trained": self._model is not None,
        "trained_at": trained_at.isoformat() if trained_at else None,
        "training_rows": self._training_rows,
        "peak_model_trained": self._peak_model is not None,
        "offpeak_model_trained": self._offpeak_model is not None,
        "features": ALL_FEATURE_COLS,
        "validation": self._latest_validation,
        "baseline": baseline,
        "search_space": space_desc,
        "feature_importance": self._feature_importance,
        "top_trials": self._top_trials[:10],
        "pareto_frontier_size": len(self._pareto_frontier),
    }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Report I/O
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _save_report(self, best_params, best_score, training_rows,
                 n_trials, cv_results, baseline_stats) -> None:
    """Persist a tuning-run summary as a timestamped JSON file in _REPORT_DIR.

    Fix: ``datetime.utcnow()`` was called twice -- once for the report's
    ``timestamp`` field and once for the filename -- so the two could
    disagree when the clock ticked between calls. The time is now captured
    once and reused, keeping filename and payload consistent (which
    ``_load_latest_report`` relies on for ordering).

    All failures are swallowed and logged: report persistence is best-effort
    and must never break a tuning run.
    """
    try:
        os.makedirs(_REPORT_DIR, exist_ok=True)
        stamp = datetime.utcnow()
        report = {
            "timestamp": stamp.isoformat(),
            "training_rows": training_rows,
            "n_trials": n_trials,
            "best_predicted_quality": round(best_score, 2),
            "best_params": best_params,
            "validation": cv_results or {},
            "baseline_stats": baseline_stats or {},
            "feature_importance": self._feature_importance or {},
            "top_trials": self._top_trials[:10],
            "pareto_frontier": self._pareto_frontier[:20],
        }
        path = os.path.join(_REPORT_DIR, f"tuning_{stamp.strftime('%Y%m%d_%H%M%S')}.json")
        with open(path, "w") as f:
            json.dump(report, f, indent=2)
        logger.info(f"[Hypertuner] Report -> {path}")
    except Exception as e:
        logger.warning(f"[Hypertuner] Report save failed: {e}")
|
||||
|
||||
def _load_latest_report(self) -> None:
    """Restore tuner state from the newest JSON report in _REPORT_DIR, if any.

    Reads validation results, baseline stats, training-row count, feature
    importance, top trials, Pareto frontier, and the trained-at timestamp.
    Missing directory/files are a silent no-op; parse errors only warn.
    """
    try:
        if not os.path.isdir(_REPORT_DIR):
            return
        candidates = [name for name in os.listdir(_REPORT_DIR) if name.endswith(".json")]
        if not candidates:
            return
        # Filenames embed a sortable UTC timestamp, so the lexicographic
        # maximum is the most recent report.
        newest = max(candidates)
        with open(os.path.join(_REPORT_DIR, newest)) as fh:
            snapshot = json.load(fh)
        self._latest_validation = snapshot.get("validation")
        self._latest_baseline = snapshot.get("baseline_stats")
        self._training_rows = snapshot.get("training_rows", 0)
        self._feature_importance = snapshot.get("feature_importance")
        self._top_trials = snapshot.get("top_trials", [])
        self._pareto_frontier = snapshot.get("pareto_frontier", [])
        stamp = snapshot.get("timestamp")
        if stamp:
            self._model_trained_at = datetime.fromisoformat(stamp)
        logger.info(f"[Hypertuner] Restored state from {newest}")
    except Exception as e:
        logger.warning(f"[Hypertuner] Load latest report failed: {e}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level singleton
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Process-wide tuner instance, created lazily on first access.
_tuner: Optional[MLHypertuner] = None


def get_hypertuner() -> MLHypertuner:
    """Return the module-level MLHypertuner singleton, creating it on first use.

    NOTE(review): not guarded by a lock, so concurrent first calls could
    construct two instances -- confirm callers initialize from one thread.
    """
    global _tuner
    if _tuner is None:
        _tuner = MLHypertuner()
    return _tuner
|
||||
Reference in New Issue
Block a user