Files
mlflow/mlflow_utils/experiment_comparison.py
Billy D. 2df3f27af7 feat: Add MLflow integration utilities
- client: Connection management and helpers
- tracker: General experiment tracking
- inference_tracker: Async metrics for NATS handlers
- model_registry: Model registration with KServe metadata
- kfp_components: Kubeflow Pipeline components
- experiment_comparison: Run comparison tools
- cli: Command-line interface
2026-02-01 20:43:13 -05:00

665 lines
20 KiB
Python

"""
Experiment Comparison and Analysis Utilities
Provides tools for comparing model versions, querying experiments,
and making data-driven decisions about model promotion to production.
Features:
- Compare multiple runs/experiments side by side
- Query experiments by tags, metrics, or parameters
- Analyze inference metrics from NATS handlers
- Generate promotion recommendations
- Export comparison reports
Usage:
from mlflow_utils.experiment_comparison import (
ExperimentAnalyzer,
compare_runs,
get_best_run,
promotion_recommendation,
)
analyzer = ExperimentAnalyzer("chat-inference")
# Compare last N runs
comparison = analyzer.compare_recent_runs(n=5)
# Find best performing model
best = analyzer.get_best_run(metric="total_latency_mean", minimize=True)
# Get promotion recommendation
rec = analyzer.promotion_recommendation(
model_name="whisper-finetuned",
min_accuracy=0.9,
max_latency_p95=2.0
)
"""
import json
import logging
import os
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List, Tuple, Union

import mlflow
from mlflow.entities import Experiment, Run, ViewType
from mlflow.tracking import MlflowClient

from .client import get_mlflow_client, MLflowConfig
logger = logging.getLogger(__name__)
@dataclass
class RunComparison:
"""Comparison result for multiple MLflow runs."""
run_ids: List[str]
experiment_name: str
# Metric comparisons (metric_name -> {run_id -> value})
metrics: Dict[str, Dict[str, float]] = field(default_factory=dict)
# Parameter differences
params: Dict[str, Dict[str, str]] = field(default_factory=dict)
# Run metadata
run_names: Dict[str, str] = field(default_factory=dict)
start_times: Dict[str, datetime] = field(default_factory=dict)
durations: Dict[str, float] = field(default_factory=dict)
# Best performers by metric
best_by_metric: Dict[str, str] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"run_ids": self.run_ids,
"experiment_name": self.experiment_name,
"metrics": self.metrics,
"params": self.params,
"run_names": self.run_names,
"best_by_metric": self.best_by_metric,
}
def summary_table(self) -> str:
"""Generate a text summary table of the comparison."""
if not self.run_ids:
return "No runs to compare"
lines = []
lines.append(f"Experiment: {self.experiment_name}")
lines.append(f"Comparing {len(self.run_ids)} runs")
lines.append("")
# Header
header = ["Metric"] + [self.run_names.get(rid, rid[:8]) for rid in self.run_ids]
lines.append(" | ".join(header))
lines.append("-" * (len(lines[-1]) + 10))
# Metrics
for metric_name, values in sorted(self.metrics.items()):
row = [metric_name]
for run_id in self.run_ids:
value = values.get(run_id)
if value is not None:
row.append(f"{value:.4f}")
else:
row.append("N/A")
lines.append(" | ".join(row))
return "\n".join(lines)
@dataclass
class PromotionRecommendation:
    """Outcome of a promote/hold decision for a candidate model."""

    model_name: str
    version: Optional[int]
    recommended: bool
    # Human-readable pass/fail explanations, one per evaluated criterion.
    reasons: List[str]
    metrics_summary: Dict[str, float]
    comparison_with_production: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the recommendation to a plain dictionary."""
        field_names = (
            "model_name",
            "version",
            "recommended",
            "reasons",
            "metrics_summary",
            "comparison_with_production",
        )
        return {name: getattr(self, name) for name in field_names}
class ExperimentAnalyzer:
    """
    Analyze MLflow experiments for model comparison and promotion decisions.

    Example:
        analyzer = ExperimentAnalyzer("chat-inference")

        # Get metrics summary for last 24 hours
        summary = analyzer.get_metrics_summary(hours=24)

        # Compare models by accuracy
        best = analyzer.get_best_run(metric="eval.accuracy", minimize=False)

        # Analyze inference latency trends
        trends = analyzer.get_metric_trends("total_latency_mean", days=7)
    """

    # String names accepted by search_runs() mapped onto MLflow's ViewType enum.
    _VIEW_TYPES = {
        "ACTIVE_ONLY": ViewType.ACTIVE_ONLY,
        "DELETED_ONLY": ViewType.DELETED_ONLY,
        "ALL": ViewType.ALL,
    }

    def __init__(
        self,
        experiment_name: str,
        tracking_uri: Optional[str] = None,
    ):
        """
        Initialize the experiment analyzer.

        Args:
            experiment_name: Name of the MLflow experiment to analyze
            tracking_uri: Override default tracking URI
        """
        self.experiment_name = experiment_name
        self.tracking_uri = tracking_uri
        self.client = get_mlflow_client(tracking_uri=tracking_uri)
        # Lazily fetched and cached; see the `experiment` property.
        self._experiment: Optional[Experiment] = None

    @property
    def experiment(self) -> Optional[Experiment]:
        """Get the experiment object, fetching and caching it on first access."""
        if self._experiment is None:
            self._experiment = self.client.get_experiment_by_name(self.experiment_name)
        return self._experiment

    def search_runs(
        self,
        filter_string: str = "",
        order_by: Optional[List[str]] = None,
        max_results: int = 100,
        run_view_type: str = "ACTIVE_ONLY",
    ) -> List[Run]:
        """
        Search for runs matching criteria.

        Args:
            filter_string: MLflow search filter (e.g., "metrics.accuracy > 0.9")
            order_by: List of order clauses (e.g., ["metrics.accuracy DESC"])
            max_results: Maximum runs to return
            run_view_type: ACTIVE_ONLY, DELETED_ONLY, or ALL
                (unrecognized values fall back to ACTIVE_ONLY)

        Returns:
            List of matching Run objects, or [] if the experiment doesn't exist
        """
        if not self.experiment:
            logger.warning(f"Experiment '{self.experiment_name}' not found")
            return []
        # Bug fix: run_view_type was previously accepted but never forwarded,
        # so DELETED_ONLY / ALL were silently treated as ACTIVE_ONLY.
        view_type = self._VIEW_TYPES.get(run_view_type, ViewType.ACTIVE_ONLY)
        runs = self.client.search_runs(
            experiment_ids=[self.experiment.experiment_id],
            filter_string=filter_string,
            run_view_type=view_type,
            order_by=order_by or ["start_time DESC"],
            max_results=max_results,
        )
        return runs

    def get_recent_runs(
        self,
        n: int = 10,
        hours: Optional[int] = None,
    ) -> List[Run]:
        """
        Get the most recent runs, newest first.

        Args:
            n: Number of runs to return
            hours: Only include runs from the last N hours

        Returns:
            List of Run objects
        """
        filter_string = ""
        if hours:
            # MLflow stores start_time as epoch milliseconds.
            cutoff = datetime.now() - timedelta(hours=hours)
            cutoff_ms = int(cutoff.timestamp() * 1000)
            filter_string = f"attributes.start_time >= {cutoff_ms}"
        return self.search_runs(
            filter_string=filter_string,
            order_by=["start_time DESC"],
            max_results=n,
        )

    def compare_runs(
        self,
        run_ids: Optional[List[str]] = None,
        n_recent: int = 5,
    ) -> RunComparison:
        """
        Compare multiple runs side by side.

        Args:
            run_ids: Specific run IDs to compare, or None for recent runs
            n_recent: If run_ids is None, compare this many recent runs

        Returns:
            RunComparison object with detailed comparison
        """
        if run_ids:
            runs = [self.client.get_run(rid) for rid in run_ids]
        else:
            runs = self.get_recent_runs(n=n_recent)

        comparison = RunComparison(
            run_ids=[r.info.run_id for r in runs],
            experiment_name=self.experiment_name,
        )

        # Collect all metrics/params and per-run metadata.
        all_metrics: Dict[str, Dict[str, float]] = defaultdict(dict)
        for run in runs:
            run_id = run.info.run_id
            comparison.run_names[run_id] = run.info.run_name or run_id[:8]
            comparison.start_times[run_id] = datetime.fromtimestamp(
                run.info.start_time / 1000
            )
            if run.info.end_time:
                # Duration in seconds (timestamps are epoch milliseconds).
                comparison.durations[run_id] = (
                    run.info.end_time - run.info.start_time
                ) / 1000
            for key, value in run.data.metrics.items():
                all_metrics[key][run_id] = value
            for key, value in run.data.params.items():
                comparison.params.setdefault(key, {})[run_id] = value
        comparison.metrics = dict(all_metrics)

        # Find the best performer per metric. Heuristic: names containing
        # latency/error/loss/time are minimized, everything else maximized.
        for metric_name, values in all_metrics.items():
            if not values:
                continue
            minimize = any(
                term in metric_name.lower()
                for term in ["latency", "error", "loss", "time"]
            )
            chooser = min if minimize else max
            comparison.best_by_metric[metric_name] = chooser(
                values.keys(), key=lambda k: values[k]
            )
        return comparison

    def get_best_run(
        self,
        metric: str,
        minimize: bool = True,
        filter_string: str = "",
        max_results: int = 100,
    ) -> Optional[Run]:
        """
        Get the best run by a specific metric.

        Args:
            metric: Metric name to optimize
            minimize: If True, find minimum; if False, find maximum
            filter_string: Additional filter criteria
            max_results: Maximum runs to consider

        Returns:
            Best Run object, or None if no runs found
        """
        direction = "ASC" if minimize else "DESC"
        # Bug fix: metric names with dots/special chars (e.g. "eval.accuracy")
        # must be backtick-quoted in MLflow search/order-by syntax.
        metric_ref = metric if metric.isidentifier() else f"`{metric}`"
        runs = self.search_runs(
            filter_string=filter_string,
            order_by=[f"metrics.{metric_ref} {direction}"],
            max_results=max_results,
        )
        # Runs missing the metric sort unpredictably; drop them explicitly.
        runs_with_metric = [r for r in runs if metric in r.data.metrics]
        return runs_with_metric[0] if runs_with_metric else None

    def get_metrics_summary(
        self,
        hours: Optional[int] = None,
        metrics: Optional[List[str]] = None,
    ) -> Dict[str, Dict[str, float]]:
        """
        Get summary statistics for metrics.

        Args:
            hours: Only include runs from the last N hours
            metrics: Specific metrics to summarize (None for all)

        Returns:
            Dict mapping metric names to {mean, min, max, count}, plus
            {stdev, median} when at least two samples exist
        """
        import statistics

        runs = self.get_recent_runs(n=1000, hours=hours)

        # Gather per-metric samples across all runs.
        metric_values: Dict[str, List[float]] = defaultdict(list)
        for run in runs:
            for key, value in run.data.metrics.items():
                if metrics is None or key in metrics:
                    metric_values[key].append(value)

        summary = {}
        for metric_name, values in metric_values.items():
            if not values:
                continue
            summary[metric_name] = {
                "mean": statistics.mean(values),
                "min": min(values),
                "max": max(values),
                "count": len(values),
            }
            if len(values) >= 2:
                # stdev requires >= 2 samples.
                summary[metric_name]["stdev"] = statistics.stdev(values)
                summary[metric_name]["median"] = statistics.median(values)
        return summary

    def get_metric_trends(
        self,
        metric: str,
        days: int = 7,
        granularity_hours: int = 1,
    ) -> List[Dict[str, Any]]:
        """
        Get metric trends over time, bucketed by wall-clock window.

        Args:
            metric: Metric name to track
            days: Number of days to look back
            granularity_hours: Time bucket size in hours

        Returns:
            List of {timestamp, mean, min, max, count} dicts in time order
            (stdev included for buckets with >= 2 samples)
        """
        import statistics

        runs = self.get_recent_runs(n=10000, hours=days * 24)

        # Group metric values into fixed-size time buckets (epoch-ms aligned).
        buckets: Dict[int, List[float]] = defaultdict(list)
        bucket_size_ms = granularity_hours * 3600 * 1000
        for run in runs:
            if metric not in run.data.metrics:
                continue
            bucket = (run.info.start_time // bucket_size_ms) * bucket_size_ms
            buckets[bucket].append(run.data.metrics[metric])

        trends = []
        for bucket_ts, values in sorted(buckets.items()):
            trend = {
                "timestamp": datetime.fromtimestamp(bucket_ts / 1000).isoformat(),
                "count": len(values),
                "mean": statistics.mean(values),
                "min": min(values),
                "max": max(values),
            }
            if len(values) >= 2:
                trend["stdev"] = statistics.stdev(values)
            trends.append(trend)
        return trends

    def get_runs_by_tag(
        self,
        tag_key: str,
        tag_value: str,
        max_results: int = 100,
    ) -> List[Run]:
        """
        Get runs with a specific tag.

        Args:
            tag_key: Tag key to filter by
            tag_value: Tag value to match
            max_results: Maximum runs to return

        Returns:
            List of matching Run objects
        """
        return self.search_runs(
            filter_string=f"tags.{tag_key} = '{tag_value}'",
            max_results=max_results,
        )

    def get_model_runs(
        self,
        model_name: str,
        max_results: int = 100,
    ) -> List[Run]:
        """
        Get runs for a specific model.

        Args:
            model_name: Model name to filter by
            max_results: Maximum runs to return

        Returns:
            List of matching Run objects
        """
        # Try the tag convention first (backticks because the key has a dot),
        # then fall back to a params-based convention.
        runs = self.search_runs(
            filter_string=f"tags.`model.name` = '{model_name}'",
            max_results=max_results,
        )
        if not runs:
            runs = self.search_runs(
                filter_string=f"params.model_name = '{model_name}'",
                max_results=max_results,
            )
        return runs
def compare_experiments(
    experiment_names: List[str],
    metric: str,
    tracking_uri: Optional[str] = None,
) -> Dict[str, Dict[str, float]]:
    """
    Compare one metric's summary statistics across multiple experiments.

    Args:
        experiment_names: Names of experiments to compare
        metric: Metric to compare
        tracking_uri: Override default tracking URI

    Returns:
        Dict mapping experiment names to metric statistics; experiments
        with no data for the metric are omitted
    """
    stats_by_experiment: Dict[str, Dict[str, float]] = {}
    for name in experiment_names:
        analyzer = ExperimentAnalyzer(name, tracking_uri=tracking_uri)
        metric_stats = analyzer.get_metrics_summary(metrics=[metric]).get(metric)
        if metric_stats is not None:
            stats_by_experiment[name] = metric_stats
    return stats_by_experiment
def promotion_recommendation(
    model_name: str,
    experiment_name: str,
    criteria: Dict[str, Tuple[str, float]],
    tracking_uri: Optional[str] = None,
) -> PromotionRecommendation:
    """
    Generate a recommendation for model promotion.

    Args:
        model_name: Name of the model to evaluate
        experiment_name: Experiment containing evaluation runs
        criteria: Dict of {metric: (comparison, threshold)}
            comparison is one of: ">=", "<=", ">", "<"
            e.g., {"eval.accuracy": (">=", 0.9), "total_latency_p95": ("<=", 2.0)}
        tracking_uri: Override default tracking URI

    Returns:
        PromotionRecommendation with decision and reasons; recommended is
        True only if every criterion was evaluated and passed
    """
    analyzer = ExperimentAnalyzer(experiment_name, tracking_uri=tracking_uri)

    runs = analyzer.get_model_runs(model_name, max_results=10)
    if not runs:
        return PromotionRecommendation(
            model_name=model_name,
            version=None,
            recommended=False,
            reasons=["No runs found for this model"],
            metrics_summary={},
        )

    # get_model_runs orders by start_time DESC, so runs[0] is the newest.
    latest_run = runs[0]
    metrics = latest_run.data.metrics

    reasons: List[str] = []
    passed = True
    comparisons = {
        ">=": lambda a, b: a >= b,
        "<=": lambda a, b: a <= b,
        ">": lambda a, b: a > b,
        "<": lambda a, b: a < b,
    }
    for metric_name, (comparison, threshold) in criteria.items():
        if metric_name not in metrics:
            reasons.append(f"Metric '{metric_name}' not found")
            passed = False
            continue
        value = metrics[metric_name]
        compare_fn = comparisons.get(comparison)
        if compare_fn is None:
            # Bug fix: an unrecognized operator previously left `passed`
            # untouched, so an unevaluable criterion could still yield a
            # positive recommendation. Fail closed instead.
            reasons.append(f"Invalid comparison operator: {comparison}")
            passed = False
            continue
        if compare_fn(value, threshold):
            reasons.append(f"{metric_name}: {value:.4f} {comparison} {threshold}")
        else:
            reasons.append(f"{metric_name}: {value:.4f} NOT {comparison} {threshold}")
            passed = False

    # Extract the model version from tags when present and numeric.
    version = None
    if "mlflow.version" in latest_run.data.tags:
        try:
            version = int(latest_run.data.tags["mlflow.version"])
        except ValueError:
            pass

    return PromotionRecommendation(
        model_name=model_name,
        version=version,
        recommended=passed,
        reasons=reasons,
        metrics_summary=dict(metrics),
    )
def get_inference_performance_report(
    service_name: str = "chat-handler",
    hours: int = 24,
    tracking_uri: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Generate an inference performance report for a service.

    Args:
        service_name: Service name (chat-handler, voice-assistant)
        hours: Hours of data to analyze
        tracking_uri: Override default tracking URI

    Returns:
        Performance report dictionary with latency, throughput, RAG usage,
        and error-rate sections
    """
    # NOTE(review): all hyphens are stripped from the service name before the
    # "-inference" suffix ("chat-handler" -> "chathandler-inference"), while
    # the module docstring refers to "chat-inference" — confirm this matches
    # the experiment naming used on the tracking side.
    experiment_name = f"{service_name.replace('-', '')}-inference"
    analyzer = ExperimentAnalyzer(experiment_name, tracking_uri=tracking_uri)
    summary = analyzer.get_metrics_summary(hours=hours)

    report: Dict[str, Any] = {
        "service": service_name,
        "period_hours": hours,
        "generated_at": datetime.now().isoformat(),
        "latency": {},
        "throughput": {},
        "rag": {},
        "errors": {},
    }

    # Latency section: copy only the key latency metrics that were recorded.
    latency_keys = (
        "total_latency_mean",
        "total_latency_p50",
        "total_latency_p95",
        "llm_latency_mean",
        "embedding_latency_mean",
        "rag_search_latency_mean",
    )
    report["latency"] = {k: summary[k] for k in latency_keys if k in summary}

    # Throughput section.
    if "total_requests" in summary:
        report["throughput"]["total_requests"] = summary["total_requests"]["mean"]

    # RAG usage section.
    rag_keys = (
        "rag_enabled_pct",
        "rag_documents_retrieved_mean",
        "rag_documents_used_mean",
    )
    report["rag"] = {k: summary[k] for k in rag_keys if k in summary}

    # Error-rate section.
    if "error_rate" in summary:
        report["errors"]["error_rate_pct"] = summary["error_rate"]["mean"]

    return report