feat: Add MLflow integration utilities

- client: Connection management and helpers
- tracker: General experiment tracking
- inference_tracker: Async metrics for NATS handlers
- model_registry: Model registration with KServe metadata
- kfp_components: Kubeflow Pipeline components
- experiment_comparison: Run comparison tools
- cli: Command-line interface
This commit is contained in:
2026-02-01 20:43:13 -05:00
parent 944bd8bc06
commit 2df3f27af7
10 changed files with 3315 additions and 1 deletions

209
mlflow_utils/client.py Normal file
View File

@@ -0,0 +1,209 @@
"""
MLflow Client Configuration and Initialization
Provides a configured MLflow client for all integrations in the LLM workflows.
Supports both in-cluster and external access patterns.
"""
import os
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, Any
import mlflow
from mlflow.tracking import MlflowClient
logger = logging.getLogger(__name__)
@dataclass
class MLflowConfig:
    """Runtime configuration for MLflow integrations.

    Every field is resolved from the environment at instantiation time
    (via ``default_factory``), so a config object always reflects the
    environment as it exists when the object is created.
    """

    # In-cluster tracking server endpoint.
    tracking_uri: str = field(
        default_factory=lambda: os.getenv(
            "MLFLOW_TRACKING_URI",
            "http://mlflow.mlflow.svc.cluster.local:80",
        )
    )
    # Endpoint for access from outside the cluster.
    external_uri: str = field(
        default_factory=lambda: os.getenv(
            "MLFLOW_EXTERNAL_URI",
            "https://mlflow.lab.daviestechlabs.io",
        )
    )
    # Artifact root on the NFS-backed PVC mount.
    artifact_location: str = field(
        default_factory=lambda: os.getenv(
            "MLFLOW_ARTIFACT_LOCATION",
            "/mlflow/artifacts",
        )
    )
    # Experiment used when callers do not name one explicitly.
    default_experiment: str = field(
        default_factory=lambda: os.getenv(
            "MLFLOW_DEFAULT_EXPERIMENT",
            "llm-workflows",
        )
    )
    # Name of the service emitting runs (shared with the OTel config).
    service_name: str = field(
        default_factory=lambda: os.getenv(
            "OTEL_SERVICE_NAME",
            "unknown-service",
        )
    )
    # Tags stamped on every run; caller-supplied entries win over env-derived ones.
    default_tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Merge environment-derived tags underneath any caller-supplied tags."""
        merged = {
            "environment": os.getenv("DEPLOYMENT_ENV", "production"),
            "hostname": os.getenv("HOSTNAME", "unknown"),
            "namespace": os.getenv("OTEL_SERVICE_NAMESPACE", "ai-ml"),
        }
        merged.update(self.default_tags)
        self.default_tags = merged
def get_tracking_uri(external: bool = False) -> str:
    """
    Resolve the MLflow tracking URI from the current environment.

    Args:
        external: If True, return the external URI for outside-cluster access

    Returns:
        The MLflow tracking URI string
    """
    cfg = MLflowConfig()
    if external:
        return cfg.external_uri
    return cfg.tracking_uri
def get_mlflow_client(
    tracking_uri: Optional[str] = None,
    configure_global: bool = True
) -> MlflowClient:
    """
    Build an MlflowClient pointed at the configured tracking server.

    Args:
        tracking_uri: Override the default tracking URI
        configure_global: If True, also set mlflow.set_tracking_uri()

    Returns:
        Configured MlflowClient instance
    """
    # Falsy overrides (None, "") fall back to the environment-derived URI.
    resolved = tracking_uri or get_tracking_uri()
    if configure_global:
        # Also configure the module-level mlflow API (mlflow.start_run etc.).
        mlflow.set_tracking_uri(resolved)
        logger.info(f"MLflow tracking URI set to: {resolved}")
    return MlflowClient(tracking_uri=resolved)
def ensure_experiment(
    experiment_name: str,
    artifact_location: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None
) -> str:
    """
    Ensure an experiment exists, creating it if necessary.

    Safe under concurrent callers: if another process creates the
    experiment between our existence check and the create call, the
    resulting MlflowException is resolved by re-fetching the experiment.

    Args:
        experiment_name: Name of the experiment
        artifact_location: Override default artifact location
        tags: Additional tags for the experiment

    Returns:
        The experiment ID

    Raises:
        mlflow.exceptions.MlflowException: If creation fails for a reason
            other than the experiment already existing.
    """
    config = MLflowConfig()
    client = get_mlflow_client()
    # Fast path: experiment already exists.
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is not None:
        experiment_id = experiment.experiment_id
        logger.debug(f"Using existing experiment '{experiment_name}' with ID: {experiment_id}")
        return experiment_id
    # Create it, defaulting artifacts to a per-experiment subdirectory.
    artifact_loc = artifact_location or f"{config.artifact_location}/{experiment_name}"
    try:
        experiment_id = client.create_experiment(
            name=experiment_name,
            artifact_location=artifact_loc,
            tags=tags or {}
        )
        logger.info(f"Created experiment '{experiment_name}' with ID: {experiment_id}")
    except mlflow.exceptions.MlflowException:
        # Lost a creation race: another process created the experiment
        # between our check and the create call. Re-fetch; if it still
        # does not exist the failure was genuine, so re-raise.
        experiment = client.get_experiment_by_name(experiment_name)
        if experiment is None:
            raise
        experiment_id = experiment.experiment_id
        logger.debug(f"Using existing experiment '{experiment_name}' with ID: {experiment_id}")
    return experiment_id
def get_or_create_registered_model(
    model_name: str,
    description: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None
) -> str:
    """
    Get or create a registered model in the Model Registry.

    Only a RESOURCE_DOES_NOT_EXIST error triggers creation; any other
    MlflowException (connectivity, auth, server errors) is re-raised so
    real failures are not masked by a doomed create attempt.

    Args:
        model_name: Name of the model to register
        description: Model description
        tags: Tags for the model

    Returns:
        The registered model name

    Raises:
        mlflow.exceptions.MlflowException: For any registry error other
            than the model not existing yet.
    """
    client = get_mlflow_client()
    try:
        client.get_registered_model(model_name)
        logger.debug(f"Using existing registered model: {model_name}")
    except mlflow.exceptions.MlflowException as exc:
        # Distinguish "model missing" from infrastructure errors; the
        # original blanket except would attempt creation on e.g. a
        # connection failure and obscure the root cause.
        if getattr(exc, "error_code", None) != "RESOURCE_DOES_NOT_EXIST":
            raise
        client.create_registered_model(
            name=model_name,
            description=description or f"Model for {model_name}",
            tags=tags or {}
        )
        logger.info(f"Created registered model: {model_name}")
    return model_name
def health_check() -> Dict[str, Any]:
    """
    Check MLflow server connectivity and return status.

    Returns:
        Dictionary with health status information
    """
    config = MLflowConfig()
    status: Dict[str, Any] = {
        "tracking_uri": config.tracking_uri,
        "external_uri": config.external_uri,
        "connected": False,
        "error": None,
    }
    try:
        # Avoid mutating the global mlflow tracking URI for a mere probe.
        client = get_mlflow_client(configure_global=False)
        # A cheap experiment listing doubles as a connectivity check.
        found = client.search_experiments(max_results=1)
    except Exception as e:
        status["error"] = str(e)
        logger.error(f"MLflow health check failed: {e}")
    else:
        status["connected"] = True
        status["experiment_count"] = len(found)
    return status