Files
kubeflow/evaluation_pipeline.py
Billy D. cee21f124c feat: add MLflow tracking to evaluation pipeline
- Add create_mlflow_run and log_evaluation_to_mlflow KFP components
- Log accuracy, correct/total counts, pass/fail to MLflow experiment
- Upload evaluation_results.json as artifact
- Wire MLflow run into pipeline DAG before NATS publish
2026-02-12 06:15:13 -05:00

314 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Model Evaluation Pipeline - Kubeflow Pipelines SDK
Evaluates fine-tuned models against benchmarks.
Integrates with Argo Workflows for automated model deployment.
Logs evaluation results to MLflow for experiment tracking.
Usage:
pip install kfp==2.12.1 mlflow boto3 psycopg2-binary
python evaluation_pipeline.py
"""
from typing import Any, Dict, List, NamedTuple, Optional

from kfp import compiler
from kfp import dsl
# ---- MLflow KFP components (inline to avoid external dep) ----
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_mlflow_run(
    experiment_name: str,
    run_name: str,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
    tags: Optional[Dict[str, str]] = None,
    params: Optional[Dict[str, str]] = None,
) -> NamedTuple('RunInfo', [('run_id', str), ('experiment_id', str)]):
    """Create an MLflow run and return its identifiers.

    The run is opened, tagged, optionally seeded with params, and then
    immediately closed; downstream components log against it by run_id
    and re-terminate it when evaluation completes.

    Args:
        experiment_name: MLflow experiment to create or reuse.
        run_name: Display name for the new run.
        mlflow_tracking_uri: MLflow tracking server URL.
        tags: Extra tags merged over the defaults (may override them).
        params: Optional parameters to log on the run.

    Returns:
        RunInfo(run_id, experiment_id) for downstream logging.
    """
    import os

    import mlflow
    from mlflow.tracking import MlflowClient
    from collections import namedtuple

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()
    # Reuse the experiment if it already exists; otherwise create it with a
    # deterministic artifact location.
    exp = client.get_experiment_by_name(experiment_name)
    experiment_id = exp.experiment_id if exp else client.create_experiment(
        name=experiment_name, artifact_location=f"/mlflow/artifacts/{experiment_name}"
    )
    default_tags = {
        "pipeline.type": "evaluation",
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }
    if tags:
        default_tags.update(tags)
    run = mlflow.start_run(experiment_id=experiment_id, run_name=run_name, tags=default_tags)
    try:
        if params:
            mlflow.log_params(params)
        run_id = run.info.run_id
    finally:
        # Always close the run: a failure in log_params must not leave it
        # stuck in RUNNING state on the tracking server.
        mlflow.end_run()
    RunInfo = namedtuple('RunInfo', ['run_id', 'experiment_id'])
    return RunInfo(run_id, experiment_id)
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_evaluation_to_mlflow(
    run_id: str,
    model_name: str,
    dataset_name: str,
    metrics: dict,
    results: list,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Record an evaluation outcome on an existing MLflow run.

    Logs model/dataset params, every metric under an ``eval.`` prefix,
    the raw per-sample results as a JSON artifact, pass/fail tags, and
    finally marks the run FINISHED.

    Args:
        run_id: Id of the MLflow run created earlier in the pipeline.
        model_name: Model under evaluation.
        dataset_name: Benchmark dataset name.
        metrics: Numeric summary metrics (accuracy, correct, total, pass).
        results: Per-sample evaluation records to upload as an artifact.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        The same run_id, for chaining.
    """
    import json
    import tempfile

    import mlflow
    from mlflow.tracking import MlflowClient
    from pathlib import Path

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracker = MlflowClient()

    tracker.log_param(run_id, "eval.model_name", model_name)
    tracker.log_param(run_id, "eval.dataset", dataset_name)
    for metric_name, metric_value in metrics.items():
        # MLflow metrics must be floats; bools/ints coerce cleanly.
        tracker.log_metric(run_id, f"eval.{metric_name}", float(metric_value))

    if results:
        # Stage the JSON dump in a throwaway directory; log_artifact copies it
        # into the run's artifact store before the directory is removed.
        with tempfile.TemporaryDirectory() as scratch:
            dump_path = Path(scratch) / "evaluation_results.json"
            dump_path.write_text(json.dumps(results, indent=2))
            tracker.log_artifact(run_id, str(dump_path))

    # Prefer an explicit "pass" metric; fall back to the 70% accuracy bar.
    passed = metrics.get("pass", metrics.get("accuracy", 0) >= 0.7)
    tracker.set_tag(run_id, "eval.passed", str(passed))
    tracker.set_tag(run_id, "model.name", model_name)

    # Close out the run now that everything is recorded.
    tracker.set_terminated(run_id, status="FINISHED")
    return run_id
@dsl.component(
    base_image="python:3.13-slim"
)
def load_eval_dataset(
    dataset_name: str = "mmlu",
    subset: str = "test",
    limit: int = 100
) -> list:
    """Load evaluation dataset samples.

    Currently returns a small hard-coded multiple-choice test set; in
    production this would load from HuggingFace or S3 (at which point
    ``dataset_name`` and ``subset`` would select the data).

    Args:
        dataset_name: Dataset identifier (unused by the stub).
        subset: Dataset split (unused by the stub).
        limit: Maximum number of samples to return.

    Returns:
        A list of dicts with "question", "choices", and "answer" keys,
        where "answer" is the letter of the correct choice.
    """
    # NOTE: the previous version imported httpx/json (and pip-installed
    # httpx) without using them; the unused dependency has been dropped.
    test_samples = [
        {
            "question": "What is the capital of France?",
            "choices": ["London", "Berlin", "Paris", "Madrid"],
            "answer": "C"
        },
        {
            "question": "Which planet is known as the Red Planet?",
            "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
            "answer": "B"
        },
        {
            "question": "What is 2 + 2?",
            "choices": ["3", "4", "5", "6"],
            "answer": "B"
        }
    ]
    return test_samples[:limit]
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def run_inference(
    samples: list,
    model_endpoint: str,
    model_name: str = "default"
) -> list:
    """Run inference on evaluation samples.

    Builds a lettered multiple-choice prompt per sample, queries the
    OpenAI-compatible chat endpoint with temperature 0, and grades the
    first letter of the model's reply against the expected answer.

    Args:
        samples: Dicts with "question", "choices" (up to 26 entries),
            and "answer" (the expected letter).
        model_endpoint: Base URL of the inference service.
        model_name: Model identifier passed in the request body.

    Returns:
        A list of dicts with question, expected, predicted, and correct.
    """
    import string

    import httpx

    results = []
    with httpx.Client(timeout=120.0) as client:
        for sample in samples:
            # Generalized over the previous hard-coded 4 choices: label each
            # choice A, B, C, ... For 4-choice samples the prompt text is
            # byte-identical to the old version.
            letters = string.ascii_uppercase[:len(sample["choices"])]
            choices_block = "\n".join(
                f"{letter}) {choice}"
                for letter, choice in zip(letters, sample["choices"])
            )
            letter_list = ", ".join(letters[:-1]) + f", or {letters[-1]}"
            prompt = (
                "Answer the following multiple choice question.\n"
                f"Question: {sample['question']}\n"
                "Choices:\n"
                f"{choices_block}\n"
                f"Answer with just the letter ({letter_list}):"
            )
            response = client.post(
                f"{model_endpoint}/v1/chat/completions",
                json={
                    "model": model_name,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 10,
                    "temperature": 0
                }
            )
            # Fail loudly on HTTP errors instead of a confusing KeyError
            # when indexing into an error-response body.
            response.raise_for_status()
            result = response.json()
            answer = result["choices"][0]["message"]["content"].strip().upper()
            results.append({
                "question": sample["question"],
                "expected": sample["answer"],
                # "X" marks an empty reply so downstream grading never matches.
                "predicted": answer[0] if answer else "X",
                "correct": answer.startswith(sample["answer"])
            })
    return results
@dsl.component(
    base_image="python:3.13-slim"
)
def calculate_metrics(
    results: list
) -> dict:
    """Aggregate per-sample correctness flags into summary metrics.

    Args:
        results: Dicts each carrying a boolean "correct" field.

    Returns:
        Dict with accuracy (0 when results is empty), correct/total
        counts, and a boolean "pass" at the 70% accuracy threshold.
    """
    total = len(results)
    correct = len([r for r in results if r["correct"]])
    accuracy = 0 if total == 0 else correct / total
    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "pass": accuracy >= 0.7  # 70% threshold
    }
@dsl.component(
    base_image="python:3.13-slim",
    # nats-py is installed at image/component build time; the previous
    # version pip-installed it with subprocess on every run, which was slow,
    # needed network access at execution time, and declared an unused httpx.
    packages_to_install=["nats-py"]
)
def publish_results(
    metrics: dict,
    model_name: str,
    nats_url: str = "nats://nats.ai-ml.svc.cluster.local:4222"
) -> str:
    """Publish evaluation results to NATS.

    Emits one JSON message on ``ai.evaluation.results.<model_name>`` with
    the metrics and an overall passed/failed status.

    Args:
        metrics: Summary metrics; must contain a boolean "pass" key.
        model_name: Model identifier, also used in the subject name.
        nats_url: NATS server URL.

    Returns:
        The literal string "published".
    """
    import asyncio
    import json

    import nats

    async def publish():
        nc = await nats.connect(nats_url)
        try:
            await nc.publish(
                f"ai.evaluation.results.{model_name}",
                json.dumps({
                    "model": model_name,
                    "metrics": metrics,
                    "status": "passed" if metrics["pass"] else "failed"
                }).encode()
            )
        finally:
            # Close the connection even if publish fails, so the component
            # doesn't leak the socket or hang on shutdown.
            await nc.close()

    asyncio.run(publish())
    return "published"
@dsl.pipeline(
    name="model-evaluation-pipeline",
    description="Evaluate model performance on benchmarks"
)
def model_evaluation_pipeline(
    model_endpoint: str = "http://llm-draft.ai-ml.svc.cluster.local:8000",
    model_name: str = "default",
    dataset_name: str = "mmlu",
    sample_limit: int = 100
):
    """Evaluate a served model on a benchmark and record the outcome.

    DAG: create MLflow run -> load dataset -> run inference -> compute
    metrics -> log to MLflow -> publish to NATS (after MLflow logging).

    Args:
        model_endpoint: URL of the model inference endpoint.
        model_name: Name of the model being evaluated.
        dataset_name: Evaluation dataset to use.
        sample_limit: Maximum samples to evaluate.
    """
    # Open an MLflow run up front so downstream tasks can log against it.
    tracking_task = create_mlflow_run(
        experiment_name="model-evaluation",
        run_name=f"eval-{model_name}-{dataset_name}",
        params={
            "model_endpoint": model_endpoint,
            "model_name": model_name,
            "dataset_name": dataset_name,
            "sample_limit": str(sample_limit),
        },
    )

    # Dataset loading is deterministic, so cached outputs are safe to reuse.
    dataset_task = load_eval_dataset(
        dataset_name=dataset_name,
        limit=sample_limit,
    )
    dataset_task.set_caching_options(enable_caching=True)

    # Never cache inference: the model behind the endpoint may have changed.
    predict_task = run_inference(
        samples=dataset_task.output,
        model_endpoint=model_endpoint,
        model_name=model_name,
    )
    predict_task.set_caching_options(enable_caching=False)

    score_task = calculate_metrics(results=predict_task.output)

    # Persist metrics and raw results onto the MLflow run created above.
    record_task = log_evaluation_to_mlflow(
        run_id=tracking_task.outputs["run_id"],
        model_name=model_name,
        dataset_name=dataset_name,
        metrics=score_task.output,
        results=predict_task.output,
    )

    # Announce on NATS only once MLflow holds the complete record.
    announce_task = publish_results(
        metrics=score_task.output,
        model_name=model_name,
    )
    announce_task.after(record_task)
if __name__ == "__main__":
    # Compile the pipeline DAG into a YAML spec uploadable to Kubeflow.
    output_path = "evaluation_pipeline.yaml"
    compiler.Compiler().compile(model_evaluation_pipeline, output_path)
    print(f"Compiled: {output_path}")