#!/usr/bin/env python3
"""
Model Evaluation Pipeline - Kubeflow Pipelines SDK

Evaluates fine-tuned models against benchmarks. Integrates with Argo Workflows
for automated model deployment. Logs evaluation results to MLflow for
experiment tracking.

Usage:
    pip install kfp==2.12.1 mlflow boto3 psycopg2-binary
    python evaluation_pipeline.py
"""

from typing import Dict, List, Any, NamedTuple

from kfp import dsl
from kfp import compiler

# ---- MLflow KFP components (inline to avoid external dep) ----
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]


@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_mlflow_run(
    experiment_name: str,
    run_name: str,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
    tags: Dict[str, str] = None,
    params: Dict[str, str] = None,
) -> NamedTuple('RunInfo', [('run_id', str), ('experiment_id', str)]):
    """Create an MLflow run and return its run_id and experiment_id.

    Creates the experiment if it does not exist yet. Caller-supplied tags
    are merged over the defaults (caller wins on conflicts). The run is
    started only to register params/tags and is ended immediately; the
    downstream logging component re-attaches by run_id.
    """
    import os, mlflow
    from mlflow.tracking import MlflowClient
    from collections import namedtuple

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    exp = client.get_experiment_by_name(experiment_name)
    experiment_id = exp.experiment_id if exp else client.create_experiment(
        name=experiment_name,
        artifact_location=f"/mlflow/artifacts/{experiment_name}"
    )

    default_tags = {
        "pipeline.type": "evaluation",
        # KFP_RUN_ID is injected by the KFP runtime; "unknown" for local runs.
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }
    if tags:
        default_tags.update(tags)

    run = mlflow.start_run(
        experiment_id=experiment_id, run_name=run_name, tags=default_tags
    )
    try:
        if params:
            mlflow.log_params(params)
        run_id = run.info.run_id
    finally:
        # Always detach from the run so a log_params failure cannot leave
        # a dangling active run in this process.
        mlflow.end_run()

    RunInfo = namedtuple('RunInfo', ['run_id', 'experiment_id'])
    return RunInfo(run_id, experiment_id)


@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_evaluation_to_mlflow(
    run_id: str,
    model_name: str,
    dataset_name: str,
    metrics: dict,
    results: list,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Log evaluation metrics, params, and sample results to MLflow.

    Attaches to an existing run by ``run_id``, logs each metric under an
    ``eval.`` prefix, uploads the raw per-sample results as a JSON artifact,
    and terminates the run. Returns the run_id for chaining.
    """
    import json, tempfile, mlflow
    from mlflow.tracking import MlflowClient
    from pathlib import Path

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    client.log_param(run_id, "eval.model_name", model_name)
    client.log_param(run_id, "eval.dataset", dataset_name)
    for key, value in metrics.items():
        # MLflow metrics must be numeric; bools (e.g. "pass") become 0.0/1.0.
        client.log_metric(run_id, f"eval.{key}", float(value))

    if results:
        with tempfile.TemporaryDirectory() as tmpdir:
            path = Path(tmpdir) / "evaluation_results.json"
            path.write_text(json.dumps(results, indent=2))
            client.log_artifact(run_id, str(path))

    # Prefer the explicit pass flag; fall back to the 70% accuracy threshold
    # used by calculate_metrics.
    passed = metrics.get("pass", metrics.get("accuracy", 0) >= 0.7)
    client.set_tag(run_id, "eval.passed", str(passed))
    client.set_tag(run_id, "model.name", model_name)

    # End the run
    client.set_terminated(run_id, status="FINISHED")
    return run_id


@dsl.component(base_image="python:3.13-slim")
def load_eval_dataset(
    dataset_name: str = "mmlu",
    subset: str = "test",
    limit: int = 100
) -> list:
    """Load evaluation dataset samples.

    Currently returns a small built-in multiple-choice test set, truncated
    to ``limit`` samples. ``dataset_name`` and ``subset`` are accepted for
    forward compatibility; in production this would load from HuggingFace
    or S3.
    """
    test_samples = [
        {
            "question": "What is the capital of France?",
            "choices": ["London", "Berlin", "Paris", "Madrid"],
            "answer": "C"
        },
        {
            "question": "Which planet is known as the Red Planet?",
            "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
            "answer": "B"
        },
        {
            "question": "What is 2 + 2?",
            "choices": ["3", "4", "5", "6"],
            "answer": "B"
        }
    ]
    return test_samples[:limit]


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def run_inference(
    samples: list,
    model_endpoint: str,
    model_name: str = "default"
) -> list:
    """Run inference on evaluation samples.

    Sends each multiple-choice question to the model's OpenAI-compatible
    chat endpoint and records the predicted letter against the expected
    answer. Raises httpx.HTTPStatusError on non-2xx responses so a broken
    endpoint fails the task loudly rather than producing a KeyError.
    """
    import httpx

    results = []
    with httpx.Client(timeout=120.0) as client:
        for sample in samples:
            prompt = f"""Answer the following multiple choice question.

Question: {sample['question']}

Choices:
A) {sample['choices'][0]}
B) {sample['choices'][1]}
C) {sample['choices'][2]}
D) {sample['choices'][3]}

Answer with just the letter (A, B, C, or D):"""

            response = client.post(
                f"{model_endpoint}/v1/chat/completions",
                json={
                    "model": model_name,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 10,
                    "temperature": 0
                }
            )
            # Fail fast on HTTP errors instead of mis-parsing an error body.
            response.raise_for_status()
            result = response.json()
            answer = result["choices"][0]["message"]["content"].strip().upper()

            results.append({
                "question": sample["question"],
                "expected": sample["answer"],
                "predicted": answer[0] if answer else "X",
                "correct": answer.startswith(sample["answer"])
            })

    return results


@dsl.component(
    base_image="python:3.13-slim"
)
def calculate_metrics(
    results: list
) -> dict:
    """Calculate evaluation metrics.

    Returns accuracy, raw correct/total counts, and a boolean ``pass``
    computed against a 70% accuracy threshold. An empty result list yields
    accuracy 0 (and therefore a failing evaluation) rather than dividing
    by zero.
    """
    correct = sum(1 for r in results if r["correct"])
    total = len(results)
    accuracy = correct / total if total > 0 else 0

    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "pass": accuracy >= 0.7  # 70% threshold
    }


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["nats-py"]
)
def publish_results(
    metrics: dict,
    model_name: str,
    nats_url: str = "nats://nats.ai-ml.svc.cluster.local:4222"
) -> str:
    """Publish evaluation results to NATS.

    The nats-py dependency is installed via the component's
    ``packages_to_install`` rather than a runtime ``pip install``
    subprocess, so the container image build stays reproducible.
    """
    import asyncio
    import json
    import nats

    async def publish():
        nc = await nats.connect(nats_url)
        await nc.publish(
            f"ai.evaluation.results.{model_name}",
            json.dumps({
                "model": model_name,
                "metrics": metrics,
                "status": "passed" if metrics["pass"] else "failed"
            }).encode()
        )
        await nc.close()

    asyncio.run(publish())
    return "published"


@dsl.pipeline(
    name="model-evaluation-pipeline",
    description="Evaluate model performance on benchmarks"
)
def model_evaluation_pipeline(
    model_endpoint: str = "http://llm-draft.ai-ml.svc.cluster.local:8000",
    model_name: str = "default",
    dataset_name: str = "mmlu",
    sample_limit: int = 100
):
    """
    Model Evaluation Pipeline

    Args:
        model_endpoint: URL of the model inference endpoint
        model_name: Name of the model being evaluated
        dataset_name: Evaluation dataset to use
        sample_limit: Maximum samples to evaluate
    """
    # Create MLflow run for this evaluation
    mlflow_run = create_mlflow_run(
        experiment_name="model-evaluation",
        run_name=f"eval-{model_name}-{dataset_name}",
        params={
            "model_endpoint": model_endpoint,
            "model_name": model_name,
            "dataset_name": dataset_name,
            "sample_limit": str(sample_limit),
        },
    )

    # Load dataset (cacheable: the built-in dataset is deterministic)
    load_task = load_eval_dataset(
        dataset_name=dataset_name,
        limit=sample_limit
    )
    load_task.set_caching_options(enable_caching=True)

    # Run inference (never cached: the model behind the endpoint may change)
    inference_task = run_inference(
        samples=load_task.output,
        model_endpoint=model_endpoint,
        model_name=model_name
    )
    inference_task.set_caching_options(enable_caching=False)

    # Calculate metrics
    metrics_task = calculate_metrics(results=inference_task.output)

    # Log results to MLflow
    log_task = log_evaluation_to_mlflow(
        run_id=mlflow_run.outputs["run_id"],
        model_name=model_name,
        dataset_name=dataset_name,
        metrics=metrics_task.output,
        results=inference_task.output,
    )

    # Publish results to NATS only after MLflow logging has completed,
    # so consumers never see results that are missing from the tracker.
    publish_task = publish_results(
        metrics=metrics_task.output,
        model_name=model_name
    )
    publish_task.after(log_task)


if __name__ == "__main__":
    compiler.Compiler().compile(
        model_evaluation_pipeline,
        "evaluation_pipeline.yaml"
    )
    print("Compiled: evaluation_pipeline.yaml")