Files
kubeflow/evaluation_pipeline.py
Billy D. cee21f124c feat: add MLflow tracking to evaluation pipeline
- Add create_mlflow_run and log_evaluation_to_mlflow KFP components
- Log accuracy, correct/total counts, pass/fail to MLflow experiment
- Upload evaluation_results.json as artifact
- Wire MLflow run into pipeline DAG before NATS publish
2026-02-12 06:15:13 -05:00

314 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Model Evaluation Pipeline - Kubeflow Pipelines SDK
Evaluates fine-tuned models against benchmarks.
Integrates with Argo Workflows for automated model deployment.
Logs evaluation results to MLflow for experiment tracking.
Usage:
pip install kfp==2.12.1 mlflow boto3 psycopg2-binary
python evaluation_pipeline.py
"""
from typing import Any, Dict, List, NamedTuple, Optional

from kfp import compiler
from kfp import dsl
# ---- MLflow KFP components (inline to avoid external dep) ----
MLFLOW_IMAGE = "python:3.13-slim"
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_mlflow_run(
    experiment_name: str,
    run_name: str,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
    tags: Optional[Dict[str, str]] = None,
    params: Optional[Dict[str, str]] = None,
) -> NamedTuple('RunInfo', [('run_id', str), ('experiment_id', str)]):
    """Create an MLflow run and return its identifiers.

    The run is opened, tagged, optionally seeded with params, and then
    immediately closed; downstream components log against it by run_id
    and re-terminate it when evaluation completes.

    Args:
        experiment_name: MLflow experiment to create or reuse.
        run_name: Display name for the new run.
        mlflow_tracking_uri: MLflow tracking server URL.
        tags: Extra tags merged over the defaults (may override them).
        params: Optional parameters to log on the run.

    Returns:
        RunInfo(run_id, experiment_id) for downstream logging.
    """
    import os

    import mlflow
    from mlflow.tracking import MlflowClient
    from collections import namedtuple

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()
    # Reuse the experiment if it already exists; otherwise create it with a
    # deterministic artifact location.
    exp = client.get_experiment_by_name(experiment_name)
    experiment_id = exp.experiment_id if exp else client.create_experiment(
        name=experiment_name, artifact_location=f"/mlflow/artifacts/{experiment_name}"
    )
    default_tags = {
        "pipeline.type": "evaluation",
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }
    if tags:
        default_tags.update(tags)
    run = mlflow.start_run(experiment_id=experiment_id, run_name=run_name, tags=default_tags)
    try:
        if params:
            mlflow.log_params(params)
        run_id = run.info.run_id
    finally:
        # Always close the run: a failure in log_params must not leave it
        # stuck in RUNNING state on the tracking server.
        mlflow.end_run()
    RunInfo = namedtuple('RunInfo', ['run_id', 'experiment_id'])
    return RunInfo(run_id, experiment_id)
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_evaluation_to_mlflow(
    run_id: str,
    model_name: str,
    dataset_name: str,
    metrics: dict,
    results: list,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Record an evaluation outcome on an existing MLflow run.

    Logs model/dataset params, every metric under an ``eval.`` prefix,
    the raw per-sample results as a JSON artifact, pass/fail tags, and
    finally marks the run FINISHED.

    Args:
        run_id: Id of the MLflow run created earlier in the pipeline.
        model_name: Model under evaluation.
        dataset_name: Benchmark dataset name.
        metrics: Numeric summary metrics (accuracy, correct, total, pass).
        results: Per-sample evaluation records to upload as an artifact.
        mlflow_tracking_uri: MLflow tracking server URL.

    Returns:
        The same run_id, for chaining.
    """
    import json
    import tempfile

    import mlflow
    from mlflow.tracking import MlflowClient
    from pathlib import Path

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    tracker = MlflowClient()

    tracker.log_param(run_id, "eval.model_name", model_name)
    tracker.log_param(run_id, "eval.dataset", dataset_name)
    for metric_name, metric_value in metrics.items():
        # MLflow metrics must be floats; bools/ints coerce cleanly.
        tracker.log_metric(run_id, f"eval.{metric_name}", float(metric_value))

    if results:
        # Stage the JSON dump in a throwaway directory; log_artifact copies it
        # into the run's artifact store before the directory is removed.
        with tempfile.TemporaryDirectory() as scratch:
            dump_path = Path(scratch) / "evaluation_results.json"
            dump_path.write_text(json.dumps(results, indent=2))
            tracker.log_artifact(run_id, str(dump_path))

    # Prefer an explicit "pass" metric; fall back to the 70% accuracy bar.
    passed = metrics.get("pass", metrics.get("accuracy", 0) >= 0.7)
    tracker.set_tag(run_id, "eval.passed", str(passed))
    tracker.set_tag(run_id, "model.name", model_name)

    # Close out the run now that everything is recorded.
    tracker.set_terminated(run_id, status="FINISHED")
    return run_id
@dsl.component(
    base_image="python:3.13-slim"
)
def load_eval_dataset(
    dataset_name: str = "mmlu",
    subset: str = "test",
    limit: int = 100
) -> list:
    """Load evaluation dataset samples.

    Currently returns a small hard-coded multiple-choice test set; in
    production this would load from HuggingFace or S3 (at which point
    ``dataset_name`` and ``subset`` would select the data).

    Args:
        dataset_name: Dataset identifier (unused by the stub).
        subset: Dataset split (unused by the stub).
        limit: Maximum number of samples to return.

    Returns:
        A list of dicts with "question", "choices", and "answer" keys,
        where "answer" is the letter of the correct choice.
    """
    # NOTE: the previous version imported httpx/json (and pip-installed
    # httpx) without using them; the unused dependency has been dropped.
    test_samples = [
        {
            "question": "What is the capital of France?",
            "choices": ["London", "Berlin", "Paris", "Madrid"],
            "answer": "C"
        },
        {
            "question": "Which planet is known as the Red Planet?",
            "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
            "answer": "B"
        },
        {
            "question": "What is 2 + 2?",
            "choices": ["3", "4", "5", "6"],
            "answer": "B"
        }
    ]
    return test_samples[:limit]
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def run_inference(
    samples: list,
    model_endpoint: str,
    model_name: str = "default"
) -> list:
    """Run inference on evaluation samples.

    Builds a lettered multiple-choice prompt per sample, queries the
    OpenAI-compatible chat endpoint with temperature 0, and grades the
    first letter of the model's reply against the expected answer.

    Args:
        samples: Dicts with "question", "choices" (up to 26 entries),
            and "answer" (the expected letter).
        model_endpoint: Base URL of the inference service.
        model_name: Model identifier passed in the request body.

    Returns:
        A list of dicts with question, expected, predicted, and correct.
    """
    import string

    import httpx

    results = []
    with httpx.Client(timeout=120.0) as client:
        for sample in samples:
            # Generalized over the previous hard-coded 4 choices: label each
            # choice A, B, C, ... For 4-choice samples the prompt text is
            # byte-identical to the old version.
            letters = string.ascii_uppercase[:len(sample["choices"])]
            choices_block = "\n".join(
                f"{letter}) {choice}"
                for letter, choice in zip(letters, sample["choices"])
            )
            letter_list = ", ".join(letters[:-1]) + f", or {letters[-1]}"
            prompt = (
                "Answer the following multiple choice question.\n"
                f"Question: {sample['question']}\n"
                "Choices:\n"
                f"{choices_block}\n"
                f"Answer with just the letter ({letter_list}):"
            )
            response = client.post(
                f"{model_endpoint}/v1/chat/completions",
                json={
                    "model": model_name,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 10,
                    "temperature": 0
                }
            )
            # Fail loudly on HTTP errors instead of a confusing KeyError
            # when indexing into an error-response body.
            response.raise_for_status()
            result = response.json()
            answer = result["choices"][0]["message"]["content"].strip().upper()
            results.append({
                "question": sample["question"],
                "expected": sample["answer"],
                # "X" marks an empty reply so downstream grading never matches.
                "predicted": answer[0] if answer else "X",
                "correct": answer.startswith(sample["answer"])
            })
    return results
@dsl.component(
    base_image="python:3.13-slim"
)
def calculate_metrics(
    results: list
) -> dict:
    """Aggregate per-sample correctness flags into summary metrics.

    Args:
        results: Dicts each carrying a boolean "correct" field.

    Returns:
        Dict with accuracy (0 when results is empty), correct/total
        counts, and a boolean "pass" at the 70% accuracy threshold.
    """
    total = len(results)
    correct = len([r for r in results if r["correct"]])
    accuracy = 0 if total == 0 else correct / total
    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "pass": accuracy >= 0.7  # 70% threshold
    }
@dsl.component(
    base_image="python:3.13-slim",
    # nats-py is installed at image/component build time; the previous
    # version pip-installed it with subprocess on every run, which was slow,
    # needed network access at execution time, and declared an unused httpx.
    packages_to_install=["nats-py"]
)
def publish_results(
    metrics: dict,
    model_name: str,
    nats_url: str = "nats://nats.ai-ml.svc.cluster.local:4222"
) -> str:
    """Publish evaluation results to NATS.

    Emits one JSON message on ``ai.evaluation.results.<model_name>`` with
    the metrics and an overall passed/failed status.

    Args:
        metrics: Summary metrics; must contain a boolean "pass" key.
        model_name: Model identifier, also used in the subject name.
        nats_url: NATS server URL.

    Returns:
        The literal string "published".
    """
    import asyncio
    import json

    import nats

    async def publish():
        nc = await nats.connect(nats_url)
        try:
            await nc.publish(
                f"ai.evaluation.results.{model_name}",
                json.dumps({
                    "model": model_name,
                    "metrics": metrics,
                    "status": "passed" if metrics["pass"] else "failed"
                }).encode()
            )
        finally:
            # Close the connection even if publish fails, so the component
            # doesn't leak the socket or hang on shutdown.
            await nc.close()

    asyncio.run(publish())
    return "published"
@dsl.pipeline(
    name="model-evaluation-pipeline",
    description="Evaluate model performance on benchmarks"
)
def model_evaluation_pipeline(
    model_endpoint: str = "http://llm-draft.ai-ml.svc.cluster.local:8000",
    model_name: str = "default",
    dataset_name: str = "mmlu",
    sample_limit: int = 100
):
    """Evaluate a served model on a benchmark and record the outcome.

    DAG: create MLflow run -> load dataset -> run inference -> compute
    metrics -> log to MLflow -> publish to NATS (after MLflow logging).

    Args:
        model_endpoint: URL of the model inference endpoint.
        model_name: Name of the model being evaluated.
        dataset_name: Evaluation dataset to use.
        sample_limit: Maximum samples to evaluate.
    """
    # Open an MLflow run up front so downstream tasks can log against it.
    tracking_task = create_mlflow_run(
        experiment_name="model-evaluation",
        run_name=f"eval-{model_name}-{dataset_name}",
        params={
            "model_endpoint": model_endpoint,
            "model_name": model_name,
            "dataset_name": dataset_name,
            "sample_limit": str(sample_limit),
        },
    )

    # Dataset loading is deterministic, so cached outputs are safe to reuse.
    dataset_task = load_eval_dataset(
        dataset_name=dataset_name,
        limit=sample_limit,
    )
    dataset_task.set_caching_options(enable_caching=True)

    # Never cache inference: the model behind the endpoint may have changed.
    predict_task = run_inference(
        samples=dataset_task.output,
        model_endpoint=model_endpoint,
        model_name=model_name,
    )
    predict_task.set_caching_options(enable_caching=False)

    score_task = calculate_metrics(results=predict_task.output)

    # Persist metrics and raw results onto the MLflow run created above.
    record_task = log_evaluation_to_mlflow(
        run_id=tracking_task.outputs["run_id"],
        model_name=model_name,
        dataset_name=dataset_name,
        metrics=score_task.output,
        results=predict_task.output,
    )

    # Announce on NATS only once MLflow holds the complete record.
    announce_task = publish_results(
        metrics=score_task.output,
        model_name=model_name,
    )
    announce_task.after(record_task)
if __name__ == "__main__":
    # Compile the pipeline DAG into a YAML spec uploadable to Kubeflow.
    output_path = "evaluation_pipeline.yaml"
    compiler.Compiler().compile(model_evaluation_pipeline, output_path)
    print(f"Compiled: {output_path}")