- Add create_mlflow_run and log_evaluation_to_mlflow KFP components
- Log accuracy, correct/total counts, and pass/fail status to the MLflow experiment
- Upload evaluation_results.json as a run artifact
- Wire the MLflow run into the pipeline DAG before the NATS publish step
(314 lines, 8.6 KiB, Python)
#!/usr/bin/env python3
|
|
"""
|
|
Model Evaluation Pipeline - Kubeflow Pipelines SDK
|
|
|
|
Evaluates fine-tuned models against benchmarks.
|
|
Integrates with Argo Workflows for automated model deployment.
|
|
Logs evaluation results to MLflow for experiment tracking.
|
|
|
|
Usage:
|
|
pip install kfp==2.12.1 mlflow boto3 psycopg2-binary
|
|
python evaluation_pipeline.py
|
|
"""
|
|
|
|
from typing import Any, Dict, List, NamedTuple, Optional

from kfp import compiler
from kfp import dsl
|
|
|
|
|
|
# ---- MLflow KFP components (inline to avoid external dep) ----

# Base container image used by the MLflow helper components below.
MLFLOW_IMAGE = "python:3.13-slim"
# Pip packages installed into that image when the components run.
MLFLOW_PACKAGES = ["mlflow>=2.10.0", "boto3", "psycopg2-binary"]
|
|
|
|
|
|
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def create_mlflow_run(
    experiment_name: str,
    run_name: str,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
    tags: Optional[Dict[str, str]] = None,
    params: Optional[Dict[str, str]] = None,
) -> NamedTuple('RunInfo', [('run_id', str), ('experiment_id', str)]):
    """Create (or reuse) an MLflow experiment and open a run in it.

    The run is ended immediately after creation; downstream components
    re-attach to it by ``run_id`` through the MlflowClient API and
    terminate it themselves.

    Args:
        experiment_name: Experiment to look up by name, created if absent.
        run_name: Display name for the new run.
        mlflow_tracking_uri: URI of the MLflow tracking server.
        tags: Extra run tags merged over the defaults
            (``pipeline.type``, ``kfp.run_id``).
        params: Optional params logged onto the run at creation time.

    Returns:
        RunInfo(run_id, experiment_id) for downstream logging components.
    """
    import os
    from collections import namedtuple

    import mlflow
    from mlflow.tracking import MlflowClient

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    # Reuse the experiment if it already exists; otherwise create it with a
    # deterministic artifact location keyed by the experiment name.
    exp = client.get_experiment_by_name(experiment_name)
    experiment_id = exp.experiment_id if exp else client.create_experiment(
        name=experiment_name, artifact_location=f"/mlflow/artifacts/{experiment_name}"
    )

    default_tags = {
        "pipeline.type": "evaluation",
        # KFP injects its run id via env; fall back to "unknown" outside KFP.
        "kfp.run_id": os.environ.get("KFP_RUN_ID", "unknown"),
    }
    if tags:
        default_tags.update(tags)

    run = mlflow.start_run(experiment_id=experiment_id, run_name=run_name, tags=default_tags)
    if params:
        mlflow.log_params(params)
    run_id = run.info.run_id
    # End immediately: later components re-attach by run_id and terminate it.
    mlflow.end_run()

    RunInfo = namedtuple('RunInfo', ['run_id', 'experiment_id'])
    return RunInfo(run_id, experiment_id)
|
|
|
|
|
|
@dsl.component(base_image=MLFLOW_IMAGE, packages_to_install=MLFLOW_PACKAGES)
def log_evaluation_to_mlflow(
    run_id: str,
    model_name: str,
    dataset_name: str,
    metrics: dict,
    results: list,
    mlflow_tracking_uri: str = "http://mlflow.mlflow.svc.cluster.local:80",
) -> str:
    """Log evaluation metrics, params, and sample results to MLflow.

    Attaches to the run opened by ``create_mlflow_run`` via ``run_id``,
    records params/metrics/tags, uploads the per-sample results as a JSON
    artifact, and terminates the run.
    """
    import json
    import tempfile
    from pathlib import Path

    import mlflow
    from mlflow.tracking import MlflowClient

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    client = MlflowClient()

    # Record what was evaluated.
    client.log_param(run_id, "eval.model_name", model_name)
    client.log_param(run_id, "eval.dataset", dataset_name)

    # Every metric value is coerced to float (bools become 0.0/1.0).
    for name, raw in metrics.items():
        client.log_metric(run_id, f"eval.{name}", float(raw))

    # Attach the per-sample results as a JSON artifact, if there are any.
    if results:
        with tempfile.TemporaryDirectory() as scratch:
            artifact = Path(scratch) / "evaluation_results.json"
            artifact.write_text(json.dumps(results, indent=2))
            client.log_artifact(run_id, str(artifact))

    # Prefer the explicit "pass" flag; fall back to a 70% accuracy threshold.
    passed = metrics.get("pass", metrics.get("accuracy", 0) >= 0.7)
    client.set_tag(run_id, "eval.passed", str(passed))
    client.set_tag(run_id, "model.name", model_name)

    # Close out the run opened by create_mlflow_run.
    client.set_terminated(run_id, status="FINISHED")
    return run_id
|
|
|
|
|
|
@dsl.component(
    base_image="python:3.13-slim"
)
def load_eval_dataset(
    dataset_name: str = "mmlu",
    subset: str = "test",
    limit: int = 100
) -> list:
    """Load evaluation dataset samples.

    Currently returns a small built-in multiple-choice test set; in
    production this would load ``dataset_name``/``subset`` from
    HuggingFace or S3 (both parameters are accepted but not yet used).

    Args:
        dataset_name: Benchmark to load (placeholder, currently unused).
        subset: Dataset split (placeholder, currently unused).
        limit: Maximum number of samples to return; values below zero
            are treated as zero.

    Returns:
        A list of ``{"question", "choices", "answer"}`` dicts.
    """
    # NOTE: the previous version imported httpx/json without using them and
    # pip-installed httpx for nothing; both removed.
    test_samples = [
        {
            "question": "What is the capital of France?",
            "choices": ["London", "Berlin", "Paris", "Madrid"],
            "answer": "C"
        },
        {
            "question": "Which planet is known as the Red Planet?",
            "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
            "answer": "B"
        },
        {
            "question": "What is 2 + 2?",
            "choices": ["3", "4", "5", "6"],
            "answer": "B"
        }
    ]

    # Clamp so a negative limit yields an empty list rather than silently
    # slicing samples off the end (test_samples[:-1] behavior).
    return test_samples[:max(limit, 0)]
|
|
|
|
|
|
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def run_inference(
    samples: list,
    model_endpoint: str,
    model_name: str = "default"
) -> list:
    """Run inference on evaluation samples.

    Sends each multiple-choice question to an OpenAI-compatible
    ``/v1/chat/completions`` endpoint and grades the single-letter reply
    against the expected answer.

    Args:
        samples: Dicts with "question", "choices" (list of option
            strings), and "answer" (expected letter, e.g. "B").
        model_endpoint: Base URL of the inference server.
        model_name: Model identifier passed in the request body.

    Returns:
        One dict per sample: question, expected, predicted, correct.
    """
    import httpx

    results = []

    with httpx.Client(timeout=120.0) as client:
        for sample in samples:
            # Label choices A, B, C, ... so any number of options works
            # (identical prompt text to before for the 4-choice case).
            choice_lines = "\n".join(
                f"{chr(ord('A') + i)}) {choice}"
                for i, choice in enumerate(sample["choices"])
            )
            prompt = (
                "Answer the following multiple choice question.\n\n"
                f"Question: {sample['question']}\n"
                f"Choices:\n{choice_lines}\n\n"
                "Answer with just the letter (A, B, C, or D):"
            )

            response = client.post(
                f"{model_endpoint}/v1/chat/completions",
                json={
                    "model": model_name,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 10,
                    "temperature": 0
                }
            )
            # Fail loudly on HTTP errors instead of surfacing a confusing
            # KeyError when the error body lacks "choices".
            response.raise_for_status()

            result = response.json()
            answer = result["choices"][0]["message"]["content"].strip().upper()

            results.append({
                "question": sample["question"],
                "expected": sample["answer"],
                # "X" marks an empty/unparseable model reply.
                "predicted": answer[0] if answer else "X",
                "correct": answer.startswith(sample["answer"])
            })

    return results
|
|
|
|
|
|
@dsl.component(
    base_image="python:3.13-slim"
)
def calculate_metrics(
    results: list
) -> dict:
    """Calculate evaluation metrics.

    Aggregates the per-sample grades produced by run_inference into
    accuracy, raw counts, and a boolean pass/fail verdict.
    """
    total = len(results)
    correct = len([item for item in results if item["correct"]])

    # Guard the empty-results case to avoid ZeroDivisionError.
    accuracy = correct / total if total > 0 else 0

    return {
        "accuracy": accuracy,
        "correct": correct,
        "total": total,
        "pass": accuracy >= 0.7  # 70% threshold
    }
|
|
|
|
|
|
@dsl.component(
    base_image="python:3.13-slim",
    # Declare nats-py up front instead of pip-installing it at runtime via
    # subprocess, which was slow and bypassed KFP's dependency handling.
    # (httpx was previously installed here but never imported.)
    packages_to_install=["nats-py"]
)
def publish_results(
    metrics: dict,
    model_name: str,
    nats_url: str = "nats://nats.ai-ml.svc.cluster.local:4222"
) -> str:
    """Publish evaluation results to NATS.

    Emits a JSON payload on subject ``ai.evaluation.results.<model_name>``
    containing the metrics and a passed/failed status string.

    Args:
        metrics: Output of calculate_metrics; must contain a "pass" bool.
        model_name: Model identifier, interpolated into the NATS subject.
        nats_url: NATS server URL.

    Returns:
        The literal string "published" on success.
    """
    import asyncio
    import json

    import nats

    async def publish():
        nc = await nats.connect(nats_url)
        await nc.publish(
            f"ai.evaluation.results.{model_name}",
            json.dumps({
                "model": model_name,
                "metrics": metrics,
                "status": "passed" if metrics["pass"] else "failed"
            }).encode()
        )
        # Flush and close the connection before the component exits.
        await nc.close()

    asyncio.run(publish())
    return "published"
|
|
|
|
|
|
@dsl.pipeline(
    name="model-evaluation-pipeline",
    description="Evaluate model performance on benchmarks"
)
def model_evaluation_pipeline(
    model_endpoint: str = "http://llm-draft.ai-ml.svc.cluster.local:8000",
    model_name: str = "default",
    dataset_name: str = "mmlu",
    sample_limit: int = 100
):
    """
    Model Evaluation Pipeline

    DAG: create_mlflow_run and load_eval_dataset start independently;
    run_inference -> calculate_metrics -> log_evaluation_to_mlflow ->
    publish_results (NATS publish explicitly ordered after MLflow logging).

    Args:
        model_endpoint: URL of the model inference endpoint
        model_name: Name of the model being evaluated
        dataset_name: Evaluation dataset to use
        sample_limit: Maximum samples to evaluate
    """

    # Create MLflow run for this evaluation.
    # NOTE(review): run_name uses an f-string over pipeline parameter
    # channels and params uses str(sample_limit) on an int channel — this
    # relies on KFP v2 runtime string interpolation of placeholders;
    # confirm the compiled YAML resolves them as intended.
    mlflow_run = create_mlflow_run(
        experiment_name="model-evaluation",
        run_name=f"eval-{model_name}-{dataset_name}",
        params={
            "model_endpoint": model_endpoint,
            "model_name": model_name,
            "dataset_name": dataset_name,
            "sample_limit": str(sample_limit),
        },
    )

    # Load dataset; cached since the built-in sample set is deterministic.
    load_task = load_eval_dataset(
        dataset_name=dataset_name,
        limit=sample_limit
    )
    load_task.set_caching_options(enable_caching=True)

    # Run inference; caching disabled so a redeployed model is re-evaluated.
    inference_task = run_inference(
        samples=load_task.output,
        model_endpoint=model_endpoint,
        model_name=model_name
    )
    inference_task.set_caching_options(enable_caching=False)

    # Aggregate per-sample grades into accuracy/counts/pass.
    metrics_task = calculate_metrics(results=inference_task.output)

    # Log results to MLflow, attaching to the run opened above by run_id.
    log_task = log_evaluation_to_mlflow(
        run_id=mlflow_run.outputs["run_id"],
        model_name=model_name,
        dataset_name=dataset_name,
        metrics=metrics_task.output,
        results=inference_task.output,
    )

    # Publish results to NATS only after MLflow logging has completed,
    # so consumers of the event can look the run up in MLflow.
    publish_task = publish_results(
        metrics=metrics_task.output,
        model_name=model_name
    )
    publish_task.after(log_task)
|
|
|
|
|
|
if __name__ == "__main__":
    # Compile the pipeline definition into a KFP pipeline spec YAML.
    output_path = "evaluation_pipeline.yaml"
    compiler.Compiler().compile(model_evaluation_pipeline, output_path)
    print(f"Compiled: {output_path}")
|