feat: Add Kubeflow Pipeline definitions
- voice_pipeline: STT → RAG → LLM → TTS - document_ingestion_pipeline: Extract → Chunk → Embed → Milvus - document_ingestion_mlflow_pipeline: With MLflow tracking - evaluation_pipeline: Model benchmarking - kfp-sync-job: K8s job to sync pipelines
This commit is contained in:
257
document_ingestion_pipeline.py
Normal file
257
document_ingestion_pipeline.py
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Document Ingestion Pipeline - Kubeflow Pipelines SDK
|
||||
|
||||
Ingests documents into Milvus vector database with embeddings.
|
||||
Can be triggered from Argo Workflows via the kfp-trigger template.
|
||||
|
||||
Usage:
|
||||
pip install kfp==2.12.1
|
||||
python document_ingestion_pipeline.py
|
||||
# Upload document_ingestion.yaml to Kubeflow Pipelines UI
|
||||
"""
|
||||
|
||||
from kfp import dsl
|
||||
from kfp import compiler
|
||||
from typing import List
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx", "beautifulsoup4", "pypdf2", "docx2txt"]
)
def extract_text(
    source_url: str,
    source_type: str = "auto"
) -> str:
    """Extract plain text from a remote document.

    Args:
        source_url: HTTP(S) URL of the document to fetch.
        source_type: One of "pdf", "docx", "html", "text", or "auto"
            (detect from the URL's file extension).

    Returns:
        The extracted text content.

    Raises:
        httpx.HTTPStatusError: If the download returns a non-2xx status.
    """
    import httpx
    import os
    import tempfile

    with httpx.Client(timeout=120.0) as client:
        response = client.get(source_url)
        # Fail fast on HTTP errors instead of parsing an error page as content.
        response.raise_for_status()
        content = response.content

    # Detect type from the URL extension if not given explicitly.
    if source_type == "auto":
        # Lowercase and strip any query string so "X.PDF?v=1" still matches.
        path = source_url.split("?", 1)[0].lower()
        if path.endswith(".pdf"):
            source_type = "pdf"
        elif path.endswith(".docx"):
            source_type = "docx"
        elif path.endswith((".html", ".htm")):
            source_type = "html"
        else:
            source_type = "text"

    if source_type == "pdf":
        import io
        from PyPDF2 import PdfReader
        reader = PdfReader(io.BytesIO(content))
        text = "\n\n".join(page.extract_text() for page in reader.pages)
    elif source_type == "docx":
        import docx2txt
        # docx2txt needs a real file path: write, close, process, then
        # remove the file so the container doesn't accumulate temp files.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            f.write(content)
            tmp_path = f.name
        try:
            text = docx2txt.process(tmp_path)
        finally:
            os.unlink(tmp_path)
    elif source_type == "html":
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n")
    else:
        # Plain text (or unknown): best-effort UTF-8 decode.
        text = content.decode("utf-8", errors="ignore")

    return text
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["tiktoken"]
)
def chunk_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> list:
    """Split text into overlapping token-based chunks.

    Args:
        text: Input text to split.
        chunk_size: Maximum tokens per chunk.
        overlap: Tokens shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        List of dicts with "text", "start_token", and "end_token" keys.

    Raises:
        ValueError: If overlap >= chunk_size.
    """
    import tiktoken

    if overlap >= chunk_size:
        # A non-positive step below would make the loop never terminate.
        raise ValueError("overlap must be smaller than chunk_size")

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append({
            # Note: decoded locally; avoid shadowing the component name.
            "text": enc.decode(tokens[start:end]),
            "start_token": start,
            "end_token": end
        })
        if end == len(tokens):
            # Stop once the tail is consumed; otherwise the overlap step
            # can emit a final chunk fully contained in the previous one.
            break
        start += chunk_size - overlap

    return chunks
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def generate_embeddings_batch(
    chunks: list,
    embeddings_url: str = "http://embeddings-predictor.ai-ml.svc.cluster.local",
    batch_size: int = 32
) -> list:
    """Generate embeddings for all chunks via the embeddings service.

    Args:
        chunks: Chunk dicts, each with a "text" key.
        embeddings_url: Base URL of the OpenAI-compatible embeddings API.
        batch_size: Number of texts sent per request.

    Returns:
        Copies of the input chunks, each with an added "embedding" key,
        in the original order.

    Raises:
        httpx.HTTPStatusError: If the embeddings service returns an error.
    """
    import httpx

    if not chunks:
        # Nothing to embed; skip the HTTP round-trips entirely.
        return []

    embedded_chunks = []
    texts = [c["text"] for c in chunks]

    with httpx.Client(timeout=300.0) as client:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = client.post(
                f"{embeddings_url}/embeddings",
                # NOTE(review): bge-small-en-v1.5 typically emits 384-dim
                # vectors, but the Milvus schema in this file declares
                # dim=1024 — confirm the deployed model and collection agree.
                json={"input": batch, "model": "bge-small-en-v1.5"}
            )
            # Surface service errors instead of a confusing KeyError on "data".
            response.raise_for_status()
            result = response.json()

            for j, embedding_data in enumerate(result["data"]):
                # Copy so the input chunk dicts are not mutated in place.
                chunk = chunks[i + j].copy()
                chunk["embedding"] = embedding_data["embedding"]
                embedded_chunks.append(chunk)

    return embedded_chunks
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["pymilvus"]
)
def upsert_to_milvus(
    chunks: list,
    collection_name: str,
    source_name: str,
    milvus_host: str = "milvus.ai-ml.svc.cluster.local"
) -> int:
    """Insert embedded chunks into a Milvus collection.

    Creates the collection (and an HNSW index) on first use.

    Args:
        chunks: Chunk dicts containing "text" and "embedding" keys.
        collection_name: Target Milvus collection.
        source_name: Source label stored alongside each chunk.
        milvus_host: Milvus service hostname (port 19530 assumed).

    Returns:
        Number of chunks inserted (0 when chunks is empty).
    """
    from pymilvus import (
        Collection,
        CollectionSchema,
        DataType,
        FieldSchema,
        connections,
        utility,
    )

    if not chunks:
        # Nothing to store; avoids an invalid empty insert batch.
        return 0

    connections.connect(host=milvus_host, port=19530)
    try:
        # Create the collection on first use.
        if not utility.has_collection(collection_name):
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
                FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=512),
                # NOTE(review): dim=1024 must match the embedding model's
                # output size (bge-small-en-v1.5 is typically 384) — confirm.
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024)
            ]
            schema = CollectionSchema(fields, description="Document embeddings")
            Collection(name=collection_name, schema=schema)

        collection = Collection(collection_name)

        # Column-ordered data matching the schema ("id" is auto-generated).
        data = [
            [chunk["text"] for chunk in chunks],
            [source_name for _ in chunks],
            [chunk["embedding"] for chunk in chunks]
        ]

        collection.insert(data)
        collection.flush()

        # Build the ANN index once, after the first insert.
        if not collection.has_index():
            collection.create_index(
                field_name="embedding",
                index_params={
                    "metric_type": "COSINE",
                    "index_type": "HNSW",
                    "params": {"M": 16, "efConstruction": 256}
                }
            )
    finally:
        # Don't leak the connection if the worker process is reused.
        connections.disconnect("default")

    return len(chunks)
|
||||
|
||||
|
||||
@dsl.pipeline(
    name="document-ingestion-pipeline",
    description="Ingest documents into Milvus: Extract -> Chunk -> Embed -> Store"
)
def document_ingestion_pipeline(
    source_url: str,
    collection_name: str = "knowledge_base",
    source_name: str = "",
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Four-stage ingestion pipeline: extract -> chunk -> embed -> store.

    Args:
        source_url: URL to the document (PDF, DOCX, HTML, or plain text)
        collection_name: Milvus collection to store embeddings
        source_name: Human-readable name for the source; falls back to
            the URL when empty
        chunk_size: Token count per chunk
        chunk_overlap: Overlap between chunks
    """
    # Stage 1: pull the raw text out of the document (cached across reruns).
    extraction = extract_text(source_url=source_url)
    extraction.set_caching_options(enable_caching=True)

    # Stage 2: split the text into overlapping token windows.
    chunking = chunk_text(
        text=extraction.output,
        chunk_size=chunk_size,
        overlap=chunk_overlap
    )
    chunking.set_caching_options(enable_caching=True)

    # Stage 3: embed every chunk via the embeddings service.
    embedding = generate_embeddings_batch(chunks=chunking.output)
    embedding.set_caching_options(enable_caching=True)

    # Stage 4: persist vectors into Milvus, labelling them with the
    # human-readable name or, when that is empty, the source URL.
    upsert_to_milvus(
        chunks=embedding.output,
        collection_name=collection_name,
        source_name=source_name if source_name else source_url
    )
|
||||
|
||||
|
||||
@dsl.pipeline(
    name="batch-document-ingestion",
    description="Ingest multiple documents in parallel"
)
def batch_document_ingestion_pipeline(
    source_urls: List[str],
    collection_name: str = "knowledge_base"
):
    """Fan the single-document pipeline out over a list of URLs.

    Args:
        source_urls: Document URLs to ingest — one parallel branch each
        collection_name: Milvus collection shared by every branch
    """
    # Each URL runs the full ingestion pipeline as an independent branch.
    with dsl.ParallelFor(source_urls) as doc_url:
        document_ingestion_pipeline(
            source_url=doc_url,
            collection_name=collection_name
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Compile only the primary pipeline; the output filename must match
    # this Python file's name so the kfp-sync job can pair them up.
    output_path = "document_ingestion_pipeline.yaml"
    compiler.Compiler().compile(document_ingestion_pipeline, output_path)
    print(f"Compiled: {output_path}")
|
||||
Reference in New Issue
Block a user