#!/usr/bin/env python3
"""
Document Ingestion Pipeline - Kubeflow Pipelines SDK

Ingests documents into Milvus vector database with embeddings.
Can be triggered from Argo Workflows via the kfp-trigger template.

Usage:
    pip install kfp==2.12.1
    python document_ingestion_pipeline.py
    # Upload document_ingestion_pipeline.yaml to Kubeflow Pipelines UI
"""

from kfp import dsl
from kfp import compiler
from typing import List


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx", "beautifulsoup4", "pypdf2", "docx2txt"]
)
def extract_text(
    source_url: str,
    source_type: str = "auto"
) -> str:
    """Extract text from various document formats.

    Args:
        source_url: URL of the document to download.
        source_type: "pdf", "docx", "html", "text", or "auto" to
            detect the format from the URL suffix.

    Returns:
        The extracted plain text.

    Raises:
        httpx.HTTPStatusError: If the download returns an error status.
    """
    import os
    import tempfile

    import httpx

    with httpx.Client(timeout=120.0) as client:
        response = client.get(source_url)
        # Fail fast on HTTP errors instead of silently parsing an
        # error page body as if it were the document.
        response.raise_for_status()
        content = response.content

    # Detect type from the URL suffix if auto.
    # NOTE(review): endswith() won't match URLs carrying query strings
    # (e.g. ".pdf?token=..."); such URLs fall through to "text".
    if source_type == "auto":
        if source_url.endswith(".pdf"):
            source_type = "pdf"
        elif source_url.endswith(".docx"):
            source_type = "docx"
        elif source_url.endswith((".html", ".htm")):
            source_type = "html"
        else:
            source_type = "text"

    if source_type == "pdf":
        import io
        from PyPDF2 import PdfReader
        reader = PdfReader(io.BytesIO(content))
        text = "\n\n".join([page.extract_text() for page in reader.pages])
    elif source_type == "docx":
        import docx2txt
        # docx2txt needs a real file path; remove the temp file afterwards
        # so repeated runs in the same container don't leak files.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            f.write(content)
        try:
            text = docx2txt.process(f.name)
        finally:
            os.unlink(f.name)
    elif source_type == "html":
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n")
    else:
        text = content.decode("utf-8", errors="ignore")

    return text


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["tiktoken"]
)
def chunk_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> list:
    """Split text into overlapping token chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks.

    Returns:
        A list of dicts with keys "text", "start_token", "end_token".

    Raises:
        ValueError: If overlap >= chunk_size (the loop would never advance).
    """
    import tiktoken

    # Guard: with overlap >= chunk_size the window step (chunk_size - overlap)
    # is <= 0 and the while-loop below would spin forever.
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append({
            "text": enc.decode(tokens[start:end]),
            "start_token": start,
            "end_token": end
        })
        start += chunk_size - overlap
    return chunks


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def generate_embeddings_batch(
    chunks: list,
    embeddings_url: str = "http://embeddings-predictor.ai-ml.svc.cluster.local",
    batch_size: int = 32
) -> list:
    """Generate embeddings for all chunks via the embeddings service.

    Args:
        chunks: Chunk dicts as produced by chunk_text.
        embeddings_url: Base URL of the OpenAI-compatible embeddings endpoint.
        batch_size: Number of texts sent per request.

    Returns:
        The input chunks, each copied with an added "embedding" key.

    Raises:
        httpx.HTTPStatusError: If the embeddings service returns an error.
    """
    import httpx

    embedded_chunks = []
    texts = [c["text"] for c in chunks]

    with httpx.Client(timeout=300.0) as client:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = client.post(
                f"{embeddings_url}/embeddings",
                json={"input": batch, "model": "bge-small-en-v1.5"}
            )
            # Surface service errors instead of KeyError-ing on the JSON below.
            response.raise_for_status()
            result = response.json()
            # The service is assumed to return embeddings in input order
            # (OpenAI-compatible "data" array) — pair them back by offset.
            for j, embedding_data in enumerate(result["data"]):
                chunk = chunks[i + j].copy()
                chunk["embedding"] = embedding_data["embedding"]
                embedded_chunks.append(chunk)

    return embedded_chunks


@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["pymilvus"]
)
def upsert_to_milvus(
    chunks: list,
    collection_name: str,
    source_name: str,
    milvus_host: str = "milvus.ai-ml.svc.cluster.local"
) -> int:
    """Upsert embeddings to a Milvus collection, creating it if needed.

    Args:
        chunks: Chunk dicts carrying "text" and "embedding" keys.
        collection_name: Target Milvus collection.
        source_name: Source label stored alongside every chunk.
        milvus_host: Milvus service hostname.

    Returns:
        The number of chunks inserted.
    """
    from pymilvus import (
        Collection,
        CollectionSchema,
        DataType,
        FieldSchema,
        connections,
        utility,
    )

    # Nothing to do for an empty document; avoid an empty insert call.
    if not chunks:
        return 0

    connections.connect(host=milvus_host, port=19530)

    # Create collection if it doesn't exist. The vector dimension is taken
    # from the actual embeddings rather than hard-coded: the upstream model
    # (bge-small-en-v1.5) emits 384-dim vectors, so a fixed dim=1024 would
    # reject every insert into a freshly created collection.
    if not utility.has_collection(collection_name):
        dim = len(chunks[0]["embedding"])
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
            FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=512),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim)
        ]
        schema = CollectionSchema(fields, description="Document embeddings")
        Collection(name=collection_name, schema=schema)

    collection = Collection(collection_name)

    # Column-oriented insert data, ordered to match the schema above
    # (auto_id primary key is omitted).
    data = [
        [chunk["text"] for chunk in chunks],
        [source_name for _ in chunks],
        [chunk["embedding"] for chunk in chunks]
    ]

    collection.insert(data)
    collection.flush()

    # Create the vector index on first use.
    if not collection.has_index():
        collection.create_index(
            field_name="embedding",
            index_params={
                "metric_type": "COSINE",
                "index_type": "HNSW",
                "params": {"M": 16, "efConstruction": 256}
            }
        )

    return len(chunks)


@dsl.pipeline(
    name="document-ingestion-pipeline",
    description="Ingest documents into Milvus: Extract -> Chunk -> Embed -> Store"
)
def document_ingestion_pipeline(
    source_url: str,
    collection_name: str = "knowledge_base",
    source_name: str = "",
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """
    Document Ingestion Pipeline

    Args:
        source_url: URL to the document (PDF, DOCX, HTML, or plain text)
        collection_name: Milvus collection to store embeddings
        source_name: Human-readable name for the source
        chunk_size: Token count per chunk
        chunk_overlap: Overlap between chunks
    """
    # Step 1: Extract text from document
    extract_task = extract_text(source_url=source_url)
    extract_task.set_caching_options(enable_caching=True)

    # Step 2: Chunk the text
    chunk_task = chunk_text(
        text=extract_task.output,
        chunk_size=chunk_size,
        overlap=chunk_overlap
    )
    chunk_task.set_caching_options(enable_caching=True)

    # Step 3: Generate embeddings
    embed_task = generate_embeddings_batch(chunks=chunk_task.output)
    embed_task.set_caching_options(enable_caching=True)

    # Step 4: Store in Milvus (fall back to the URL as source label)
    store_task = upsert_to_milvus(
        chunks=embed_task.output,
        collection_name=collection_name,
        source_name=source_name if source_name else source_url
    )


@dsl.pipeline(
    name="batch-document-ingestion",
    description="Ingest multiple documents in parallel"
)
def batch_document_ingestion_pipeline(
    source_urls: List[str],
    collection_name: str = "knowledge_base"
):
    """
    Batch Document Ingestion

    Args:
        source_urls: List of document URLs to ingest
        collection_name: Target Milvus collection
    """
    # Fan out: one full ingestion sub-pipeline per URL.
    with dsl.ParallelFor(source_urls) as url:
        document_ingestion_pipeline(
            source_url=url,
            collection_name=collection_name
        )


if __name__ == "__main__":
    # Primary pipeline - filename must match Python file for sync
    compiler.Compiler().compile(
        document_ingestion_pipeline,
        "document_ingestion_pipeline.yaml"
    )
    print("Compiled: document_ingestion_pipeline.yaml")