feat: Add Kubeflow Pipeline definitions
- voice_pipeline: STT → RAG → LLM → TTS - document_ingestion_pipeline: Extract → Chunk → Embed → Milvus - document_ingestion_mlflow_pipeline: With MLflow tracking - evaluation_pipeline: Model benchmarking - kfp-sync-job: K8s job to sync pipelines
This commit is contained in:
257
document_ingestion_pipeline.py
Normal file
257
document_ingestion_pipeline.py
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Document Ingestion Pipeline - Kubeflow Pipelines SDK
|
||||
|
||||
Ingests documents into Milvus vector database with embeddings.
|
||||
Can be triggered from Argo Workflows via the kfp-trigger template.
|
||||
|
||||
Usage:
|
||||
pip install kfp==2.12.1
|
||||
python document_ingestion_pipeline.py
|
||||
# Upload document_ingestion.yaml to Kubeflow Pipelines UI
|
||||
"""
|
||||
|
||||
from kfp import dsl
|
||||
from kfp import compiler
|
||||
from typing import List
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx", "beautifulsoup4", "pypdf2", "docx2txt"]
)
def extract_text(
    source_url: str,
    source_type: str = "auto"
) -> str:
    """Extract plain text from a remote document.

    Args:
        source_url: HTTP(S) URL of the document to fetch.
        source_type: One of "pdf", "docx", "html", "text", or "auto"
            (detect from the URL's file extension).

    Returns:
        The extracted text content.

    Raises:
        httpx.HTTPStatusError: If the download returns a non-2xx status.
    """
    import httpx
    import os
    import tempfile

    with httpx.Client(timeout=120.0) as client:
        response = client.get(source_url)
        # Fail fast on HTTP errors instead of parsing an error page as content.
        response.raise_for_status()
        content = response.content

    # Detect type from the URL extension if not given explicitly.
    if source_type == "auto":
        # Lowercase and strip any query string so "X.PDF?v=1" still matches.
        path = source_url.split("?", 1)[0].lower()
        if path.endswith(".pdf"):
            source_type = "pdf"
        elif path.endswith(".docx"):
            source_type = "docx"
        elif path.endswith((".html", ".htm")):
            source_type = "html"
        else:
            source_type = "text"

    if source_type == "pdf":
        import io
        from PyPDF2 import PdfReader
        reader = PdfReader(io.BytesIO(content))
        text = "\n\n".join(page.extract_text() for page in reader.pages)
    elif source_type == "docx":
        import docx2txt
        # docx2txt needs a real file path: write, close, process, then
        # remove the file so the container doesn't accumulate temp files.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            f.write(content)
            tmp_path = f.name
        try:
            text = docx2txt.process(tmp_path)
        finally:
            os.unlink(tmp_path)
    elif source_type == "html":
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n")
    else:
        # Plain text (or unknown): best-effort UTF-8 decode.
        text = content.decode("utf-8", errors="ignore")

    return text
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["tiktoken"]
)
def chunk_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> list:
    """Split text into overlapping token-based chunks.

    Args:
        text: Input text to split.
        chunk_size: Maximum tokens per chunk.
        overlap: Tokens shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        List of dicts with "text", "start_token", and "end_token" keys.

    Raises:
        ValueError: If overlap >= chunk_size.
    """
    import tiktoken

    if overlap >= chunk_size:
        # A non-positive step below would make the loop never terminate.
        raise ValueError("overlap must be smaller than chunk_size")

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append({
            # Note: decoded locally; avoid shadowing the component name.
            "text": enc.decode(tokens[start:end]),
            "start_token": start,
            "end_token": end
        })
        if end == len(tokens):
            # Stop once the tail is consumed; otherwise the overlap step
            # can emit a final chunk fully contained in the previous one.
            break
        start += chunk_size - overlap

    return chunks
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["httpx"]
)
def generate_embeddings_batch(
    chunks: list,
    embeddings_url: str = "http://embeddings-predictor.ai-ml.svc.cluster.local",
    batch_size: int = 32
) -> list:
    """Generate embeddings for all chunks via the embeddings service.

    Args:
        chunks: Chunk dicts, each with a "text" key.
        embeddings_url: Base URL of the OpenAI-compatible embeddings API.
        batch_size: Number of texts sent per request.

    Returns:
        Copies of the input chunks, each with an added "embedding" key,
        in the original order.

    Raises:
        httpx.HTTPStatusError: If the embeddings service returns an error.
    """
    import httpx

    if not chunks:
        # Nothing to embed; skip the HTTP round-trips entirely.
        return []

    embedded_chunks = []
    texts = [c["text"] for c in chunks]

    with httpx.Client(timeout=300.0) as client:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            response = client.post(
                f"{embeddings_url}/embeddings",
                # NOTE(review): bge-small-en-v1.5 typically emits 384-dim
                # vectors, but the Milvus schema in this file declares
                # dim=1024 — confirm the deployed model and collection agree.
                json={"input": batch, "model": "bge-small-en-v1.5"}
            )
            # Surface service errors instead of a confusing KeyError on "data".
            response.raise_for_status()
            result = response.json()

            for j, embedding_data in enumerate(result["data"]):
                # Copy so the input chunk dicts are not mutated in place.
                chunk = chunks[i + j].copy()
                chunk["embedding"] = embedding_data["embedding"]
                embedded_chunks.append(chunk)

    return embedded_chunks
|
||||
|
||||
|
||||
@dsl.component(
    base_image="python:3.13-slim",
    packages_to_install=["pymilvus"]
)
def upsert_to_milvus(
    chunks: list,
    collection_name: str,
    source_name: str,
    milvus_host: str = "milvus.ai-ml.svc.cluster.local"
) -> int:
    """Insert embedded chunks into a Milvus collection.

    Creates the collection (and an HNSW index) on first use.

    Args:
        chunks: Chunk dicts containing "text" and "embedding" keys.
        collection_name: Target Milvus collection.
        source_name: Source label stored alongside each chunk.
        milvus_host: Milvus service hostname (port 19530 assumed).

    Returns:
        Number of chunks inserted (0 when chunks is empty).
    """
    from pymilvus import (
        Collection,
        CollectionSchema,
        DataType,
        FieldSchema,
        connections,
        utility,
    )

    if not chunks:
        # Nothing to store; avoids an invalid empty insert batch.
        return 0

    connections.connect(host=milvus_host, port=19530)
    try:
        # Create the collection on first use.
        if not utility.has_collection(collection_name):
            fields = [
                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
                FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=512),
                # NOTE(review): dim=1024 must match the embedding model's
                # output size (bge-small-en-v1.5 is typically 384) — confirm.
                FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024)
            ]
            schema = CollectionSchema(fields, description="Document embeddings")
            Collection(name=collection_name, schema=schema)

        collection = Collection(collection_name)

        # Column-ordered data matching the schema ("id" is auto-generated).
        data = [
            [chunk["text"] for chunk in chunks],
            [source_name for _ in chunks],
            [chunk["embedding"] for chunk in chunks]
        ]

        collection.insert(data)
        collection.flush()

        # Build the ANN index once, after the first insert.
        if not collection.has_index():
            collection.create_index(
                field_name="embedding",
                index_params={
                    "metric_type": "COSINE",
                    "index_type": "HNSW",
                    "params": {"M": 16, "efConstruction": 256}
                }
            )
    finally:
        # Don't leak the connection if the worker process is reused.
        connections.disconnect("default")

    return len(chunks)
|
||||
|
||||
|
||||
@dsl.pipeline(
    name="document-ingestion-pipeline",
    description="Ingest documents into Milvus: Extract -> Chunk -> Embed -> Store"
)
def document_ingestion_pipeline(
    source_url: str,
    collection_name: str = "knowledge_base",
    source_name: str = "",
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Four-stage ingestion pipeline: extract -> chunk -> embed -> store.

    Args:
        source_url: URL to the document (PDF, DOCX, HTML, or plain text)
        collection_name: Milvus collection to store embeddings
        source_name: Human-readable name for the source; falls back to
            the URL when empty
        chunk_size: Token count per chunk
        chunk_overlap: Overlap between chunks
    """
    # Stage 1: pull the raw text out of the document (cached across reruns).
    extraction = extract_text(source_url=source_url)
    extraction.set_caching_options(enable_caching=True)

    # Stage 2: split the text into overlapping token windows.
    chunking = chunk_text(
        text=extraction.output,
        chunk_size=chunk_size,
        overlap=chunk_overlap
    )
    chunking.set_caching_options(enable_caching=True)

    # Stage 3: embed every chunk via the embeddings service.
    embedding = generate_embeddings_batch(chunks=chunking.output)
    embedding.set_caching_options(enable_caching=True)

    # Stage 4: persist vectors into Milvus, labelling them with the
    # human-readable name or, when that is empty, the source URL.
    upsert_to_milvus(
        chunks=embedding.output,
        collection_name=collection_name,
        source_name=source_name if source_name else source_url
    )
|
||||
|
||||
|
||||
@dsl.pipeline(
    name="batch-document-ingestion",
    description="Ingest multiple documents in parallel"
)
def batch_document_ingestion_pipeline(
    source_urls: List[str],
    collection_name: str = "knowledge_base"
):
    """Fan the single-document pipeline out over a list of URLs.

    Args:
        source_urls: Document URLs to ingest — one parallel branch each
        collection_name: Milvus collection shared by every branch
    """
    # Each URL runs the full ingestion pipeline as an independent branch.
    with dsl.ParallelFor(source_urls) as doc_url:
        document_ingestion_pipeline(
            source_url=doc_url,
            collection_name=collection_name
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Compile only the primary pipeline; the output filename must match
    # this Python file's name so the kfp-sync job can pair them up.
    output_path = "document_ingestion_pipeline.yaml"
    compiler.Compiler().compile(document_ingestion_pipeline, output_path)
    print(f"Compiled: {output_path}")
|
||||
Reference in New Issue
Block a user