Files
argo/document-ingestion.yaml
Billy D. 7104698eee feat: Add ML training and batch inference workflows
- batch-inference: LLM inference with optional RAG
- qlora-training: QLoRA adapter fine-tuning from Milvus
- hybrid-ml-training: Multi-GPU distributed training
- coqui-voice-training: XTTS voice cloning
- document-ingestion: Ingest documents to Milvus
- eventsource-kfp: Argo Events / Kubeflow integration
- kfp-integration: Bridge between Argo and Kubeflow
2026-02-01 20:39:42 -05:00

370 lines
12 KiB
YAML

# Document Ingestion Workflow
# Ingests documents from a source URL into a Milvus vector database
# Triggered via NATS: ai.pipeline.trigger with pipeline="document-ingestion"
---
# WorkflowTemplate: fetch -> chunk -> embed -> store.
# Each step hands its result to the next as an Argo artifact directory
# (/tmp/documents -> /tmp/chunks -> /tmp/embeddings).
#
# Workflow parameters are passed to the Python scripts through environment
# variables instead of being templated into the script source, so a value
# containing quotes or newlines cannot break (or inject code into) a script.
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: document-ingestion
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: document-ingestion
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: ingest-documents
  serviceAccountName: argo-workflow
  arguments:
    parameters:
      # No default value: must be supplied when the workflow is submitted.
      # Only s3:// and http(s):// are handled by the fetch step.
      - name: source-url
        description: "URL to fetch documents from (s3://bucket/prefix or http(s)://)"
      - name: collection-name
        value: "knowledge_base"
        description: "Milvus collection name"
      # Must be > 0; measured in characters (Python str slicing).
      - name: chunk-size
        value: "512"
        description: "Text chunk size in characters"
      # Must satisfy 0 <= chunk-overlap < chunk-size.
      - name: chunk-overlap
        value: "50"
        description: "Overlap between chunks"
  templates:
    # Entrypoint: strictly linear DAG, each task consumes the previous
    # task's output artifact.
    - name: ingest-documents
      dag:
        tasks:
          - name: fetch-documents
            template: fetch-docs
            arguments:
              parameters:
                - name: source-url
                  value: "{{workflow.parameters.source-url}}"
          - name: chunk-documents
            template: chunk-docs
            dependencies: [fetch-documents]
            arguments:
              parameters:
                - name: chunk-size
                  value: "{{workflow.parameters.chunk-size}}"
                - name: chunk-overlap
                  value: "{{workflow.parameters.chunk-overlap}}"
              artifacts:
                - name: documents
                  from: "{{tasks.fetch-documents.outputs.artifacts.documents}}"
          - name: generate-embeddings
            template: embed-docs
            dependencies: [chunk-documents]
            arguments:
              artifacts:
                - name: chunks
                  from: "{{tasks.chunk-documents.outputs.artifacts.chunks}}"
          - name: store-in-milvus
            template: store-docs
            dependencies: [generate-embeddings]
            arguments:
              parameters:
                - name: collection-name
                  value: "{{workflow.parameters.collection-name}}"
              artifacts:
                - name: embeddings
                  from: "{{tasks.generate-embeddings.outputs.artifacts.embeddings}}"

    # Downloads the source documents into /tmp/documents and writes a
    # manifest.json listing the downloaded file paths.
    - name: fetch-docs
      inputs:
        parameters:
          - name: source-url
      outputs:
        artifacts:
          - name: documents
            path: /tmp/documents
      container:
        image: python:3.13-slim
        command: [python]
        args:
          - -c
          - |
            import json
            import os
            import sys
            import urllib.parse
            import urllib.request
            from pathlib import Path

            source_url = os.environ["SOURCE_URL"]
            output_dir = Path("/tmp/documents")
            output_dir.mkdir(parents=True, exist_ok=True)
            print(f"Fetching documents from: {source_url}")

            # Handle different source types
            if source_url.startswith("s3://"):
                import subprocess
                subprocess.run(["pip", "install", "boto3", "-q"], check=True)
                import boto3

                s3 = boto3.client("s3")
                # partition() tolerates a bare "s3://bucket" (empty prefix),
                # where split("/", 1) would fail to unpack.
                bucket, _, prefix = source_url[5:].partition("/")
                # A single list_objects_v2 call returns at most 1000 keys;
                # paginate so larger prefixes are fetched completely.
                paginator = s3.get_paginator("list_objects_v2")
                for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                    for obj in page.get("Contents", []):
                        key = obj["Key"]
                        # Keys ending in "/" are folder placeholders, not documents.
                        if key.endswith("/"):
                            continue
                        # NOTE(review): keys are flattened to their base name, so
                        # two keys with the same base name overwrite each other.
                        local_path = output_dir / Path(key).name
                        s3.download_file(bucket, key, str(local_path))
                        print(f"Downloaded: {key}")
            elif source_url.startswith(("http://", "https://")):
                # Single file download. Derive the name from the URL path only,
                # so a query string does not end up in the file name.
                url_path = urllib.parse.urlparse(source_url).path
                filename = Path(url_path).name or "document.txt"
                local_path = output_dir / filename
                urllib.request.urlretrieve(source_url, local_path)
                print(f"Downloaded: {filename}")
            else:
                print(f"Unsupported URL scheme: {source_url}")
                sys.exit(1)

            # List downloaded files (sorted for a deterministic manifest order).
            # Collected before manifest.json is written, so the manifest does
            # not list itself.
            files = sorted(output_dir.glob("*"))
            print(f"Downloaded {len(files)} files")

            # Create manifest
            manifest = {"files": [str(f) for f in files]}
            with open(output_dir / "manifest.json", "w") as f:
                json.dump(manifest, f)
        env:
          - name: SOURCE_URL
            value: "{{inputs.parameters.source-url}}"
        resources:
          requests:
            memory: 256Mi
            cpu: 100m

    # Splits every file listed in manifest.json into fixed-size character
    # windows and writes them to /tmp/chunks/chunks.json.
    - name: chunk-docs
      inputs:
        parameters:
          - name: chunk-size
          - name: chunk-overlap
        artifacts:
          - name: documents
            path: /tmp/documents
      outputs:
        artifacts:
          - name: chunks
            path: /tmp/chunks
      container:
        image: python:3.13-slim
        command: [python]
        args:
          - -c
          - |
            import json
            import os
            import sys
            from pathlib import Path

            chunk_size = int(os.environ["CHUNK_SIZE"])
            chunk_overlap = int(os.environ["CHUNK_OVERLAP"])
            # The window advances by (chunk_size - chunk_overlap) each step; if
            # that is not positive the loop below would never terminate.
            if chunk_size <= 0 or not 0 <= chunk_overlap < chunk_size:
                print(
                    f"Invalid chunking parameters: chunk-size={chunk_size}, "
                    f"chunk-overlap={chunk_overlap} "
                    "(need chunk-size > 0 and 0 <= chunk-overlap < chunk-size)"
                )
                sys.exit(1)
            step = chunk_size - chunk_overlap

            input_dir = Path("/tmp/documents")
            output_dir = Path("/tmp/chunks")
            output_dir.mkdir(parents=True, exist_ok=True)

            # Load manifest
            with open(input_dir / "manifest.json") as f:
                manifest = json.load(f)

            all_chunks = []
            for filepath in manifest["files"]:
                filepath = Path(filepath)
                if not filepath.exists():
                    print(f"Skipping missing file: {filepath}")
                    continue
                print(f"Processing: {filepath.name}")

                # Read file content; unreadable (e.g. non-UTF-8) files are
                # skipped on purpose so one bad file does not fail the run.
                try:
                    with open(filepath, "r", encoding="utf-8") as f:
                        content = f.read()
                except Exception as e:
                    print(f"Error reading {filepath}: {e}")
                    continue

                # Simple sliding-window chunking
                chunks = []
                for start in range(0, len(content), step):
                    end = start + chunk_size
                    chunk = content[start:end]
                    if chunk.strip():
                        chunks.append({
                            "text": chunk,
                            "source": filepath.name,
                            "chunk_index": len(chunks)
                        })
                    # This window already reached the end of the text; another
                    # step would only re-emit the overlap as a duplicate chunk.
                    if end >= len(content):
                        break
                all_chunks.extend(chunks)
                print(f" Created {len(chunks)} chunks")

            # Save chunks
            with open(output_dir / "chunks.json", "w") as f:
                json.dump({"chunks": all_chunks}, f)
            print(f"Total chunks: {len(all_chunks)}")
        env:
          - name: CHUNK_SIZE
            value: "{{inputs.parameters.chunk-size}}"
          - name: CHUNK_OVERLAP
            value: "{{inputs.parameters.chunk-overlap}}"
        resources:
          requests:
            memory: 512Mi
            cpu: 100m

    # Sends chunk texts to the embeddings service in batches and writes
    # /tmp/embeddings/embeddings.json (chunk metadata + embedding vector).
    - name: embed-docs
      inputs:
        artifacts:
          - name: chunks
            path: /tmp/chunks
      outputs:
        artifacts:
          - name: embeddings
            path: /tmp/embeddings
      container:
        image: python:3.13-slim
        command: [python]
        args:
          - -c
          - |
            import subprocess
            subprocess.run(["pip", "install", "httpx", "-q"], check=True)

            import json
            import httpx
            from pathlib import Path

            EMBEDDINGS_URL = "http://embeddings-predictor.ai-ml.svc.cluster.local"
            BATCH_SIZE = 32

            input_dir = Path("/tmp/chunks")
            output_dir = Path("/tmp/embeddings")
            output_dir.mkdir(parents=True, exist_ok=True)

            # Load chunks
            with open(input_dir / "chunks.json") as f:
                data = json.load(f)
            chunks = data["chunks"]
            print(f"Generating embeddings for {len(chunks)} chunks")

            # Generate embeddings in batches
            all_embeddings = []
            with httpx.Client(timeout=120.0) as client:
                for i in range(0, len(chunks), BATCH_SIZE):
                    batch = chunks[i:i+BATCH_SIZE]
                    texts = [c["text"] for c in batch]
                    response = client.post(
                        f"{EMBEDDINGS_URL}/embeddings",
                        json={"input": texts, "model": "bge"}
                    )
                    # Fail the step on an HTTP error instead of silently
                    # producing fewer embeddings than chunks.
                    response.raise_for_status()
                    vectors = response.json().get("data", [])
                    if len(vectors) != len(batch):
                        raise RuntimeError(
                            f"Embedding service returned {len(vectors)} "
                            f"embeddings for {len(batch)} texts"
                        )
                    # Results are paired with inputs by position.
                    for chunk, emb_data in zip(batch, vectors):
                        all_embeddings.append({
                            "text": chunk["text"],
                            "source": chunk["source"],
                            "chunk_index": chunk["chunk_index"],
                            "embedding": emb_data["embedding"]
                        })
                    print(f" Processed batch {i//BATCH_SIZE + 1}/{(len(chunks)-1)//BATCH_SIZE + 1}")

            # Save embeddings
            with open(output_dir / "embeddings.json", "w") as f:
                json.dump({"embeddings": all_embeddings}, f)
            print(f"Generated {len(all_embeddings)} embeddings")
        envFrom:
          - configMapRef:
              name: ai-services-config
        resources:
          requests:
            memory: 1Gi
            cpu: 200m

    # Inserts the embeddings into the target Milvus collection, creating the
    # collection (with an HNSW/COSINE index) when it does not exist yet.
    - name: store-docs
      inputs:
        parameters:
          - name: collection-name
        artifacts:
          - name: embeddings
            path: /tmp/embeddings
      container:
        image: python:3.13-slim
        command: [python]
        args:
          - -c
          - |
            import subprocess
            subprocess.run(["pip", "install", "pymilvus", "-q"], check=True)

            import json
            import os
            from pathlib import Path
            from pymilvus import (
                connections, Collection, FieldSchema, CollectionSchema, DataType, utility
            )

            MILVUS_HOST = "milvus.ai-ml.svc.cluster.local"
            MILVUS_PORT = 19530
            COLLECTION_NAME = os.environ["COLLECTION_NAME"]
            EMBEDDING_DIM = 1024  # BGE-large dimension

            input_dir = Path("/tmp/embeddings")

            # Load embeddings
            with open(input_dir / "embeddings.json") as f:
                data = json.load(f)
            embeddings = data["embeddings"]
            print(f"Storing {len(embeddings)} embeddings in Milvus")

            # Connect to Milvus
            connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
            print("Connected to Milvus")

            # Create collection if not exists
            if not utility.has_collection(COLLECTION_NAME):
                fields = [
                    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
                    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=1024),
                    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
                ]
                schema = CollectionSchema(fields, description="Knowledge base documents")
                collection = Collection(COLLECTION_NAME, schema)

                # Create HNSW index
                index_params = {
                    "metric_type": "COSINE",
                    "index_type": "HNSW",
                    "params": {"M": 16, "efConstruction": 256}
                }
                collection.create_index("embedding", index_params)
                print(f"Created collection: {COLLECTION_NAME}")
            else:
                # NOTE(review): an existing collection is assumed to have the
                # text/source/embedding layout used below; confirm for
                # collections not created by this workflow.
                collection = Collection(COLLECTION_NAME)
                print(f"Using existing collection: {COLLECTION_NAME}")

            # Insert data in batches. Column order matches the non-auto-id
            # fields of the schema above: text, source, embedding.
            BATCH_SIZE = 100
            for i in range(0, len(embeddings), BATCH_SIZE):
                batch = embeddings[i:i+BATCH_SIZE]
                data = [
                    [e["text"] for e in batch],
                    [e["source"] for e in batch],
                    [e["embedding"] for e in batch]
                ]
                collection.insert(data)
                print(f" Inserted batch {i//BATCH_SIZE + 1}/{(len(embeddings)-1)//BATCH_SIZE + 1}")

            # Flush to ensure data is persisted
            collection.flush()
            print(f"Successfully stored {len(embeddings)} documents")
            connections.disconnect("default")
        env:
          - name: COLLECTION_NAME
            value: "{{inputs.parameters.collection-name}}"
        envFrom:
          - configMapRef:
              name: ai-services-config
        resources:
          requests:
            memory: 512Mi
            cpu: 100m