- batch-inference: LLM inference with optional RAG
- qlora-training: QLoRA adapter fine-tuning from Milvus
- hybrid-ml-training: Multi-GPU distributed training
- coqui-voice-training: XTTS voice cloning
- document-ingestion: Ingest documents to Milvus
- eventsource-kfp: Argo Events / Kubeflow integration
- kfp-integration: Bridge between Argo and Kubeflow
370 lines
12 KiB
YAML
370 lines
12 KiB
YAML
# Document Ingestion Workflow
#
# Pulls documents from a source URL and loads them into the Milvus vector
# database.
# Trigger: NATS subject ai.pipeline.trigger with pipeline="document-ingestion"
---
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: document-ingestion
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: document-ingestion
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: ingest-documents
  serviceAccountName: argo-workflow

  # Workflow-level inputs. Every value is a string.
  arguments:
    parameters:
      # No default is declared for this one, so the caller must supply it.
      - name: source-url
        description: "URL to fetch documents from (S3, HTTP, or local path)"
      # Target collection.
      - name: collection-name
        description: "Milvus collection name"
        value: "knowledge_base"
      # Window length used when splitting text.
      - name: chunk-size
        description: "Text chunk size in characters"
        value: "512"
      # Amount of text shared by consecutive chunks.
      - name: chunk-overlap
        description: "Overlap between chunks"
        value: "50"

  templates:
|
|
# Entry-point DAG. The four tasks form a linear chain:
# fetch-documents -> chunk-documents -> generate-embeddings -> store-in-milvus,
# each one consuming the artifact produced by its predecessor.
- name: ingest-documents
  dag:
    tasks:
      # Stage 1: download the raw documents.
      - name: fetch-documents
        template: fetch-docs
        arguments:
          parameters:
            - name: source-url
              value: "{{workflow.parameters.source-url}}"

      # Stage 2: split the downloaded documents into chunks.
      - name: chunk-documents
        template: chunk-docs
        dependencies:
          - fetch-documents
        arguments:
          artifacts:
            - name: documents
              from: "{{tasks.fetch-documents.outputs.artifacts.documents}}"
          parameters:
            - name: chunk-size
              value: "{{workflow.parameters.chunk-size}}"
            - name: chunk-overlap
              value: "{{workflow.parameters.chunk-overlap}}"

      # Stage 3: turn every chunk into an embedding vector.
      - name: generate-embeddings
        template: embed-docs
        dependencies:
          - chunk-documents
        arguments:
          artifacts:
            - name: chunks
              from: "{{tasks.chunk-documents.outputs.artifacts.chunks}}"

      # Stage 4: write the embeddings into the target collection.
      - name: store-in-milvus
        template: store-docs
        dependencies:
          - generate-embeddings
        arguments:
          artifacts:
            - name: embeddings
              from: "{{tasks.generate-embeddings.outputs.artifacts.embeddings}}"
          parameters:
            - name: collection-name
              value: "{{workflow.parameters.collection-name}}"
|
|
|
|
# fetch-docs: downloads the source documents into /tmp/documents, writes a
# manifest.json listing the downloaded files, and exports the directory as the
# "documents" artifact. Supports s3:// prefixes and single http(s) files; any
# other scheme fails the step with exit code 1.
- name: fetch-docs
  inputs:
    parameters:
      - name: source-url
  outputs:
    artifacts:
      - name: documents
        path: /tmp/documents
  container:
    image: python:3.13-slim
    command: [python]
    # The parameter reaches the script through the environment instead of
    # being substituted into the Python source, so a URL containing quotes or
    # backslashes cannot break or alter the script.
    env:
      - name: SOURCE_URL
        value: "{{inputs.parameters.source-url}}"
    args:
      - -c
      - |
        import json
        import os
        import sys
        import urllib.request
        from pathlib import Path
        from urllib.parse import urlsplit

        source_url = os.environ["SOURCE_URL"]
        output_dir = Path("/tmp/documents")
        output_dir.mkdir(parents=True, exist_ok=True)

        print(f"Fetching documents from: {source_url}")

        # Handle different source types
        if source_url.startswith("s3://"):
            import subprocess
            subprocess.run(["pip", "install", "boto3", "-q"], check=True)
            import boto3

            # partition() also accepts "s3://bucket" with no key prefix,
            # where a two-way split() unpacking would raise ValueError.
            bucket, _, prefix = source_url.removeprefix("s3://").partition("/")
            s3 = boto3.client("s3")

            # A single list_objects_v2 call returns at most 1000 keys; the
            # paginator walks every page of the listing.
            paginator = s3.get_paginator("list_objects_v2")
            for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                for obj in page.get("Contents", []):
                    key = obj["Key"]
                    if key.endswith("/"):
                        # A key ending in "/" has no file name of its own;
                        # Path(key).name would store it under the folder name.
                        continue
                    # Keys are flattened to their base name, so two keys with
                    # the same base name overwrite each other.
                    local_path = output_dir / Path(key).name
                    s3.download_file(bucket, key, str(local_path))
                    print(f"Downloaded: {key}")
        elif source_url.startswith(("http://", "https://")):
            # Single file download. The name comes from the URL path only,
            # so a query string or fragment never ends up in the file name.
            filename = urlsplit(source_url).path.rsplit("/", 1)[-1] or "document.txt"
            local_path = output_dir / filename
            urllib.request.urlretrieve(source_url, local_path)
            print(f"Downloaded: {filename}")
        else:
            print(f"Unsupported URL scheme: {source_url}")
            sys.exit(1)

        # List downloaded files (sorted so the manifest order is stable).
        # This runs before manifest.json exists, so the manifest never
        # lists itself.
        files = sorted(output_dir.glob("*"))
        print(f"Downloaded {len(files)} files")

        # Create manifest
        manifest = {"files": [str(f) for f in files]}
        with open(output_dir / "manifest.json", "w") as f:
            json.dump(manifest, f)
    resources:
      requests:
        memory: 256Mi
        cpu: 100m
|
|
|
|
# chunk-docs: reads the files listed in /tmp/documents/manifest.json, splits
# each one into overlapping fixed-size character windows, and writes all
# chunks to /tmp/chunks/chunks.json (exported as the "chunks" artifact).
- name: chunk-docs
  inputs:
    parameters:
      - name: chunk-size
      - name: chunk-overlap
    artifacts:
      - name: documents
        path: /tmp/documents
  outputs:
    artifacts:
      - name: chunks
        path: /tmp/chunks
  container:
    image: python:3.13-slim
    command: [python]
    # Parameters are passed through the environment rather than substituted
    # into the Python source, so their content cannot alter the script.
    env:
      - name: CHUNK_SIZE
        value: "{{inputs.parameters.chunk-size}}"
      - name: CHUNK_OVERLAP
        value: "{{inputs.parameters.chunk-overlap}}"
    args:
      - -c
      - |
        import json
        import os
        import sys
        from pathlib import Path

        chunk_size = int(os.environ["CHUNK_SIZE"])
        chunk_overlap = int(os.environ["CHUNK_OVERLAP"])

        # The window advances by chunk_size - chunk_overlap characters. An
        # overlap that is not smaller than the chunk size would never move
        # forward and the step would loop forever, so reject it up front.
        if chunk_size <= 0 or not 0 <= chunk_overlap < chunk_size:
            sys.exit(
                f"Invalid chunking parameters: chunk-size={chunk_size}, "
                f"chunk-overlap={chunk_overlap} "
                "(need chunk-size > 0 and 0 <= chunk-overlap < chunk-size)"
            )
        step = chunk_size - chunk_overlap

        input_dir = Path("/tmp/documents")
        output_dir = Path("/tmp/chunks")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Load manifest
        with open(input_dir / "manifest.json") as f:
            manifest = json.load(f)

        all_chunks = []

        for filepath in manifest["files"]:
            filepath = Path(filepath)
            if not filepath.exists():
                print(f"Skipping missing file: {filepath}")
                continue

            print(f"Processing: {filepath.name}")

            # Read file content. Unreadable or non-UTF-8 files are reported
            # and skipped instead of failing the whole step.
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {filepath}: {e}")
                continue

            # Simple chunking
            chunks = []
            for start in range(0, len(content), step):
                end = start + chunk_size
                chunk = content[start:end]
                if chunk.strip():
                    chunks.append({
                        "text": chunk,
                        "source": filepath.name,
                        "chunk_index": len(chunks)
                    })
                if end >= len(content):
                    # This window already reached the end of the file; one
                    # more window would only repeat the overlap tail.
                    break

            all_chunks.extend(chunks)
            print(f" Created {len(chunks)} chunks")

        # Save chunks
        with open(output_dir / "chunks.json", "w") as f:
            json.dump({"chunks": all_chunks}, f)

        print(f"Total chunks: {len(all_chunks)}")
    resources:
      requests:
        memory: 512Mi
        cpu: 100m
|
|
|
|
# embed-docs: loads /tmp/chunks/chunks.json, requests embeddings for the chunk
# texts from the embeddings service in batches, and writes the chunks together
# with their vectors to /tmp/embeddings/embeddings.json (exported as the
# "embeddings" artifact). Any failed or incomplete batch fails the step.
- name: embed-docs
  inputs:
    artifacts:
      - name: chunks
        path: /tmp/chunks
  outputs:
    artifacts:
      - name: embeddings
        path: /tmp/embeddings
  container:
    image: python:3.13-slim
    command: [python]
    args:
      - -c
      - |
        import subprocess
        subprocess.run(["pip", "install", "httpx", "-q"], check=True)

        import json
        import httpx
        from pathlib import Path

        EMBEDDINGS_URL = "http://embeddings-predictor.ai-ml.svc.cluster.local"
        BATCH_SIZE = 32

        input_dir = Path("/tmp/chunks")
        output_dir = Path("/tmp/embeddings")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Load chunks
        with open(input_dir / "chunks.json") as f:
            data = json.load(f)
        chunks = data["chunks"]

        print(f"Generating embeddings for {len(chunks)} chunks")

        # Generate embeddings in batches
        all_embeddings = []
        total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
        with httpx.Client(timeout=120.0) as client:
            for batch_no, i in enumerate(range(0, len(chunks), BATCH_SIZE), start=1):
                batch = chunks[i:i + BATCH_SIZE]
                texts = [c["text"] for c in batch]

                response = client.post(
                    f"{EMBEDDINGS_URL}/embeddings",
                    json={"input": texts, "model": "bge"}
                )
                # Without this an error response is parsed like a success
                # and the batch is silently dropped from the output.
                response.raise_for_status()
                items = response.json().get("data", [])

                # Results are matched to chunks by position, so a short or
                # long response would mis-pair or lose chunks.
                if len(items) != len(batch):
                    raise RuntimeError(
                        f"Embeddings service returned {len(items)} results "
                        f"for a batch of {len(batch)} chunks"
                    )

                for chunk, emb_data in zip(batch, items):
                    all_embeddings.append({
                        "text": chunk["text"],
                        "source": chunk["source"],
                        "chunk_index": chunk["chunk_index"],
                        "embedding": emb_data["embedding"]
                    })

                print(f" Processed batch {batch_no}/{total_batches}")

        # Save embeddings
        with open(output_dir / "embeddings.json", "w") as f:
            json.dump({"embeddings": all_embeddings}, f)

        print(f"Generated {len(all_embeddings)} embeddings")
    envFrom:
      - configMapRef:
          name: ai-services-config
    resources:
      requests:
        memory: 1Gi
        cpu: 200m
|
|
|
|
# store-docs: loads /tmp/embeddings/embeddings.json and inserts the text,
# source and embedding of every entry into the given Milvus collection,
# creating the collection (with an HNSW/COSINE index) when it does not exist.
# The chunk_index field of the input is not part of the collection schema and
# is therefore not stored.
- name: store-docs
  inputs:
    parameters:
      - name: collection-name
    artifacts:
      - name: embeddings
        path: /tmp/embeddings
  container:
    image: python:3.13-slim
    command: [python]
    # The collection name is passed through the environment rather than
    # substituted into the Python source, so its content cannot alter the
    # script.
    env:
      - name: COLLECTION_NAME
        value: "{{inputs.parameters.collection-name}}"
    args:
      - -c
      - |
        import subprocess
        subprocess.run(["pip", "install", "pymilvus", "-q"], check=True)

        import json
        import os
        import sys
        from pathlib import Path
        from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility

        MILVUS_HOST = "milvus.ai-ml.svc.cluster.local"
        MILVUS_PORT = 19530
        COLLECTION_NAME = os.environ["COLLECTION_NAME"]
        EMBEDDING_DIM = 1024  # BGE-large dimension
        BATCH_SIZE = 100

        input_dir = Path("/tmp/embeddings")

        # Load embeddings
        with open(input_dir / "embeddings.json") as f:
            data = json.load(f)
        embeddings = data["embeddings"]

        # Check every vector before touching Milvus: a wrong-sized vector
        # would otherwise only surface mid-insert, after earlier batches
        # have already been written.
        wrong_dim = [e for e in embeddings if len(e["embedding"]) != EMBEDDING_DIM]
        if wrong_dim:
            sys.exit(
                f"{len(wrong_dim)} embeddings do not have the expected "
                f"dimension {EMBEDDING_DIM} "
                f"(first has {len(wrong_dim[0]['embedding'])})"
            )

        print(f"Storing {len(embeddings)} embeddings in Milvus")

        # Connect to Milvus
        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
        print("Connected to Milvus")

        # try/finally so the connection is released on failure as well.
        try:
            # Create collection if not exists
            if not utility.has_collection(COLLECTION_NAME):
                fields = [
                    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
                    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
                    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=1024),
                    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
                ]
                schema = CollectionSchema(fields, description="Knowledge base documents")
                collection = Collection(COLLECTION_NAME, schema)

                # Create HNSW index
                index_params = {
                    "metric_type": "COSINE",
                    "index_type": "HNSW",
                    "params": {"M": 16, "efConstruction": 256}
                }
                collection.create_index("embedding", index_params)
                print(f"Created collection: {COLLECTION_NAME}")
            else:
                collection = Collection(COLLECTION_NAME)
                print(f"Using existing collection: {COLLECTION_NAME}")

            # Insert data in batches
            total_batches = (len(embeddings) + BATCH_SIZE - 1) // BATCH_SIZE
            for batch_no, i in enumerate(range(0, len(embeddings), BATCH_SIZE), start=1):
                batch = embeddings[i:i + BATCH_SIZE]

                # One list per field, in schema order; the auto-generated
                # "id" field is omitted.
                columns = [
                    [e["text"] for e in batch],
                    [e["source"] for e in batch],
                    [e["embedding"] for e in batch]
                ]

                collection.insert(columns)
                print(f" Inserted batch {batch_no}/{total_batches}")

            # Flush to ensure data is persisted
            collection.flush()
            print(f"Successfully stored {len(embeddings)} documents")
        finally:
            connections.disconnect("default")
    envFrom:
      - configMapRef:
          name: ai-services-config
    resources:
      requests:
        memory: 512Mi
        cpu: 100m
|