feat: Add ML training and batch inference workflows
- batch-inference: LLM inference with optional RAG
- qlora-training: QLoRA adapter fine-tuning from Milvus
- hybrid-ml-training: Multi-GPU distributed training
- coqui-voice-training: XTTS voice cloning
- document-ingestion: Ingest documents to Milvus
- eventsource-kfp: Argo Events / Kubeflow integration
- kfp-integration: Bridge between Argo and Kubeflow
This commit is contained in:
369
document-ingestion.yaml
Normal file
369
document-ingestion.yaml
Normal file
@@ -0,0 +1,369 @@
|
||||
# Document Ingestion Workflow
|
||||
# Ingests documents from a source URL into Milvus vector database
|
||||
# Triggered via NATS: ai.pipeline.trigger with pipeline="document-ingestion"
|
||||
---
|
||||
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: document-ingestion
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: document-ingestion
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: ingest-documents
  serviceAccountName: argo-workflow

  # Workflow-level inputs. source-url has no default value; the other
  # parameters fall back to the values given here.
  arguments:
    parameters:
      - name: source-url
        # The fetch step only handles s3:// and http(s):// sources and exits
        # with an error for anything else, so local paths are not advertised.
        description: "URL to fetch documents from (s3://bucket/prefix or a single http(s):// file)"
      - name: collection-name
        value: "knowledge_base"
        description: "Milvus collection name"
      - name: chunk-size
        value: "512"
        description: "Text chunk size in characters"
      - name: chunk-overlap
        value: "50"
        # The chunking step slices by character and advances by
        # chunk-size minus chunk-overlap, so the overlap has to be smaller.
        description: "Overlap between consecutive chunks in characters (must be smaller than chunk-size)"

  templates:
|
||||
# Entry-point DAG: four tasks chained one after another through
# `dependencies`; each task consumes the artifact produced by the previous one.
- name: ingest-documents
  dag:
    tasks:
      # 1. Fetch the documents named by the workflow's source-url parameter.
      - name: fetch-documents
        template: fetch-docs
        arguments:
          parameters:
            - {name: source-url, value: "{{workflow.parameters.source-url}}"}

      # 2. Split the fetched documents into chunks.
      - name: chunk-documents
        template: chunk-docs
        dependencies:
          - fetch-documents
        arguments:
          parameters:
            - {name: chunk-size, value: "{{workflow.parameters.chunk-size}}"}
            - {name: chunk-overlap, value: "{{workflow.parameters.chunk-overlap}}"}
          artifacts:
            - {name: documents, from: "{{tasks.fetch-documents.outputs.artifacts.documents}}"}

      # 3. Generate an embedding for every chunk.
      - name: generate-embeddings
        template: embed-docs
        dependencies:
          - chunk-documents
        arguments:
          artifacts:
            - {name: chunks, from: "{{tasks.chunk-documents.outputs.artifacts.chunks}}"}

      # 4. Insert the embeddings into the target Milvus collection.
      - name: store-in-milvus
        template: store-docs
        dependencies:
          - generate-embeddings
        arguments:
          parameters:
            - {name: collection-name, value: "{{workflow.parameters.collection-name}}"}
          artifacts:
            - {name: embeddings, from: "{{tasks.generate-embeddings.outputs.artifacts.embeddings}}"}
|
||||
|
||||
# fetch-docs: runs an inline Python script that downloads the source
# documents and exports the /tmp/documents directory as the `documents`
# artifact.
- name: fetch-docs
  inputs:
    parameters:
      - {name: source-url}
  outputs:
    artifacts:
      - {name: documents, path: /tmp/documents}
  container:
    image: python:3.13-slim
    command:
      - python
    args:
      - "-c"
      # The literal block that follows is the script passed to `python -c`.
      - |
|
||||
"""Fetch source documents into /tmp/documents and write a manifest.

Supported sources are ``s3://bucket[/prefix]`` (every object under the
prefix) and a single ``http://`` / ``https://`` file. Any other source makes
the step exit with status 1.
"""
import json
import sys
import urllib.request
from pathlib import Path, PurePosixPath
from urllib.parse import urlparse

OUTPUT_DIR = Path("/tmp/documents")
DEFAULT_FILENAME = "document.txt"


def split_s3_url(url: str) -> tuple[str, str]:
    """Split ``s3://bucket/prefix`` into ``(bucket, prefix)``.

    The prefix is empty when the URL names only a bucket.

    Raises:
        ValueError: if the URL contains no bucket name.
    """
    bucket, _, prefix = url.removeprefix("s3://").partition("/")
    if not bucket:
        raise ValueError(f"S3 URL has no bucket name: {url}")
    return bucket, prefix


def filename_from_url(url: str) -> str:
    """Return the last path segment of *url*, ignoring query and fragment.

    Falls back to ``document.txt`` when the path has no final segment.
    """
    return urlparse(url).path.rsplit("/", 1)[-1] or DEFAULT_FILENAME


def fetch_from_s3(url: str, output_dir: Path) -> None:
    """Download every object under the URL's prefix into *output_dir*."""
    # boto3 is installed on demand, only when an S3 source is requested.
    import subprocess

    subprocess.run(["pip", "install", "boto3", "-q"], check=True)
    import boto3

    bucket, prefix = split_s3_url(url)
    s3 = boto3.client("s3")
    # One list_objects_v2 call returns at most 1000 keys; paginate so that
    # larger prefixes are not silently truncated.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if key.endswith("/"):
                # "Folder" placeholder object, not a document.
                continue
            # Files are stored flat: keys sharing a basename overwrite
            # each other.
            local_path = output_dir / PurePosixPath(key).name
            s3.download_file(bucket, key, str(local_path))
            print(f"Downloaded: {key}")


def fetch_from_http(url: str, output_dir: Path) -> None:
    """Download the single file at *url* into *output_dir*."""
    filename = filename_from_url(url)
    urllib.request.urlretrieve(url, output_dir / filename)
    print(f"Downloaded: {filename}")


def main() -> None:
    """Download the documents, then record them in manifest.json."""
    source_url = "{{inputs.parameters.source-url}}"
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Fetching documents from: {source_url}")

    if source_url.startswith("s3://"):
        fetch_from_s3(source_url, OUTPUT_DIR)
    elif source_url.startswith(("http://", "https://")):
        fetch_from_http(source_url, OUTPUT_DIR)
    else:
        print(f"Unsupported URL scheme: {source_url}")
        sys.exit(1)

    # The listing is taken before the manifest is written, so the manifest
    # never lists itself.
    files = sorted(p for p in OUTPUT_DIR.iterdir() if p.is_file())
    print(f"Downloaded {len(files)} files")

    manifest = {"files": [str(p) for p in files]}
    with open(OUTPUT_DIR / "manifest.json", "w") as f:
        json.dump(manifest, f)


# `python -c` runs with __name__ == "__main__", so the step still executes.
if __name__ == "__main__":
    main()
|
||||
# Resource requests for the fetch-docs container (no limits are declared).
resources:
  requests: {memory: 256Mi, cpu: 100m}
|
||||
|
||||
# chunk-docs: receives the `documents` artifact at /tmp/documents, runs an
# inline Python script over it and exports /tmp/chunks as the `chunks`
# artifact.
- name: chunk-docs
  inputs:
    parameters:
      - {name: chunk-size}
      - {name: chunk-overlap}
    artifacts:
      - {name: documents, path: /tmp/documents}
  outputs:
    artifacts:
      - {name: chunks, path: /tmp/chunks}
  container:
    image: python:3.13-slim
    command:
      - python
    args:
      - "-c"
      # The literal block that follows is the script passed to `python -c`.
      - |
|
||||
"""Split the fetched documents into overlapping character chunks.

Reads /tmp/documents/manifest.json, chunks every readable UTF-8 file it
lists and writes all chunks to /tmp/chunks/chunks.json.
"""
import json
import sys
from pathlib import Path

INPUT_DIR = Path("/tmp/documents")
OUTPUT_DIR = Path("/tmp/chunks")


def validate_chunk_params(chunk_size: int, chunk_overlap: int) -> None:
    """Reject parameter combinations that cannot make forward progress.

    Raises:
        ValueError: if chunk_size is not positive, or chunk_overlap is
            negative or not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk-size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk-overlap must be >= 0 and smaller than chunk-size, "
            f"got {chunk_overlap} with chunk-size {chunk_size}"
        )


def chunk_text(content: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    """Return the non-blank chunks of *content*, in order.

    Each chunk holds at most chunk_size characters and starts
    chunk_size - chunk_overlap characters after the previous one.

    Raises:
        ValueError: for invalid sizes (see validate_chunk_params).
    """
    validate_chunk_params(chunk_size, chunk_overlap)
    # The step is always >= 1 after validation, so the loop terminates.
    step = chunk_size - chunk_overlap
    pieces = []
    for start in range(0, len(content), step):
        end = start + chunk_size
        piece = content[start:end]
        if piece.strip():
            pieces.append(piece)
        if end >= len(content):
            # This chunk already reaches the end of the text; another pass
            # would only re-emit the overlap region.
            break
    return pieces


def main() -> None:
    """Chunk every file listed in the manifest and save the result."""
    try:
        chunk_size = int("{{inputs.parameters.chunk-size}}")
        chunk_overlap = int("{{inputs.parameters.chunk-overlap}}")
        validate_chunk_params(chunk_size, chunk_overlap)
    except ValueError as err:
        print(f"Invalid chunking parameters: {err}")
        sys.exit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load manifest
    with open(INPUT_DIR / "manifest.json") as f:
        manifest = json.load(f)

    all_chunks = []

    for entry in manifest["files"]:
        filepath = Path(entry)
        if not filepath.exists():
            print(f"Skipping missing file: {filepath}")
            continue

        print(f"Processing: {filepath.name}")

        # Best effort: unreadable or non-UTF-8 files are reported and skipped.
        try:
            content = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as err:
            print(f"Error reading {filepath}: {err}")
            continue

        # chunk_index counts only the chunks that were kept for this file.
        chunks = [
            {"text": piece, "source": filepath.name, "chunk_index": index}
            for index, piece in enumerate(
                chunk_text(content, chunk_size, chunk_overlap)
            )
        ]
        all_chunks.extend(chunks)
        print(f" Created {len(chunks)} chunks")

    # Save chunks
    with open(OUTPUT_DIR / "chunks.json", "w") as f:
        json.dump({"chunks": all_chunks}, f)

    print(f"Total chunks: {len(all_chunks)}")


# `python -c` runs with __name__ == "__main__", so the step still executes.
if __name__ == "__main__":
    main()
|
||||
# Resource requests for the chunk-docs container (no limits are declared).
resources:
  requests: {memory: 512Mi, cpu: 100m}
|
||||
|
||||
# embed-docs: receives the `chunks` artifact at /tmp/chunks, runs an inline
# Python script over it and exports /tmp/embeddings as the `embeddings`
# artifact.
- name: embed-docs
  inputs:
    artifacts:
      - {name: chunks, path: /tmp/chunks}
  outputs:
    artifacts:
      - {name: embeddings, path: /tmp/embeddings}
  container:
    image: python:3.13-slim
    command:
      - python
    args:
      - "-c"
      # The literal block that follows is the script passed to `python -c`.
      - |
|
||||
"""Generate an embedding for every chunk via the embeddings service.

Reads /tmp/chunks/chunks.json, posts the chunk texts to the service in
batches and writes the chunks together with their vectors to
/tmp/embeddings/embeddings.json.
"""
import json
from pathlib import Path

EMBEDDINGS_URL = "http://embeddings-predictor.ai-ml.svc.cluster.local"
BATCH_SIZE = 32

INPUT_DIR = Path("/tmp/chunks")
OUTPUT_DIR = Path("/tmp/embeddings")


def attach_embeddings(batch: list[dict], data: list[dict]) -> list[dict]:
    """Pair each chunk in *batch* with the embedding at the same position.

    Raises:
        ValueError: if the service returned a different number of
            embeddings than chunks were sent; pairing by position would
            otherwise drop or misalign data without any error.
    """
    if len(data) != len(batch):
        raise ValueError(
            f"Embeddings service returned {len(data)} embeddings "
            f"for {len(batch)} inputs"
        )
    # assumes the service returns embeddings in input order - TODO confirm
    return [
        {
            "text": chunk["text"],
            "source": chunk["source"],
            "chunk_index": chunk["chunk_index"],
            "embedding": item["embedding"],
        }
        for chunk, item in zip(batch, data)
    ]


def main() -> None:
    """Embed all chunks batch by batch and save the result."""
    # httpx is installed at start-up, before it is imported.
    import subprocess

    subprocess.run(["pip", "install", "httpx", "-q"], check=True)
    import httpx

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load chunks
    with open(INPUT_DIR / "chunks.json") as f:
        chunks = json.load(f)["chunks"]

    print(f"Generating embeddings for {len(chunks)} chunks")

    total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
    all_embeddings = []
    with httpx.Client(timeout=120.0) as client:
        for batch_no, start in enumerate(
            range(0, len(chunks), BATCH_SIZE), start=1
        ):
            batch = chunks[start:start + BATCH_SIZE]
            response = client.post(
                f"{EMBEDDINGS_URL}/embeddings",
                json={"input": [c["text"] for c in batch], "model": "bge"},
            )
            # Fail the step on an HTTP error instead of treating the error
            # body as an empty result.
            response.raise_for_status()
            all_embeddings.extend(
                attach_embeddings(batch, response.json().get("data", []))
            )
            print(f" Processed batch {batch_no}/{total_batches}")

    # Save embeddings
    with open(OUTPUT_DIR / "embeddings.json", "w") as f:
        json.dump({"embeddings": all_embeddings}, f)

    print(f"Generated {len(all_embeddings)} embeddings")


# `python -c` runs with __name__ == "__main__", so the step still executes.
if __name__ == "__main__":
    main()
|
||||
# Environment is populated from the ai-services-config ConfigMap.
# NOTE(review): the embed-docs script shown in this file uses a hard-coded
# service URL and reads no environment variables - confirm this is intended.
envFrom:
  - configMapRef: {name: ai-services-config}
# Resource requests for the embed-docs container (no limits are declared).
resources:
  requests: {memory: 1Gi, cpu: 200m}
|
||||
|
||||
# store-docs: receives the `embeddings` artifact at /tmp/embeddings and runs
# an inline Python script that inserts it into the Milvus collection named by
# collection-name. The template declares no outputs.
- name: store-docs
  inputs:
    parameters:
      - {name: collection-name}
    artifacts:
      - {name: embeddings, path: /tmp/embeddings}
  container:
    image: python:3.13-slim
    command:
      - python
    args:
      - "-c"
      # The literal block that follows is the script passed to `python -c`.
      - |
|
||||
"""Insert the embeddings artifact into a Milvus collection."""
import subprocess

# pymilvus is installed at start-up, before it is imported below.
subprocess.run(["pip", "install", "pymilvus", "-q"], check=True)

import json
from pathlib import Path

from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    utility,
)

MILVUS_HOST = "milvus.ai-ml.svc.cluster.local"
MILVUS_PORT = 19530
COLLECTION_NAME = "{{inputs.parameters.collection-name}}"
EMBEDDING_DIM = 1024  # BGE-large dimension
BATCH_SIZE = 100
# Column order for inserts; the auto-generated primary key `id` is omitted.
INSERT_COLUMNS = ("text", "source", "embedding")

embeddings_file = Path("/tmp/embeddings") / "embeddings.json"

# Read the records produced by the embedding step.
records = json.loads(embeddings_file.read_text())["embeddings"]

print(f"Storing {len(records)} embeddings in Milvus")

# Open the default Milvus connection.
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print("Connected to Milvus")

if utility.has_collection(COLLECTION_NAME):
    # Reuse the collection as it is; its schema is not inspected here.
    collection = Collection(COLLECTION_NAME)
    print(f"Using existing collection: {COLLECTION_NAME}")
else:
    # First run for this name: create the collection and its vector index.
    schema = CollectionSchema(
        [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
            FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=1024),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
        ],
        description="Knowledge base documents",
    )
    collection = Collection(COLLECTION_NAME, schema)

    # HNSW index over the vector field, using cosine similarity.
    hnsw_index = {
        "metric_type": "COSINE",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 256},
    }
    collection.create_index("embedding", hnsw_index)
    print(f"Created collection: {COLLECTION_NAME}")

# Insert column-wise, BATCH_SIZE records at a time.
total_batches = (len(records) - 1) // BATCH_SIZE + 1
for batch_no, offset in enumerate(range(0, len(records), BATCH_SIZE), start=1):
    rows = records[offset:offset + BATCH_SIZE]
    columns = [[row[field] for row in rows] for field in INSERT_COLUMNS]
    collection.insert(columns)
    print(f" Inserted batch {batch_no}/{total_batches}")

# Flush to ensure data is persisted
collection.flush()
print(f"Successfully stored {len(records)} documents")

connections.disconnect("default")
|
||||
# Environment is populated from the ai-services-config ConfigMap.
# NOTE(review): the store-docs script shown in this file uses a hard-coded
# Milvus host and port and reads no environment variables - confirm this is
# intended.
envFrom:
  - configMapRef: {name: ai-services-config}
# Resource requests for the store-docs container (no limits are declared).
resources:
  requests: {memory: 512Mi, cpu: 100m}
|
||||
Reference in New Issue
Block a user