Files
argo/coqui-voice-training.yaml
Billy D. 7104698eee feat: Add ML training and batch inference workflows
- batch-inference: LLM inference with optional RAG
- qlora-training: QLoRA adapter fine-tuning from Milvus
- hybrid-ml-training: Multi-GPU distributed training
- coqui-voice-training: XTTS voice cloning
- document-ingestion: Ingest documents to Milvus
- eventsource-kfp: Argo Events / Kubeflow integration
- kfp-integration: Bridge between Argo and Kubeflow
2026-02-01 20:39:42 -05:00

970 lines
34 KiB
YAML

# Coqui TTS voice-training workflow.
# Produces a custom voice model with Coqui TTS from a set of audio samples.
# Started via NATS: ai.pipeline.trigger with pipeline="coqui-voice-training".
---
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: coqui-voice-training
  namespace: ai-ml
  labels:
    app.kubernetes.io/name: coqui-voice-training
    app.kubernetes.io/part-of: llm-workflows
spec:
  entrypoint: train-voice
  serviceAccountName: argo-workflow

  # Workflow-level inputs. Every value is a string; templates convert as needed.
  arguments:
    parameters:
      # Required: no default value is provided for the audio location.
      - name: audio-source
        description: 'URL to audio files (S3 bucket, HTTP, or NFS path with .wav/.mp3 files)'
      - name: transcripts-source
        description: 'URL to transcripts file (CSV with audio_file,transcript columns) - leave empty to auto-transcribe'
        value: ''
      - name: voice-name
        description: 'Name for the trained voice model'
        value: 'custom-voice'
      - name: base-model
        description: 'Base TTS model to fine-tune from'
        value: 'tts_models/en/ljspeech/vits'
      - name: language
        description: 'Language code (e.g., en, de, fr, es)'
        value: 'en'
      - name: num-epochs
        description: 'Number of training epochs'
        value: '100'
      - name: batch-size
        description: 'Training batch size'
        value: '16'
      - name: learning-rate
        description: 'Learning rate for training'
        value: '0.0001'
      - name: sample-rate
        description: 'Target sample rate for audio (Hz)'
        value: '22050'
      - name: output-path
        description: 'Path to store the trained model (S3 or NFS)'
        value: '/models/tts/custom'

  # Per-run scratch volume, claimed from the nfs-slow storage class.
  volumeClaimTemplates:
    - metadata:
        name: training-workspace
      spec:
        accessModes:
          - ReadWriteMany
        storageClassName: nfs-slow
        resources:
          requests:
            storage: 50Gi

  templates:
# Entry-point DAG:
#   fetch-audio -> preprocess-audio -+-> generate-transcripts (only when no
#   fetch-transcripts ---------------+   transcripts-source was supplied)
#   -> prepare-dataset -> train-model -> export-model
- name: train-voice
  dag:
    tasks:
      - name: fetch-audio
        template: fetch-audio-files
        arguments:
          parameters:
            - name: audio-source
              value: "{{workflow.parameters.audio-source}}"

      - name: fetch-transcripts
        template: fetch-transcript-file
        arguments:
          parameters:
            - name: transcripts-source
              value: "{{workflow.parameters.transcripts-source}}"

      - name: preprocess-audio
        template: preprocess
        dependencies: [fetch-audio]
        arguments:
          parameters:
            - name: sample-rate
              value: "{{workflow.parameters.sample-rate}}"
          artifacts:
            - name: raw-audio
              from: "{{tasks.fetch-audio.outputs.artifacts.audio-files}}"

      - name: generate-transcripts
        template: transcribe-audio
        dependencies: [preprocess-audio, fetch-transcripts]
        # The substituted value must be wrapped in quotes: with an empty
        # parameter the unquoted form expands to ` == ''`, which is not a
        # valid expression and fails the task instead of running it.
        when: "'{{workflow.parameters.transcripts-source}}' == ''"
        arguments:
          parameters:
            - name: language
              value: "{{workflow.parameters.language}}"
          artifacts:
            - name: audio-files
              from: "{{tasks.preprocess-audio.outputs.artifacts.processed-audio}}"

      - name: prepare-dataset
        template: prepare-coqui-dataset
        # generate-transcripts may be Skipped; `dependencies` still lets this
        # task run in that case.
        dependencies: [preprocess-audio, generate-transcripts, fetch-transcripts]
        arguments:
          parameters:
            - name: voice-name
              value: "{{workflow.parameters.voice-name}}"
            - name: language
              value: "{{workflow.parameters.language}}"
          artifacts:
            - name: audio-files
              from: "{{tasks.preprocess-audio.outputs.artifacts.processed-audio}}"
            # Pick the user-supplied transcripts when a source was given,
            # otherwise the auto-generated ones. Hyphenated task names need
            # bracket notation inside an expression (a dot-path would be
            # parsed as a subtraction), and the parameter is named
            # `transcripts-source`, not `transcriptsSource`.
            - name: transcripts
              fromExpression: >-
                '{{workflow.parameters.transcripts-source}}' != ''
                ? tasks['fetch-transcripts'].outputs.artifacts.transcripts
                : tasks['generate-transcripts'].outputs.artifacts.transcripts
              optional: true

      - name: train-model
        template: train-tts
        dependencies: [prepare-dataset]
        arguments:
          parameters:
            - name: voice-name
              value: "{{workflow.parameters.voice-name}}"
            - name: base-model
              value: "{{workflow.parameters.base-model}}"
            - name: language
              value: "{{workflow.parameters.language}}"
            - name: num-epochs
              value: "{{workflow.parameters.num-epochs}}"
            - name: batch-size
              value: "{{workflow.parameters.batch-size}}"
            - name: learning-rate
              value: "{{workflow.parameters.learning-rate}}"
          artifacts:
            - name: dataset
              from: "{{tasks.prepare-dataset.outputs.artifacts.dataset}}"

      - name: export-model
        template: export-trained-model
        dependencies: [train-model]
        arguments:
          parameters:
            - name: voice-name
              value: "{{workflow.parameters.voice-name}}"
            - name: output-path
              value: "{{workflow.parameters.output-path}}"
          artifacts:
            - name: trained-model
              from: "{{tasks.train-model.outputs.artifacts.model}}"
# Template: Fetch audio files from source
# Copies audio from an s3:// prefix, an http(s) URL (single audio file or a
# .zip archive) or an absolute local/NFS path into /tmp/audio, which is
# exported as the `audio-files` artifact. Exits non-zero when the source is
# unsupported, missing, or yields no audio files.
- name: fetch-audio-files
  inputs:
    parameters:
      - name: audio-source
  outputs:
    artifacts:
      - name: audio-files
        path: /tmp/audio
  container:
    image: python:3.13-slim
    command: [python]
    # The source is handed over as an environment variable rather than being
    # pasted into the script text, so quotes or backslashes in the value
    # cannot break (or inject into) the Python source.
    env:
      - name: AUDIO_SOURCE
        value: "{{inputs.parameters.audio-source}}"
    args:
      - -c
      - |
        import os
        import shutil
        import subprocess
        import sys
        import urllib.parse
        import urllib.request
        from pathlib import Path

        AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}

        source_url = os.environ.get("AUDIO_SOURCE", "").strip()
        output_dir = Path("/tmp/audio")
        output_dir.mkdir(parents=True, exist_ok=True)


        def is_audio(name):
            """True when the file name ends in one of the accepted audio extensions."""
            return Path(name).suffix.lower() in AUDIO_EXTENSIONS


        print(f"Fetching audio from: {source_url}")

        if source_url.startswith("s3://"):
            subprocess.run(["pip", "install", "boto3", "-q"], check=True)
            import boto3

            s3 = boto3.client("s3")
            # partition() tolerates "s3://bucket" with no prefix.
            bucket, _, prefix = source_url[5:].partition("/")
            # A single list_objects_v2 call returns at most 1000 keys;
            # the paginator walks every page.
            paginator = s3.get_paginator("list_objects_v2")
            for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                for obj in page.get("Contents", []):
                    key = obj["Key"]
                    if is_audio(key):
                        local_path = output_dir / Path(key).name
                        s3.download_file(bucket, key, str(local_path))
                        print(f"Downloaded: {key}")

        elif source_url.startswith(("http://", "https://")):
            # Derive the file name from the URL path only, so query strings
            # and percent-encoding do not end up in the local name.
            url_path = urllib.parse.unquote(urllib.parse.urlparse(source_url).path)
            filename = Path(url_path).name
            is_zip = filename.lower().endswith(".zip")
            if not (is_zip or is_audio(filename)):
                print(f"URL doesn't appear to be an audio file: {source_url}")
                sys.exit(1)

            local_path = output_dir / filename
            urllib.request.urlretrieve(source_url, local_path)
            print(f"Downloaded: {filename}")

            if is_zip:
                shutil.unpack_archive(local_path, output_dir)
                os.remove(local_path)
                print("Extracted zip archive")
                # Archives often wrap their content in a folder; lift nested
                # audio files to the top level of /tmp/audio.
                nested = [
                    p for p in output_dir.rglob("*")
                    if p.is_file() and p.parent != output_dir and is_audio(p.name)
                ]
                for item in nested:
                    target = output_dir / item.name
                    if target.exists():
                        print(f"Warning: duplicate file name in archive, skipping: {item}")
                        continue
                    shutil.move(str(item), str(target))

        elif source_url.startswith("/"):
            # Local/NFS path
            src_path = Path(source_url)
            if src_path.is_dir():
                for f in src_path.iterdir():
                    if f.is_file() and is_audio(f.name):
                        shutil.copy(f, output_dir / f.name)
                        print(f"Copied: {f.name}")
            elif src_path.is_file():
                shutil.copy(src_path, output_dir / src_path.name)
            else:
                print(f"Path not found: {source_url}")
                sys.exit(1)

        else:
            print(f"Unsupported source: {source_url}")
            sys.exit(1)

        # Count only real audio files; stray directories or README files
        # must not make an empty fetch look successful.
        audio_files = [p for p in output_dir.iterdir() if p.is_file() and is_audio(p.name)]
        print(f"Total audio files: {len(audio_files)}")
        if len(audio_files) == 0:
            print("Error: No audio files found!")
            sys.exit(1)
    resources:
      requests:
        memory: 512Mi
        cpu: 200m
# Template: Fetch transcripts file
# Places the transcripts file in /tmp/transcripts (exported as the optional
# `transcripts` artifact). With an empty source it writes a placeholder file
# and exits successfully.
- name: fetch-transcript-file
  inputs:
    parameters:
      - name: transcripts-source
  outputs:
    artifacts:
      - name: transcripts
        path: /tmp/transcripts
        optional: true
  container:
    image: python:3.13-slim
    command: [python]
    args:
      - -c
      - |
        import shutil
        import subprocess
        import urllib.request
        from pathlib import Path

        source_url = "{{inputs.parameters.transcripts-source}}"
        dest_dir = Path("/tmp/transcripts")
        dest_dir.mkdir(parents=True, exist_ok=True)


        def pull_from_s3(url):
            # boto3 is installed on demand; the base image does not ship it.
            subprocess.run(["pip", "install", "boto3", "-q"], check=True)
            import boto3
            client = boto3.client("s3")
            bucket, key = url[5:].split("/", 1)
            client.download_file(bucket, key, str(dest_dir / Path(key).name))
            print(f"Downloaded: {key}")


        def pull_from_http(url):
            # Fall back to a fixed name when the URL ends with a slash.
            filename = url.split("/")[-1] or "transcripts.csv"
            urllib.request.urlretrieve(url, dest_dir / filename)
            print(f"Downloaded: {filename}")


        def pull_from_path(url):
            src_path = Path(url)
            if not src_path.is_file():
                print(f"File not found: {url}")
                raise SystemExit(1)
            shutil.copy(src_path, dest_dir / src_path.name)
            print(f"Copied: {src_path.name}")


        # Nothing to fetch: leave a marker file and finish successfully.
        if not source_url.strip():
            print("No transcripts source provided - will auto-transcribe")
            # Create empty placeholder
            (dest_dir / "placeholder.txt").write_text("auto-transcribe")
            raise SystemExit(0)

        print(f"Fetching transcripts from: {source_url}")

        # First matching prefix wins; order mirrors s3 -> http -> local path.
        handlers = (
            ("s3://", pull_from_s3),
            ("http", pull_from_http),
            ("/", pull_from_path),
        )
        for prefix, handler in handlers:
            if source_url.startswith(prefix):
                handler(source_url)
                break
        else:
            print(f"Unsupported source: {source_url}")
            raise SystemExit(1)
    resources:
      requests:
        memory: 256Mi
        cpu: 100m
# Template: Preprocess audio files
# Converts every supported file in /tmp/raw-audio to a mono, peak-normalized
# WAV at the requested sample rate and writes it to /tmp/processed-audio.
# Files that fail to convert are reported and skipped; the step fails only
# when nothing could be converted.
- name: preprocess
  inputs:
    parameters:
      - name: sample-rate
    artifacts:
      - name: raw-audio
        path: /tmp/raw-audio
  outputs:
    artifacts:
      - name: processed-audio
        path: /tmp/processed-audio
  container:
    image: python:3.13-slim
    command: [bash]
    args:
      - -c
      - |
        set -e
        # Install ffmpeg and dependencies
        apt-get update && apt-get install -y ffmpeg > /dev/null 2>&1
        # pydub imports the stdlib `audioop` module, which was removed in
        # Python 3.13; audioop-lts provides it again so `import pydub` works
        # on this image.
        pip install -q pydub audioop-lts
        python3 << 'EOF'
        import sys
        from pathlib import Path
        from pydub import AudioSegment

        SAMPLE_RATE = int("{{inputs.parameters.sample-rate}}")
        input_dir = Path("/tmp/raw-audio")
        output_dir = Path("/tmp/processed-audio")
        output_dir.mkdir(parents=True, exist_ok=True)
        audio_extensions = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}

        for audio_file in input_dir.iterdir():
            if audio_file.suffix.lower() not in audio_extensions:
                continue
            print(f"Processing: {audio_file.name}")
            try:
                # Load audio (decoding is delegated to ffmpeg by pydub)
                audio = AudioSegment.from_file(str(audio_file))
                # Convert to mono if stereo
                if audio.channels > 1:
                    audio = audio.set_channels(1)
                # Resample to target sample rate
                audio = audio.set_frame_rate(SAMPLE_RATE)
                # Normalize audio
                audio = audio.normalize()
                # Export as WAV
                output_file = output_dir / f"{audio_file.stem}.wav"
                audio.export(str(output_file), format="wav")
                print(f"  -> Saved: {output_file.name}")
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"  -> Error processing {audio_file.name}: {e}")
                continue

        processed_files = list(output_dir.glob("*.wav"))
        print(f"\nProcessed {len(processed_files)} audio files")
        if len(processed_files) == 0:
            print("Error: No files were successfully processed!")
            sys.exit(1)
        EOF
    resources:
      requests:
        memory: 2Gi
        cpu: "1"
# Template: Auto-transcribe audio using Coqui STT
# Runs Coqui STT over every *.wav in /tmp/audio and writes
# /tmp/transcripts/transcripts.csv with columns audio_file,transcript.
# A model is looked up under /models/stt/<language>, then in a few default
# locations, and finally downloaded (English model) when none is found.
- name: transcribe-audio
  inputs:
    parameters:
      - name: language
    artifacts:
      - name: audio-files
        path: /tmp/audio
  outputs:
    artifacts:
      - name: transcripts
        path: /tmp/transcripts
  container:
    image: ghcr.io/coqui-ai/stt:latest
    command: [bash]
    args:
      - -c
      - |
        set -e
        # Install additional dependencies
        pip install -q numpy scipy
        python3 << 'EOF'
        import csv
        import sys
        import urllib.request
        import wave
        from pathlib import Path

        import numpy as np
        from scipy import signal
        from stt import Model

        LANGUAGE = "{{inputs.parameters.language}}"
        TARGET_RATE = 16000  # Coqui STT expects 16 kHz input
        input_dir = Path("/tmp/audio")
        output_dir = Path("/tmp/transcripts")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Model paths - Coqui STT models are typically pre-installed in the container
        # or can be downloaded from https://coqui.ai/models
        MODEL_DIR = Path("/models/stt")

        # Try to find model files
        model_file = None
        scorer_file = None

        # Check for language-specific models
        lang_model_dir = MODEL_DIR / LANGUAGE
        if lang_model_dir.exists():
            for f in lang_model_dir.glob("*.tflite"):
                model_file = f
            for f in lang_model_dir.glob("*.scorer"):
                scorer_file = f

        if model_file is None and LANGUAGE != "en":
            # Make the language mismatch visible instead of silently
            # transcribing with an English model.
            print(f"Warning: no STT model found for language '{LANGUAGE}'; "
                  "falling back to the default English model")

        # Fallback to default English model location
        if model_file is None:
            default_paths = [
                MODEL_DIR / "model.tflite",
                Path("/usr/share/stt/model.tflite"),
                Path("/opt/stt/model.tflite"),
            ]
            for p in default_paths:
                if p.exists():
                    model_file = p
                    break

        if model_file is None:
            # Download model if not found
            print("Downloading Coqui STT model...")
            model_url = "https://github.com/coqui-ai/STT-models/releases/download/english/coqui-stt-1.0.0-lg-vocab.tflite"
            scorer_url = "https://github.com/coqui-ai/STT-models/releases/download/english/coqui-stt-1.0.0-lg-vocab.scorer"
            MODEL_DIR.mkdir(parents=True, exist_ok=True)
            model_file = MODEL_DIR / "model.tflite"
            scorer_file = MODEL_DIR / "model.scorer"
            urllib.request.urlretrieve(model_url, model_file)
            urllib.request.urlretrieve(scorer_url, scorer_file)
            print("Model downloaded successfully")

        print(f"Loading Coqui STT model: {model_file}")
        model = Model(str(model_file))
        if scorer_file and scorer_file.exists():
            print(f"Loading scorer: {scorer_file}")
            model.enableExternalScorer(str(scorer_file))

        transcripts = []
        for audio_file in sorted(input_dir.glob("*.wav")):
            print(f"Transcribing: {audio_file.name}")
            try:
                # Read WAV file
                with wave.open(str(audio_file), "rb") as w:
                    sample_rate = w.getframerate()
                    channels = w.getnchannels()
                    sample_width = w.getsampwidth()
                    audio_data = w.readframes(w.getnframes())

                # The buffer is decoded as int16 below; any other sample
                # width would be misread as noise.
                if sample_width != 2:
                    raise ValueError(f"expected 16-bit PCM, got {sample_width * 8}-bit")

                audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float64)

                # Interleaved multi-channel data: average down to mono.
                if channels > 1:
                    audio = audio.reshape(-1, channels).mean(axis=1)

                # Resample if needed (Coqui STT expects 16kHz)
                if sample_rate != TARGET_RATE:
                    audio = signal.resample(audio, int(len(audio) * TARGET_RATE / sample_rate))

                # Resampling can overshoot the int16 range; clip first so the
                # cast cannot wrap around.
                audio = np.clip(np.rint(audio), -32768, 32767).astype(np.int16)

                # Run inference
                text = model.stt(audio)
                transcripts.append({
                    "audio_file": audio_file.name,
                    "transcript": text
                })
                print(f"  -> {text[:100] if text else '(empty)'}...")
            except Exception as e:
                # Best effort per file: report and move on.
                print(f"  -> Error: {e}")
                continue

        # Write CSV
        csv_file = output_dir / "transcripts.csv"
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["audio_file", "transcript"])
            writer.writeheader()
            writer.writerows(transcripts)

        print(f"\nTranscribed {len(transcripts)} files")
        print(f"Saved to: {csv_file}")

        # A header-only CSV is useless downstream; fail here, where the cause
        # is visible in the log.
        if not transcripts:
            print("Error: No audio files were successfully transcribed!")
            sys.exit(1)
        EOF
    resources:
      requests:
        memory: 4Gi
        cpu: "2"
      limits:
        memory: 8Gi
        cpu: "4"
# Template: Prepare dataset in Coqui TTS format
# Pairs the WAV files in /tmp/audio with transcripts from /tmp/transcripts
# (a .csv, or a pipe-separated .txt) and writes an LJSpeech-style dataset to
# /tmp/dataset: wavs/<name>.wav, metadata.csv and dataset_config.json.
# Fails when no transcripts file is present or no audio file could be paired.
- name: prepare-coqui-dataset
  inputs:
    parameters:
      - name: voice-name
      - name: language
    artifacts:
      - name: audio-files
        path: /tmp/audio
      - name: transcripts
        path: /tmp/transcripts
        optional: true
  outputs:
    artifacts:
      - name: dataset
        path: /tmp/dataset
  container:
    image: python:3.13-slim
    command: [python]
    args:
      - -c
      - |
        import csv
        import json
        import shutil
        import sys
        from pathlib import Path

        VOICE_NAME = "{{inputs.parameters.voice-name}}"
        LANGUAGE = "{{inputs.parameters.language}}"
        audio_dir = Path("/tmp/audio")
        transcripts_dir = Path("/tmp/transcripts")
        output_dir = Path("/tmp/dataset")
        wavs_dir = output_dir / "wavs"
        wavs_dir.mkdir(parents=True, exist_ok=True)

        print(f"Preparing Coqui TTS dataset for voice: {VOICE_NAME}")

        # Find transcripts file
        transcripts_file = None
        for f in transcripts_dir.glob("*.csv"):
            transcripts_file = f
            break
        if transcripts_file is None:
            # Check for .txt files (simple format: filename|text)
            for f in transcripts_dir.glob("*.txt"):
                if f.name != "placeholder.txt":
                    transcripts_file = f
                    break
        if transcripts_file is None:
            print("Error: No transcripts file found!")
            sys.exit(1)
        print(f"Using transcripts: {transcripts_file}")


        def clean_text(text):
            """Collapse whitespace and drop '|' so the text fits one metadata field."""
            return " ".join(text.replace("|", " ").split())


        # Parse transcripts
        transcripts = {}
        if transcripts_file.suffix == ".csv":
            # utf-8-sig strips a BOM that would otherwise corrupt the first
            # column name and hide the audio_file column.
            with open(transcripts_file, "r", encoding="utf-8-sig", newline="") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    # Handle various column name conventions
                    audio = row.get("audio_file") or row.get("audio") or row.get("file") or row.get("wav")
                    text = row.get("transcript") or row.get("text") or row.get("sentence")
                    if audio and text:
                        transcripts[audio.strip()] = text.strip()
        else:
            # Simple pipe-separated format: filename|text (extra columns ignored)
            with open(transcripts_file, "r", encoding="utf-8-sig") as f:
                for line in f:
                    fields = line.strip().split("|")
                    if len(fields) >= 2 and fields[0] and fields[1]:
                        transcripts[fields[0].strip()] = fields[1].strip()
        print(f"Loaded {len(transcripts)} transcripts")

        # Secondary index by file stem: transcripts often name the original
        # file (e.g. clip.mp3 or wavs/clip.wav) while the audio here is
        # always <stem>.wav.
        by_stem = {}
        for key, text in transcripts.items():
            by_stem.setdefault(Path(key).stem, text)

        # Copy audio files and create metadata
        metadata_lines = []
        for audio_file in sorted(audio_dir.glob("*.wav")):
            # Exact matches first, then the stem-based fallback.
            text = transcripts.get(audio_file.name)
            if text is None:
                text = transcripts.get(audio_file.stem)
            if text is None:
                text = by_stem.get(audio_file.stem)
            text = clean_text(text) if text else None
            if not text:
                print(f"Warning: No transcript for {audio_file.name}, skipping")
                continue
            # Copy audio file
            shutil.copy(audio_file, wavs_dir / audio_file.name)
            # Add to metadata (LJSpeech format: filename|text|text)
            # Coqui uses: audio_file|text|text (normalized text optional)
            metadata_lines.append(f"{audio_file.stem}|{text}|{text}")

        # An empty dataset can only fail later, during training, with a far
        # less obvious error.
        if not metadata_lines:
            print("Error: No audio files could be matched to a transcript!")
            sys.exit(1)

        # Write metadata.csv
        metadata_file = output_dir / "metadata.csv"
        with open(metadata_file, "w", encoding="utf-8") as f:
            f.write("\n".join(metadata_lines))
        print(f"Created dataset with {len(metadata_lines)} samples")

        # Create dataset config
        config = {
            "name": VOICE_NAME,
            "language": LANGUAGE,
            "num_samples": len(metadata_lines),
            "format": "ljspeech"
        }
        with open(output_dir / "dataset_config.json", "w") as f:
            json.dump(config, f, indent=2)
        print(f"Dataset ready at: {output_dir}")

        if len(metadata_lines) < 10:
            print("Warning: Very small dataset! Recommend at least 100+ samples for good results.")
    resources:
      requests:
        memory: 1Gi
        cpu: 500m
# Template: Train Coqui TTS model
# Fine-tunes (or trains from scratch when base-model is empty or "none") a
# VITS model on the LJSpeech-style dataset in /tmp/dataset. Everything the
# trainer writes under /tmp/output is exported as the `model` artifact.
- name: train-tts
  inputs:
    parameters:
      - name: voice-name
      - name: base-model
      - name: language
      - name: num-epochs
      - name: batch-size
      - name: learning-rate
    artifacts:
      - name: dataset
        path: /tmp/dataset
  outputs:
    artifacts:
      - name: model
        path: /tmp/output
  container:
    image: ghcr.io/coqui-ai/tts:latest
    command: [bash]
    # Parameters arrive as environment variables instead of being pasted into
    # the script, so quotes or `$` in a value cannot alter the bash or Python
    # source. The language is named TTS_LANGUAGE because LANGUAGE is already
    # a locale variable with its own meaning.
    env:
      - name: VOICE_NAME
        value: "{{inputs.parameters.voice-name}}"
      - name: BASE_MODEL
        value: "{{inputs.parameters.base-model}}"
      - name: TTS_LANGUAGE
        value: "{{inputs.parameters.language}}"
      - name: NUM_EPOCHS
        value: "{{inputs.parameters.num-epochs}}"
      - name: BATCH_SIZE
        value: "{{inputs.parameters.batch-size}}"
      - name: LEARNING_RATE
        value: "{{inputs.parameters.learning-rate}}"
    args:
      - -c
      - |
        set -e
        export DATASET_DIR="/tmp/dataset"
        export OUTPUT_DIR="/tmp/output"
        mkdir -p "$OUTPUT_DIR"

        echo "=== Coqui TTS Voice Training ==="
        echo "Voice Name: $VOICE_NAME"
        echo "Base Model: $BASE_MODEL"
        echo "Language: $TTS_LANGUAGE"
        echo "Epochs: $NUM_EPOCHS"
        echo "Batch Size: $BATCH_SIZE"
        echo "Learning Rate: $LEARNING_RATE"
        echo ""

        # Download base model if specified for fine-tuning
        RESTORE_PATH=""
        if [ -n "$BASE_MODEL" ] && [ "$BASE_MODEL" != "none" ]; then
          echo "Downloading base model for fine-tuning: $BASE_MODEL"
          # The command substitution captures ALL of stdout. The download
          # helper prints progress messages there, so they are redirected to
          # stderr and only the final checkpoint path is captured.
          RESTORE_PATH=$(python3 -c '
        import contextlib
        import os
        import sys
        from TTS.utils.manage import ModelManager

        with contextlib.redirect_stdout(sys.stderr):
            model_path, _config_path, _model_item = ModelManager().download_model(os.environ["BASE_MODEL"])
        print(model_path)
        ')
          echo "Base model path: $RESTORE_PATH"
        fi
        export RESTORE_PATH

        # Create and run training script following Coqui docs pattern.
        # The heredoc is quoted: the script reads its settings from the
        # environment, and the shell must not expand anything inside it.
        python3 << 'EOF'
        import os
        from pathlib import Path

        # Trainer: Where the magic happens
        from trainer import Trainer, TrainerArgs

        # Model configs
        from TTS.tts.configs.vits_config import VitsConfig
        from TTS.tts.configs.shared_configs import BaseDatasetConfig
        from TTS.tts.datasets import load_tts_samples
        from TTS.tts.models.vits import Vits
        from TTS.tts.utils.text.tokenizer import TTSTokenizer
        from TTS.utils.audio import AudioProcessor

        # Settings handed over by the surrounding shell script
        DATASET_DIR = Path(os.environ["DATASET_DIR"])
        OUTPUT_DIR = Path(os.environ["OUTPUT_DIR"])
        RESTORE_PATH = os.environ.get("RESTORE_PATH") or None
        VOICE_NAME = os.environ["VOICE_NAME"]
        LANGUAGE = os.environ["TTS_LANGUAGE"]
        BATCH_SIZE = int(os.environ["BATCH_SIZE"])
        NUM_EPOCHS = int(os.environ["NUM_EPOCHS"])
        LEARNING_RATE = float(os.environ["LEARNING_RATE"])

        print(f"Dataset: {DATASET_DIR}")
        print(f"Output: {OUTPUT_DIR}")
        print(f"Restore from: {RESTORE_PATH}")

        # Define dataset config (LJSpeech format)
        dataset_config = BaseDatasetConfig(
            formatter="ljspeech",
            meta_file_train="metadata.csv",
            path=str(DATASET_DIR),
            language=LANGUAGE,
        )

        # Initialize training configuration
        config = VitsConfig(
            run_name=VOICE_NAME,
            output_path=str(OUTPUT_DIR),
            datasets=[dataset_config],
            batch_size=BATCH_SIZE,
            eval_batch_size=max(1, BATCH_SIZE // 2),
            num_loader_workers=4,
            num_eval_loader_workers=2,
            run_eval=True,
            test_delay_epochs=5,
            epochs=NUM_EPOCHS,
            text_cleaner="phoneme_cleaners",
            use_phonemes=True,
            phoneme_language=LANGUAGE,
            phoneme_cache_path=str(OUTPUT_DIR / "phoneme_cache"),
            compute_input_seq_cache=True,
            print_step=25,
            print_eval=False,
            mixed_precision=True,
            save_step=500,
            save_n_checkpoints=3,
            save_best_after=1000,
            lr=LEARNING_RATE,
            # Audio settings for typical voice cloning.
            # The rate is fixed at 22050 Hz here; dataset audio at any other
            # rate relies on resample=True.
            # NOTE(review): passed as a plain dict - confirm the installed TTS
            # version accepts this in place of its audio config class.
            audio={
                "sample_rate": 22050,
                "resample": True,
                "do_trim_silence": True,
                "trim_db": 45,
            },
        )

        # Initialize the audio processor
        # Used for feature extraction and audio I/O
        ap = AudioProcessor.init_from_config(config)

        # Initialize the tokenizer
        # Converts text to sequences of token IDs
        tokenizer, config = TTSTokenizer.init_from_config(config)

        # Load data samples
        # Each sample is [text, audio_file_path, speaker_name]
        train_samples, eval_samples = load_tts_samples(
            dataset_config,
            eval_split=True,
            eval_split_max_size=config.eval_split_max_size,
            eval_split_size=config.eval_split_size,
        )
        print(f"Training samples: {len(train_samples)}")
        print(f"Eval samples: {len(eval_samples)}")

        # Initialize the model
        model = Vits(config, ap, tokenizer, speaker_manager=None)

        # Set up trainer arguments
        trainer_args = TrainerArgs(
            restore_path=RESTORE_PATH,
            skip_train_epoch=False,
        )

        # Initialize the trainer
        trainer = Trainer(
            trainer_args,
            config,
            output_path=str(OUTPUT_DIR),
            model=model,
            train_samples=train_samples,
            eval_samples=eval_samples,
        )

        # Start training
        print("\n" + "=" * 50)
        print("Starting training...")
        print("=" * 50 + "\n")
        trainer.fit()
        print("\n" + "=" * 50)
        print("Training complete!")
        print("=" * 50)
        EOF

        echo ""
        echo "Training complete!"
        echo "Model saved to: $OUTPUT_DIR"
        ls -la "$OUTPUT_DIR"
    resources:
      requests:
        memory: 16Gi
        cpu: "4"
        nvidia.com/gpu: "1"
      limits:
        memory: 32Gi
        cpu: "8"
        nvidia.com/gpu: "1"
    # The workspace volume is mounted, but the script above writes only to
    # /tmp/output and never to /tmp/workspace.
    volumeMounts:
      - name: training-workspace
        mountPath: /tmp/workspace
# Template: Export trained model
# Packages one checkpoint (plus config.json when present and a
# model_info.json) from /tmp/trained-model into /tmp/exported/<voice-name>
# and <voice-name>.tar.gz, then copies the archive to an s3:// or absolute
# local/NFS output path.
- name: export-trained-model
  inputs:
    parameters:
      - name: voice-name
      - name: output-path
    artifacts:
      - name: trained-model
        path: /tmp/trained-model
  outputs:
    artifacts:
      - name: exported-model
        path: /tmp/exported
  container:
    image: python:3.13-slim
    command: [bash]
    args:
      - -c
      - |
        set -e
        pip install -q boto3
        python3 << 'EOF'
        import json
        import shutil
        import sys
        from datetime import datetime
        from pathlib import Path

        VOICE_NAME = "{{inputs.parameters.voice-name}}"
        OUTPUT_PATH = "{{inputs.parameters.output-path}}"
        model_dir = Path("/tmp/trained-model")
        export_dir = Path("/tmp/exported")
        export_dir.mkdir(parents=True, exist_ok=True)

        print(f"Exporting trained model: {VOICE_NAME}")
        print(f"Target path: {OUTPUT_PATH}")


        def newest(pattern):
            """Return the most recently modified file matching pattern, or None."""
            # Recursive search: the trainer stores checkpoints in a per-run
            # subfolder of its output path, not at the top level.
            matches = sorted(
                (p for p in model_dir.rglob(pattern) if p.is_file()),
                key=lambda p: p.stat().st_mtime,
                reverse=True,
            )
            return matches[0] if matches else None


        # Find best checkpoint: a best_model file wins over periodic
        # checkpoints, which win over any other .pth file.
        best_checkpoint = (
            newest("best_model*.pth")
            or newest("checkpoint_*.pth")
            or newest("*.pth")
        )
        if best_checkpoint is None:
            print("Error: No model checkpoints found!")
            sys.exit(1)
        print(f"Using checkpoint: {best_checkpoint.name}")

        # Create export package
        package_dir = export_dir / VOICE_NAME
        package_dir.mkdir(parents=True, exist_ok=True)

        # Copy model files
        shutil.copy(best_checkpoint, package_dir / "model.pth")

        # Copy config if exists: look next to the checkpoint first, then at
        # the top of the artifact.
        for config_file in (best_checkpoint.parent / "config.json", model_dir / "config.json"):
            if config_file.exists():
                shutil.copy(config_file, package_dir / "config.json")
                break
        else:
            print("Warning: config.json not found; exporting checkpoint without it")

        # Create model info
        model_info = {
            "name": VOICE_NAME,
            "created_at": datetime.now().isoformat(),
            "checkpoint": best_checkpoint.name,
            "type": "coqui-tts"
        }
        with open(package_dir / "model_info.json", "w") as f:
            json.dump(model_info, f, indent=2)

        # Create tarball
        archive_name = f"{VOICE_NAME}.tar.gz"
        shutil.make_archive(
            str(export_dir / VOICE_NAME),
            "gztar",
            export_dir,
            VOICE_NAME
        )
        print(f"Created archive: {archive_name}")

        # Upload to destination
        if OUTPUT_PATH.startswith("s3://"):
            import boto3
            s3 = boto3.client("s3")
            # partition() tolerates "s3://bucket" with no key prefix.
            bucket, _, prefix = OUTPUT_PATH[5:].partition("/")
            prefix = prefix.strip("/")
            key = f"{prefix}/{archive_name}" if prefix else archive_name
            s3.upload_file(str(export_dir / archive_name), bucket, key)
            print(f"Uploaded to: s3://{bucket}/{key}")
        elif OUTPUT_PATH.startswith("/"):
            # Local/NFS path
            dest_path = Path(OUTPUT_PATH)
            dest_path.mkdir(parents=True, exist_ok=True)
            shutil.copy(export_dir / archive_name, dest_path / archive_name)
            # Also copy uncompressed for easy access
            shutil.copytree(package_dir, dest_path / VOICE_NAME, dirs_exist_ok=True)
            print(f"Saved to: {dest_path / archive_name}")
        else:
            # The package is still available as the exported-model artifact.
            print(f"Warning: unsupported output path, nothing uploaded: {OUTPUT_PATH}")

        print("\nExport complete!")
        print(f"Model package contents:")
        for f in package_dir.iterdir():
            print(f"  - {f.name}")
        EOF
    resources:
      requests:
        memory: 1Gi
        cpu: 500m