fix: use type=raw for Docker tags to preserve v prefix

docker/metadata-action type=semver strips the v prefix, causing tag mismatch between git tags (v0.1.3) and Docker tags (0.1.3). Switch to type=raw to pass through the version as-is.
fix: switch Docker build to plain docker build/push with insecure registry
2026-02-22 09:58:40 -05:00 · 2026-02-21 22:37:55 -05:00 · 2026-02-21 19:34:27 -05:00 · 2026-02-21 18:05:40 -05:00 · 2026-02-21 15:36:46 -05:00 · 2026-02-21 15:30:04 -05:00
18 changed files with 1235 additions and 3531 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+.git
+.gitignore
+*.md
+LICENSE
+renovate.json
+*_test.go
+e2e_test.go
+__pycache__
+.env*
--- a/.gitea/workflows/build-push.yaml
+++ b/.gitea/workflows/build-push.yaml
@@ -0,0 +1,196 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  NTFY_URL: http://ntfy.observability.svc.cluster.local:80
+  GOPRIVATE: git.daviestechlabs.io
+  REGISTRY: gitea-http.gitea.svc.cluster.local:3000/daviestechlabs
+  REGISTRY_HOST: gitea-http.gitea.svc.cluster.local:3000
+  IMAGE_NAME: chat-handler
+
+jobs:
+  lint:
+    name: Lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+
+      - name: Configure private modules
+        run: git config --global url."https://gitea-actions:${{ secrets.DISPATCH_TOKEN }}@git.daviestechlabs.io/".insteadOf "https://git.daviestechlabs.io/"
+
+      - name: Run go vet
+        run: go vet ./...
+
+      - name: Install golangci-lint
+        run: |
+          curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s -- -b "$(go env GOPATH)/bin"
+          echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
+
+      - name: Run golangci-lint
+        run: golangci-lint run ./...
+
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+
+      - name: Configure private modules
+        run: git config --global url."https://gitea-actions:${{ secrets.DISPATCH_TOKEN }}@git.daviestechlabs.io/".insteadOf "https://git.daviestechlabs.io/"
+
+      - name: Verify dependencies
+        run: go mod verify
+
+      - name: Build
+        run: go build -v ./...
+
+      - name: Run tests
+        run: go test -v -race -coverprofile=coverage.out -covermode=atomic ./...
+
+  release:
+    name: Release
+    runs-on: ubuntu-latest
+    needs: [lint, test]
+    if: gitea.ref == 'refs/heads/main' && gitea.event_name == 'push'
+    outputs:
+      version: ${{ steps.version.outputs.version }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine version bump
+        id: version
+        run: |
+          # Get latest tag or default to v0.0.0
+          LATEST=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+          VERSION=${LATEST#v}
+          IFS='.' read -r MAJOR MINOR PATCH <<< "$VERSION"
+
+          # Check commit message for keywords
+          MSG="${{ gitea.event.head_commit.message }}"
+          if echo "$MSG" | grep -qiE "^major:|BREAKING CHANGE"; then
+            MAJOR=$((MAJOR + 1)); MINOR=0; PATCH=0
+            BUMP="major"
+          elif echo "$MSG" | grep -qiE "^(minor:|feat:)"; then
+            MINOR=$((MINOR + 1)); PATCH=0
+            BUMP="minor"
+          else
+            PATCH=$((PATCH + 1))
+            BUMP="patch"
+          fi
+
+          NEW_VERSION="v${MAJOR}.${MINOR}.${PATCH}"
+          echo "version=$NEW_VERSION" >> $GITHUB_OUTPUT
+          echo "bump=$BUMP" >> $GITHUB_OUTPUT
+          echo "Bumping $LATEST → $NEW_VERSION ($BUMP)"
+
+      - name: Create and push tag
+        run: |
+          git config user.name "gitea-actions[bot]"
+          git config user.email "actions@git.daviestechlabs.io"
+          git tag -a ${{ steps.version.outputs.version }} -m "Release ${{ steps.version.outputs.version }}"
+          git push origin ${{ steps.version.outputs.version }}
+
+  docker:
+    name: Docker Build & Push
+    runs-on: ubuntu-latest
+    needs: [lint, test, release]
+    if: gitea.ref == 'refs/heads/main' && gitea.event_name == 'push'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Configure insecure registry
+        run: |
+          sudo mkdir -p /etc/docker
+          echo '{"insecure-registries": ["${{ env.REGISTRY_HOST }}"]}' | sudo tee /etc/docker/daemon.json
+          sudo kill -SIGHUP "$(pidof dockerd)" || true
+          sleep 3
+
+      - name: Login to Gitea Registry
+        run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${{ env.REGISTRY_HOST }}" -u "${{ secrets.REGISTRY_USER }}" --password-stdin
+
+      - name: Login to Docker Hub
+        if: vars.DOCKERHUB_USERNAME != ''
+        run: echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login -u "${{ vars.DOCKERHUB_USERNAME }}" --password-stdin
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=raw,value=${{ needs.release.outputs.version }}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        run: |
+          # Build with all tags
+          TAGS=""
+          while IFS= read -r tag; do
+            [ -n "$tag" ] && TAGS="$TAGS -t $tag"
+          done <<< "${{ steps.meta.outputs.tags }}"
+          docker build $TAGS \
+            --label "org.opencontainers.image.source=${{ gitea.server_url }}/${{ gitea.repository }}" \
+            --label "org.opencontainers.image.revision=${{ gitea.sha }}" \
+            .
+          # Push each tag
+          while IFS= read -r tag; do
+            [ -n "$tag" ] && docker push "$tag"
+          done <<< "${{ steps.meta.outputs.tags }}"
+
+
+  notify:
+    name: Notify
+    runs-on: ubuntu-latest
+    needs: [lint, test, release, docker]
+    if: always()
+    steps:
+      - name: Notify on success
+        if: needs.lint.result == 'success' && needs.test.result == 'success'
+        run: |
+          curl -s \
+            -H "Title: ✅ CI Passed: ${{ gitea.repository }}" \
+            -H "Priority: default" \
+            -H "Tags: white_check_mark,github" \
+            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
+            -d "Branch: ${{ gitea.ref_name }}
+          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
+          Release: ${{ needs.release.result == 'success' && needs.release.outputs.version || 'skipped' }}
+          Docker: ${{ needs.docker.result }}" \
+            ${{ env.NTFY_URL }}/gitea-ci
+
+      - name: Notify on failure
+        if: needs.lint.result == 'failure' || needs.test.result == 'failure'
+        run: |
+          curl -s \
+            -H "Title: ❌ CI Failed: ${{ gitea.repository }}" \
+            -H "Priority: high" \
+            -H "Tags: x,github" \
+            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
+            -d "Branch: ${{ gitea.ref_name }}
+          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
+          Lint: ${{ needs.lint.result }}
+          Test: ${{ needs.test.result }}" \
+            ${{ env.NTFY_URL }}/gitea-ci
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -1,129 +0,0 @@
-name: CI
-
-on:
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-
-env:
-  NTFY_URL: http://ntfy.observability.svc.cluster.local:80
-
-jobs:
-  lint:
-    name: Lint
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up uv
-        run: curl -LsSf https://astral.sh/uv/install.sh | sh && echo "$HOME/.local/bin" >> $GITHUB_PATH
-
-      - name: Set up Python
-        run: uv python install 3.13
-
-      - name: Install dependencies
-        run: uv sync --frozen --extra dev
-
-      - name: Run ruff check
-        run: uv run ruff check .
-
-      - name: Run ruff format check
-        run: uv run ruff format --check .
-
-  test:
-    name: Test
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up uv
-        run: curl -LsSf https://astral.sh/uv/install.sh | sh && echo "$HOME/.local/bin" >> $GITHUB_PATH
-
-      - name: Set up Python
-        run: uv python install 3.13
-
-      - name: Install dependencies
-        run: uv sync --frozen --extra dev
-
-      - name: Run tests
-        run: uv run pytest -v
-
-  release:
-    name: Release
-    runs-on: ubuntu-latest
-    needs: [lint, test]
-    if: gitea.ref == 'refs/heads/main' && gitea.event_name == 'push'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Determine version bump
-        id: version
-        run: |
-          # Get latest tag or default to v0.0.0
-          LATEST=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
-          VERSION=${LATEST#v}
-          IFS='.' read -r MAJOR MINOR PATCH <<< "$VERSION"
-          
-          # Check commit message for keywords
-          MSG="${{ gitea.event.head_commit.message }}"
-          if echo "$MSG" | grep -qiE "^major:|BREAKING CHANGE"; then
-            MAJOR=$((MAJOR + 1)); MINOR=0; PATCH=0
-            BUMP="major"
-          elif echo "$MSG" | grep -qiE "^(minor:|feat:)"; then
-            MINOR=$((MINOR + 1)); PATCH=0
-            BUMP="minor"
-          else
-            PATCH=$((PATCH + 1))
-            BUMP="patch"
-          fi
-          
-          NEW_VERSION="v${MAJOR}.${MINOR}.${PATCH}"
-          echo "version=$NEW_VERSION" >> $GITHUB_OUTPUT
-          echo "bump=$BUMP" >> $GITHUB_OUTPUT
-          echo "Bumping $LATEST → $NEW_VERSION ($BUMP)"
-
-      - name: Create and push tag
-        run: |
-          git config user.name "gitea-actions[bot]"
-          git config user.email "actions@git.daviestechlabs.io"
-          git tag -a ${{ steps.version.outputs.version }} -m "Release ${{ steps.version.outputs.version }}"
-          git push origin ${{ steps.version.outputs.version }}
-
-  notify:
-    name: Notify
-    runs-on: ubuntu-latest
-    needs: [lint, test, release]
-    if: always()
-    steps:
-      - name: Notify on success
-        if: needs.lint.result == 'success' && needs.test.result == 'success'
-        run: |
-          curl -s \
-            -H "Title: ✅ CI Passed: ${{ gitea.repository }}" \
-            -H "Priority: default" \
-            -H "Tags: white_check_mark,github" \
-            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
-            -d "Branch: ${{ gitea.ref_name }}
-          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
-          Release: ${{ needs.release.result == 'success' && 'created' || 'skipped' }}" \
-            ${{ env.NTFY_URL }}/gitea-ci
-
-      - name: Notify on failure
-        if: needs.lint.result == 'failure' || needs.test.result == 'failure'
-        run: |
-          curl -s \
-            -H "Title: ❌ CI Failed: ${{ gitea.repository }}" \
-            -H "Priority: high" \
-            -H "Tags: x,github" \
-            -H "Click: ${{ gitea.server_url }}/${{ gitea.repository }}/actions/runs/${{ gitea.run_id }}" \
-            -d "Branch: ${{ gitea.ref_name }}
-          Commit: ${{ gitea.event.head_commit.message || gitea.sha }}
-          Lint: ${{ needs.lint.result }}
-          Test: ${{ needs.test.result }}" \
-            ${{ env.NTFY_URL }}/gitea-ci
--- a/.gitea/workflows/update-dependency.yml
+++ b/.gitea/workflows/update-dependency.yml
@@ -0,0 +1,60 @@
+name: Update handler-base
+
+on:
+  repository_dispatch:
+    types: [handler-base-release]
+
+env:
+  NTFY_URL: http://ntfy.observability.svc.cluster.local:80
+  GOPRIVATE: git.daviestechlabs.io
+
+jobs:
+  update:
+    name: Update handler-base dependency
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.DISPATCH_TOKEN }}
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+
+      - name: Configure Git
+        run: |
+          git config user.name "gitea-actions[bot]"
+          git config user.email "actions@git.daviestechlabs.io"
+          git config --global url."https://gitea-actions:${{ secrets.DISPATCH_TOKEN }}@git.daviestechlabs.io/".insteadOf "https://git.daviestechlabs.io/"
+
+      - name: Update handler-base
+        run: |
+          VERSION="${{ gitea.event.client_payload.version }}"
+          echo "Updating handler-base to ${VERSION}"
+          go get git.daviestechlabs.io/daviestechlabs/handler-base@${VERSION}
+          go mod tidy
+
+      - name: Commit and push
+        run: |
+          VERSION="${{ gitea.event.client_payload.version }}"
+          if git diff --quiet go.mod go.sum; then
+            echo "No changes to commit"
+            exit 0
+          fi
+          git add go.mod go.sum
+          git commit -m "chore(deps): bump handler-base to ${VERSION}"
+          git push
+
+      - name: Notify
+        if: success()
+        run: |
+          VERSION="${{ gitea.event.client_payload.version }}"
+          curl -s \
+            -H "Title: 📦 Dep Update: ${{ gitea.repository }}" \
+            -H "Priority: default" \
+            -H "Tags: package,github" \
+            -d "handler-base updated to ${VERSION}" \
+            ${{ env.NTFY_URL }}/gitea-ci
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@ ENV/
 .env
 .env.local
 *.log
+chat-handler
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,32 +0,0 @@
-# Pre-commit hooks for chat-handler
-# Install: pip install pre-commit && pre-commit install
-# Run: pre-commit run --all-files
-
-repos:
-  # Ruff - fast Python linter and formatter
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.4
-    hooks:
-      - id: ruff
-        args: [--fix, --exit-non-zero-on-fix]
-      - id: ruff-format
-
-  # Standard pre-commit hooks
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
-    hooks:
-      - id: trailing-whitespace
-      - id: end-of-file-fixer
-      - id: check-yaml
-      - id: check-added-large-files
-        args: [--maxkb=500]
-      - id: check-merge-conflict
-      - id: detect-private-key
-
-  # Type checking (optional - uncomment when ready)
-  # - repo: https://github.com/pre-commit/mirrors-mypy
-  #   rev: v1.10.0
-  #   hooks:
-  #     - id: mypy
-  #       additional_dependencies: [types-all]
-  #       args: [--ignore-missing-imports]
--- a/27
+++ b/27
@@ -1,9 +1,26 @@
-# Chat Handler - Using handler-base
-ARG BASE_TAG=latest
-FROM ghcr.io/billy-davies-2/handler-base:${BASE_TAG}
+# Build stage
+FROM golang:1.25-alpine AS builder

 WORKDIR /app

-COPY chat_handler.py .
+RUN apk add --no-cache ca-certificates git

-CMD ["python", "chat_handler.py"]
+ENV GOPRIVATE=git.daviestechlabs.io
+ENV GONOSUMCHECK=git.daviestechlabs.io
+
+COPY go.mod go.sum ./
+RUN --mount=type=secret,id=netrc,target=/root/.netrc go mod download
+
+COPY . .
+
+RUN CGO_ENABLED=0 GOOS=linux GOAMD64=v3 go build -ldflags="-w -s" -o /chat-handler .
+
+# Runtime stage
+FROM scratch
+
+COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
+COPY --from=builder /chat-handler /chat-handler
+
+USER 65534:65534
+
+ENTRYPOINT ["/chat-handler"]
--- a/chat_handler.py
+++ b/chat_handler.py
@@ -1,316 +0,0 @@
-#!/usr/bin/env python3
-"""
-Chat Handler Service (Refactored)
-
-Text-based chat pipeline using handler-base:
-1. Listen for text on NATS subject "ai.chat.user.*.message"
-2. If RAG enabled (premium/explicit): embed → Milvus search → rerank
-3. Generate response with vLLM (with or without RAG context)
-4. Optionally stream chunks to "ai.chat.response.stream.{request_id}"
-5. Optionally synthesize speech with XTTS
-6. Publish result to "ai.chat.response.{request_id}" (or custom response_subject)
-"""
-
-import base64
-import logging
-from typing import Any, Optional
-
-from nats.aio.msg import Msg
-
-from handler_base import Handler, Settings
-from handler_base.clients import (
-    EmbeddingsClient,
-    RerankerClient,
-    LLMClient,
-    TTSClient,
-    MilvusClient,
-)
-from handler_base.telemetry import create_span
-
-logger = logging.getLogger("chat-handler")
-
-
-class ChatSettings(Settings):
-    """Chat handler specific settings."""
-
-    service_name: str = "chat-handler"
-
-    # RAG settings
-    rag_top_k: int = 10
-    rag_rerank_top_k: int = 5
-    rag_collection: str = "documents"
-
-    # Response settings
-    include_sources: bool = True
-    enable_tts: bool = False
-    tts_language: str = "en"
-
-
-class ChatHandler(Handler):
-    """
-    Chat request handler with RAG pipeline.
-
-    Subscribes to: ai.chat.user.*.message  (JetStream durable "chat-handler")
-
-    Request format (msgpack):
-    {
-        "request_id": "uuid",
-        "user_id": "user-123",
-        "username": "john_doe",
-        "message": "user question",
-        "premium": false,
-        "enable_rag": true,
-        "enable_reranker": true,
-        "enable_streaming": true,
-        "top_k": 5,
-        "session_id": "session-abc",
-        "system_prompt": "optional custom system prompt"
-    }
-
-    Response format (msgpack):
-    {
-        "user_id": "user-123",
-        "response": "generated response",
-        "response_text": "generated response",
-        "used_rag": true,
-        "rag_sources": ["source1", "source2"],
-        "success": true
-    }
-    """
-
-    def __init__(self):
-        self.chat_settings = ChatSettings()
-        super().__init__(
-            subject="ai.chat.user.*.message",
-            settings=self.chat_settings,
-            queue_group="chat-handlers",
-        )
-
-    async def setup(self) -> None:
-        """Initialize service clients."""
-        logger.info("Initializing service clients...")
-
-        self.embeddings = EmbeddingsClient(self.chat_settings)
-        self.reranker = RerankerClient(self.chat_settings)
-        self.llm = LLMClient(self.chat_settings)
-        self.milvus = MilvusClient(self.chat_settings)
-
-        # TTS is optional
-        if self.chat_settings.enable_tts:
-            self.tts = TTSClient(self.chat_settings)
-        else:
-            self.tts = None
-
-        # Connect to Milvus
-        await self.milvus.connect(self.chat_settings.rag_collection)
-
-        logger.info("Service clients initialized")
-
-    async def teardown(self) -> None:
-        """Clean up service clients."""
-        logger.info("Closing service clients...")
-
-        await self.embeddings.close()
-        await self.reranker.close()
-        await self.llm.close()
-        await self.milvus.close()
-
-        if self.tts:
-            await self.tts.close()
-
-        logger.info("Service clients closed")
-
-    async def handle_message(self, msg: Msg, data: Any) -> Optional[dict]:
-        """Handle incoming chat request."""
-        request_id = data.get("request_id", "unknown")
-        user_id = data.get("user_id", "unknown")
-        query = data.get("message", "") or data.get("query", "")
-        premium = data.get("premium", False)
-        enable_rag = data.get("enable_rag", premium)
-        enable_reranker = data.get("enable_reranker", enable_rag)
-        enable_streaming = data.get("enable_streaming", False)
-        top_k = data.get("top_k", self.chat_settings.rag_top_k)
-        collection = data.get("collection", self.chat_settings.rag_collection)
-        enable_tts = data.get("enable_tts", self.chat_settings.enable_tts)
-        system_prompt = data.get("system_prompt")
-        # companions-frontend may set a custom response subject
-        response_subject = data.get("response_subject", f"ai.chat.response.{request_id}")
-
-        logger.info(f"Processing request {request_id}: {query[:50]}...")
-
-        with create_span("chat.process") as span:
-            if span:
-                span.set_attribute("request.id", request_id)
-                span.set_attribute("user.id", user_id)
-                span.set_attribute("query.length", len(query))
-                span.set_attribute("premium", premium)
-                span.set_attribute("rag.enabled", enable_rag)
-
-            context = ""
-            rag_sources: list[str] = []
-            used_rag = False
-
-            # Only run RAG pipeline when enabled (premium users or explicit flag)
-            if enable_rag:
-                # 1. Generate query embedding
-                embedding = await self._get_embedding(query)
-
-                # 2. Search Milvus for context
-                documents = await self._search_context(
-                    embedding,
-                    collection,
-                    top_k=top_k,
-                )
-
-                # 3. Optionally rerank documents
-                if enable_reranker and documents:
-                    reranked = await self._rerank_documents(query, documents)
-                else:
-                    reranked = documents
-
-                # 4. Build context from top documents
-                if reranked:
-                    context = self._build_context(reranked)
-                    rag_sources = [
-                        d.get("source", d.get("document", "")[:80]) for d in reranked[:3]
-                    ]
-                    used_rag = True
-
-            # 5. Generate LLM response (with or without RAG context)
-            response_text = await self._generate_response(
-                query,
-                context or None,
-                system_prompt,
-            )
-
-            # 6. Stream response chunks if requested
-            if enable_streaming:
-                stream_subject = f"ai.chat.response.stream.{request_id}"
-                await self._publish_streaming_chunks(
-                    stream_subject,
-                    request_id,
-                    response_text,
-                )
-
-            # 7. Optionally synthesize speech
-            audio_b64 = None
-            if enable_tts and self.tts:
-                audio_b64 = await self._synthesize_speech(response_text)
-
-            # Build response (compatible with companions-frontend NATSChatResponse)
-            result: dict[str, Any] = {
-                "user_id": user_id,
-                "response": response_text,
-                "response_text": response_text,
-                "used_rag": used_rag,
-                "rag_sources": rag_sources,
-                "success": True,
-            }
-
-            if audio_b64:
-                result["audio"] = audio_b64
-
-            logger.info(f"Completed request {request_id} (rag={used_rag})")
-
-            # Publish to the response subject the frontend is waiting on
-            await self.nats.publish(response_subject, result)
-
-            return result
-
-    async def _get_embedding(self, text: str) -> list[float]:
-        """Generate embedding for query text."""
-        with create_span("chat.embedding"):
-            return await self.embeddings.embed_single(text)
-
-    async def _search_context(
-        self,
-        embedding: list[float],
-        collection: str,
-        top_k: int | None = None,
-    ) -> list[dict]:
-        """Search Milvus for relevant documents."""
-        with create_span("chat.search"):
-            return await self.milvus.search_with_texts(
-                embedding,
-                limit=top_k or self.chat_settings.rag_top_k,
-                text_field="text",
-                metadata_fields=["source", "title"],
-            )
-
-    async def _rerank_documents(self, query: str, documents: list[dict]) -> list[dict]:
-        """Rerank documents by relevance to query."""
-        with create_span("chat.rerank"):
-            texts = [d.get("text", "") for d in documents]
-            return await self.reranker.rerank(
-                query, texts, top_k=self.chat_settings.rag_rerank_top_k
-            )
-
-    def _build_context(self, documents: list[dict]) -> str:
-        """Build context string from ranked documents."""
-        context_parts = []
-        for i, doc in enumerate(documents, 1):
-            text = doc.get("document", "")
-            context_parts.append(f"[{i}] {text}")
-        return "\n\n".join(context_parts)
-
-    async def _generate_response(
-        self,
-        query: str,
-        context: Optional[str] = None,
-        system_prompt: Optional[str] = None,
-    ) -> str:
-        """Generate LLM response, optionally augmented with RAG context."""
-        with create_span("chat.generate"):
-            return await self.llm.generate(
-                query,
-                context=context,
-                system_prompt=system_prompt,
-            )
-
-    async def _publish_streaming_chunks(
-        self,
-        subject: str,
-        request_id: str,
-        full_text: str,
-    ) -> None:
-        """Publish response as streaming chunks for real-time display."""
-        import time
-
-        words = full_text.split(" ")
-        chunk_size = 4
-        for i in range(0, len(words), chunk_size):
-            token_chunk = " ".join(words[i : i + chunk_size])
-            await self.nats.publish(
-                subject,
-                {
-                    "request_id": request_id,
-                    "type": "chunk",
-                    "content": token_chunk,
-                    "done": False,
-                    "timestamp": time.time(),
-                },
-            )
-        # Send done marker
-        await self.nats.publish(
-            subject,
-            {
-                "request_id": request_id,
-                "type": "done",
-                "content": "",
-                "done": True,
-                "timestamp": time.time(),
-            },
-        )
-
-    async def _synthesize_speech(self, text: str) -> str:
-        """Synthesize speech and return base64 encoded audio."""
-        with create_span("chat.tts"):
-            audio_bytes = await self.tts.synthesize(
-                text,
-                language=self.chat_settings.tts_language,
-            )
-            return base64.b64encode(audio_bytes).decode()
-
-
-if __name__ == "__main__":
-    ChatHandler().run()
--- a/e2e_test.go
+++ b/e2e_test.go
@@ -0,0 +1,484 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"git.daviestechlabs.io/daviestechlabs/handler-base/clients"
+	"git.daviestechlabs.io/daviestechlabs/handler-base/messages"
+	"google.golang.org/protobuf/proto"
+)
+
+// ────────────────────────────────────────────────────────────────────────────
+// E2E tests: exercise the full chat pipeline with mock backends
+// ────────────────────────────────────────────────────────────────────────────
+
+// mockBackends starts httptest servers simulating all downstream services.
+type mockBackends struct {
+	Embeddings *httptest.Server
+	Reranker   *httptest.Server
+	LLM        *httptest.Server
+	TTS        *httptest.Server
+}
+
+func newMockBackends(t *testing.T) *mockBackends {
+	t.Helper()
+	m := &mockBackends{}
+
+	m.Embeddings = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"data": []map[string]any{
+				{"embedding": []float64{0.1, 0.2, 0.3, 0.4}},
+			},
+		})
+	}))
+	t.Cleanup(m.Embeddings.Close)
+
+	m.Reranker = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"results": []map[string]any{
+				{"index": 0, "relevance_score": 0.95},
+			},
+		})
+	}))
+	t.Cleanup(m.Reranker.Close)
+
+	m.LLM = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		var req map[string]any
+		_ = json.NewDecoder(r.Body).Decode(&req)
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"choices": []map[string]any{
+				{"message": map[string]any{
+					"content": "Paris is the capital of France.",
+				}},
+			},
+		})
+	}))
+	t.Cleanup(m.LLM.Close)
+
+	m.TTS = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte{0xDE, 0xAD, 0xBE, 0xEF})
+	}))
+	t.Cleanup(m.TTS.Close)
+
+	return m
+}
+
+func TestChatPipeline_LLMOnly(t *testing.T) {
+	m := newMockBackends(t)
+	llm := clients.NewLLMClient(m.LLM.URL, 5*time.Second)
+
+	// Simulate what main.go does for a non-RAG request.
+	response, err := llm.Generate(context.Background(), "What is the capital of France?", "", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if response != "Paris is the capital of France." {
+		t.Errorf("response = %q", response)
+	}
+}
+
+func TestChatPipeline_WithRAG(t *testing.T) {
+	m := newMockBackends(t)
+	embeddings := clients.NewEmbeddingsClient(m.Embeddings.URL, 5*time.Second, "bge")
+	reranker := clients.NewRerankerClient(m.Reranker.URL, 5*time.Second)
+	llm := clients.NewLLMClient(m.LLM.URL, 5*time.Second)
+
+	ctx := context.Background()
+
+	// 1. Embed query
+	embedding, err := embeddings.EmbedSingle(ctx, "What is the capital of France?")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(embedding) == 0 {
+		t.Fatal("empty embedding")
+	}
+
+	// 2. Rerank (with mock documents)
+	docs := []string{"France is a country in Europe", "Paris is its capital"}
+	results, err := reranker.Rerank(ctx, "capital of France", docs, 2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(results) == 0 {
+		t.Fatal("no rerank results")
+	}
+	if results[0].Score == 0 {
+		t.Error("expected non-zero score")
+	}
+
+	// 3. Generate with context
+	contextText := results[0].Document
+	response, err := llm.Generate(ctx, "capital of France?", contextText, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if response == "" {
+		t.Error("empty response")
+	}
+}
+
+func TestChatPipeline_WithTTS(t *testing.T) {
+	m := newMockBackends(t)
+	llm := clients.NewLLMClient(m.LLM.URL, 5*time.Second)
+	tts := clients.NewTTSClient(m.TTS.URL, 5*time.Second, "en")
+
+	ctx := context.Background()
+
+	response, err := llm.Generate(ctx, "hello", "", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	audio, err := tts.Synthesize(ctx, response, "en", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(audio) == 0 {
+		t.Error("empty audio")
+	}
+}
+
+func TestChatPipeline_LLMTimeout(t *testing.T) {
+	// Simulate slow LLM.
+	slow := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(200 * time.Millisecond)
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"choices": []map[string]any{
+				{"message": map[string]any{"content": "late response"}},
+			},
+		})
+	}))
+	defer slow.Close()
+
+	llm := clients.NewLLMClient(slow.URL, 100*time.Millisecond)
+	_, err := llm.Generate(context.Background(), "hello", "", "")
+	if err == nil {
+		t.Error("expected timeout error")
+	}
+}
+
+func TestChatPipeline_TypedDecoding(t *testing.T) {
+	// Verify typed struct decoding from proto (same path as OnTypedMessage).
+	original := &messages.ChatRequest{
+		RequestId:       "req-e2e-001",
+		UserId:          "user-1",
+		Message:         "hello",
+		Premium:         true,
+		EnableRag:       false,
+		EnableStreaming: false,
+		SystemPrompt:    "Be brief.",
+	}
+	data, _ := proto.Marshal(original)
+
+	var req messages.ChatRequest
+	if err := proto.Unmarshal(data, &req); err != nil {
+		t.Fatal(err)
+	}
+
+	if req.RequestId != "req-e2e-001" {
+		t.Errorf("RequestID = %q", req.RequestId)
+	}
+	if req.UserId != "user-1" {
+		t.Errorf("UserID = %q", req.UserId)
+	}
+	if messages.EffectiveQuery(&req) != "hello" {
+		t.Errorf("query = %q", messages.EffectiveQuery(&req))
+	}
+	if req.EnableRag {
+		t.Error("EnableRAG should be false")
+	}
+	if req.SystemPrompt != "Be brief." {
+		t.Errorf("SystemPrompt = %q", req.SystemPrompt)
+	}
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Streaming tests: exercise StreamGenerate path (the real SSE pipeline)
+// ────────────────────────────────────────────────────────────────────────────
+
+// sseChunk builds an OpenAI-compatible SSE data line.
+func sseChunk(content string) string {
+	return fmt.Sprintf("data: {\"choices\":[{\"delta\":{\"content\":%q}}]}\n\n", content)
+}
+
+// newStreamingLLM creates a mock LLM server that responds with SSE-streamed tokens.
+func newStreamingLLM(t *testing.T, tokens []string) *httptest.Server {
+	t.Helper()
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		var req map[string]any
+		_ = json.NewDecoder(r.Body).Decode(&req)
+		// Verify stream=true was requested.
+		if stream, ok := req["stream"].(bool); !ok || !stream {
+			t.Error("expected stream=true in request body")
+		}
+
+		w.Header().Set("Content-Type", "text/event-stream")
+		w.Header().Set("Cache-Control", "no-cache")
+		flusher, _ := w.(http.Flusher)
+
+		// Role-only chunk (should be skipped by StreamGenerate)
+		_, _ = fmt.Fprintf(w, "data: {\"choices\":[{\"delta\":{\"role\":\"assistant\"}}]}\n\n")
+		if flusher != nil {
+			flusher.Flush()
+		}
+
+		for _, tok := range tokens {
+			_, _ = fmt.Fprint(w, sseChunk(tok))
+			if flusher != nil {
+				flusher.Flush()
+			}
+		}
+		_, _ = fmt.Fprintf(w, "data: [DONE]\n\n")
+		if flusher != nil {
+			flusher.Flush()
+		}
+	}))
+	t.Cleanup(srv.Close)
+	return srv
+}
+
+func TestChatPipeline_StreamGenerate(t *testing.T) {
+	tokens := []string{"Paris", " is", " the", " capital", " of", " France", "."}
+	srv := newStreamingLLM(t, tokens)
+	llm := clients.NewLLMClient(srv.URL, 5*time.Second)
+
+	var mu sync.Mutex
+	var received []string
+	full, err := llm.StreamGenerate(context.Background(), "capital of France?", "", "", func(token string) {
+		mu.Lock()
+		defer mu.Unlock()
+		received = append(received, token)
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if full != "Paris is the capital of France." {
+		t.Errorf("full = %q", full)
+	}
+	if len(received) != len(tokens) {
+		t.Errorf("callback count = %d, want %d", len(received), len(tokens))
+	}
+	for i, tok := range tokens {
+		if received[i] != tok {
+			t.Errorf("token[%d] = %q, want %q", i, received[i], tok)
+		}
+	}
+}
+
+func TestChatPipeline_StreamWithSystemPrompt(t *testing.T) {
+	srv := newStreamingLLM(t, []string{"Hello", "!"})
+	llm := clients.NewLLMClient(srv.URL, 5*time.Second)
+
+	full, err := llm.StreamGenerate(context.Background(), "greet me", "", "You are a friendly assistant.", func(token string) {})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if full != "Hello!" {
+		t.Errorf("full = %q", full)
+	}
+}
+
+func TestChatPipeline_StreamWithRAGContext(t *testing.T) {
+	m := newMockBackends(t)
+	srv := newStreamingLLM(t, []string{"The", " answer", " is", " 42"})
+	embeddings := clients.NewEmbeddingsClient(m.Embeddings.URL, 5*time.Second, "bge")
+	llm := clients.NewLLMClient(srv.URL, 5*time.Second)
+
+	ctx := context.Background()
+
+	// 1. Embed
+	embedding, err := embeddings.EmbedSingle(ctx, "deep thought")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(embedding) == 0 {
+		t.Fatal("empty embedding")
+	}
+
+	// 2. Stream with context
+	var tokens []string
+	full, err := llm.StreamGenerate(ctx, "deep thought", "The answer to everything is 42.", "", func(tok string) {
+		tokens = append(tokens, tok)
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if full != "The answer is 42" {
+		t.Errorf("full = %q", full)
+	}
+	if len(tokens) != 4 {
+		t.Errorf("token count = %d, want 4", len(tokens))
+	}
+}
+
+func TestChatPipeline_StreamNilCallback(t *testing.T) {
+	srv := newStreamingLLM(t, []string{"ok"})
+	llm := clients.NewLLMClient(srv.URL, 5*time.Second)
+
+	full, err := llm.StreamGenerate(context.Background(), "test", "", "", nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if full != "ok" {
+		t.Errorf("full = %q", full)
+	}
+}
+
+func TestChatPipeline_StreamTimeout(t *testing.T) {
+	slow := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(200 * time.Millisecond)
+		w.Header().Set("Content-Type", "text/event-stream")
+		_, _ = fmt.Fprint(w, sseChunk("late"))
+		_, _ = fmt.Fprint(w, "data: [DONE]\n\n")
+	}))
+	defer slow.Close()
+
+	llm := clients.NewLLMClient(slow.URL, 100*time.Millisecond)
+	_, err := llm.StreamGenerate(context.Background(), "hello", "", "", nil)
+	if err == nil {
+		t.Error("expected timeout error")
+	}
+}
+
+func TestChatPipeline_StreamHTTPError(t *testing.T) {
+	errSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		_, _ = w.Write([]byte("internal error"))
+	}))
+	defer errSrv.Close()
+
+	llm := clients.NewLLMClient(errSrv.URL, 5*time.Second)
+	_, err := llm.StreamGenerate(context.Background(), "hello", "", "", nil)
+	if err == nil {
+		t.Error("expected error for HTTP 500")
+	}
+	if !strings.Contains(err.Error(), "500") {
+		t.Errorf("error = %q, should mention status 500", err)
+	}
+}
+
+func TestChatPipeline_StreamContextCanceled(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel immediately
+
+	srv := newStreamingLLM(t, []string{"should", "not", "arrive"})
+	llm := clients.NewLLMClient(srv.URL, 5*time.Second)
+
+	_, err := llm.StreamGenerate(ctx, "hello", "", "", nil)
+	if err == nil {
+		t.Error("expected context canceled error")
+	}
+}
+
+func TestChatPipeline_StreamFallbackToNonStreaming(t *testing.T) {
+	// Simulate the branching in main.go: non-streaming uses Generate(),
+	// streaming uses StreamGenerate(). Verify both paths work from same mock.
+	m := newMockBackends(t)
+	streamSrv := newStreamingLLM(t, []string{"streamed", " answer"})
+
+	nonStreamLLM := clients.NewLLMClient(m.LLM.URL, 5*time.Second)
+	streamLLM := clients.NewLLMClient(streamSrv.URL, 5*time.Second)
+
+	ctx := context.Background()
+
+	// Non-streaming path
+	resp1, err := nonStreamLLM.Generate(ctx, "hello", "", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if resp1 != "Paris is the capital of France." {
+		t.Errorf("non-stream = %q", resp1)
+	}
+
+	// Streaming path
+	var tokens []string
+	resp2, err := streamLLM.StreamGenerate(ctx, "hello", "", "", func(tok string) {
+		tokens = append(tokens, tok)
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if resp2 != "streamed answer" {
+		t.Errorf("stream = %q", resp2)
+	}
+	if len(tokens) != 2 {
+		t.Errorf("token count = %d", len(tokens))
+	}
+}
+
+// ────────────────────────────────────────────────────────────────────────────
+// Benchmark: full chat pipeline overhead (mock backends)
+// ────────────────────────────────────────────────────────────────────────────
+
+func BenchmarkChatPipeline_LLMOnly(b *testing.B) {
+	llmSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"choices":[{"message":{"content":"answer"}}]}`))
+	}))
+	defer llmSrv.Close()
+
+	llm := clients.NewLLMClient(llmSrv.URL, 10*time.Second)
+	ctx := context.Background()
+
+	b.ResetTimer()
+	for b.Loop() {
+		_, _ = llm.Generate(ctx, "question", "", "")
+	}
+}
+
+func BenchmarkChatPipeline_RAGFlow(b *testing.B) {
+	embedSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"data":[{"embedding":[0.1,0.2]}]}`))
+	}))
+	defer embedSrv.Close()
+
+	rerankSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"results":[{"index":0,"relevance_score":0.9}]}`))
+	}))
+	defer rerankSrv.Close()
+
+	llmSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write([]byte(`{"choices":[{"message":{"content":"answer"}}]}`))
+	}))
+	defer llmSrv.Close()
+
+	embed := clients.NewEmbeddingsClient(embedSrv.URL, 10*time.Second, "bge")
+	rerank := clients.NewRerankerClient(rerankSrv.URL, 10*time.Second)
+	llm := clients.NewLLMClient(llmSrv.URL, 10*time.Second)
+	ctx := context.Background()
+
+	b.ResetTimer()
+	for b.Loop() {
+		_, _ = embed.EmbedSingle(ctx, "question")
+		_, _ = rerank.Rerank(ctx, "question", []string{"doc1", "doc2"}, 2)
+		_, _ = llm.Generate(ctx, "question", "context", "")
+	}
+}
+
+func BenchmarkChatPipeline_StreamGenerate(b *testing.B) {
+	tokens := []string{"one", " two", " three", " four", " five"}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/event-stream")
+		for _, tok := range tokens {
+			_, _ = fmt.Fprintf(w, "data: {\"choices\":[{\"delta\":{\"content\":%q}}]}\n\n", tok)
+		}
+		_, _ = fmt.Fprint(w, "data: [DONE]\n\n")
+	}))
+	defer srv.Close()
+
+	llm := clients.NewLLMClient(srv.URL, 10*time.Second)
+	ctx := context.Background()
+
+	b.ResetTimer()
+	for b.Loop() {
+		_, _ = llm.StreamGenerate(ctx, "question", "", "", func(string) {})
+	}
+}
--- a/go.mod
+++ b/go.mod
@@ -0,0 +1,39 @@
+module git.daviestechlabs.io/daviestechlabs/chat-handler
+
+go 1.25.1
+
+require (
+	git.daviestechlabs.io/daviestechlabs/handler-base v1.0.0
+	github.com/nats-io/nats.go v1.48.0
+	google.golang.org/protobuf v1.36.11
+)
+
+require (
+	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/fsnotify/fsnotify v1.9.0 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 // indirect
+	github.com/klauspost/compress v1.18.0 // indirect
+	github.com/nats-io/nkeys v0.4.11 // indirect
+	github.com/nats-io/nuid v1.0.1 // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/otel v1.40.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect
+	go.opentelemetry.io/otel/metric v1.40.0 // indirect
+	go.opentelemetry.io/otel/sdk v1.40.0 // indirect
+	go.opentelemetry.io/otel/sdk/metric v1.40.0 // indirect
+	go.opentelemetry.io/otel/trace v1.40.0 // indirect
+	go.opentelemetry.io/proto/otlp v1.9.0 // indirect
+	golang.org/x/crypto v0.47.0 // indirect
+	golang.org/x/net v0.49.0 // indirect
+	golang.org/x/sys v0.40.0 // indirect
+	golang.org/x/text v0.33.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 // indirect
+	google.golang.org/grpc v1.78.0 // indirect
+)
--- a/go.sum
+++ b/go.sum
@@ -0,0 +1,77 @@
+git.daviestechlabs.io/daviestechlabs/handler-base v1.0.0 h1:pB3ehOKaDYQfbyRBKQXrB9curqSFteLrDveoElRKnBY=
+git.daviestechlabs.io/daviestechlabs/handler-base v1.0.0/go.mod h1:zocOHFt8yY3cW4+Xi37sNr5Tw7KcjGFSZqgWYxPWyqA=
+github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
+github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
+github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 h1:X+2YciYSxvMQK0UZ7sg45ZVabVZBeBuvMkmuI2V3Fak=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7/go.mod h1:lW34nIZuQ8UDPdkon5fmfp2l3+ZkQ2me/+oecHYLOII=
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
+github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
+github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
+github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
+github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
+github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
+github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
+github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
+go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms=
+go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g=
+go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 h1:NOyNnS19BF2SUDApbOKbDtWZ0IK7b8FJ2uAGdIWOGb0=
+go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0/go.mod h1:VL6EgVikRLcJa9ftukrHu/ZkkhFBSo1lzvdBC9CF1ss=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 h1:QKdN8ly8zEMrByybbQgv8cWBcdAarwmIPZ6FThrWXJs=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0/go.mod h1:bTdK1nhqF76qiPoCCdyFIV+N/sRHYXYCTQc+3VCi3MI=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs=
+go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g=
+go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc=
+go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8=
+go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE=
+go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw=
+go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg=
+go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw=
+go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA=
+go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
+go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
+golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
+golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
+golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
+golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
+golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
+golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
+gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
+gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
+google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 h1:merA0rdPeUV3YIIfHHcH4qBkiQAc1nfCKSI7lB4cV2M=
+google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409/go.mod h1:fl8J1IvUjCilwZzQowmw2b7HQB2eAuYBabMXzWurF+I=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409 h1:H86B94AW+VfJWDqFeEbBPhEtHzJwJfTbgE2lZa54ZAQ=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260128011058-8636f8732409/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
+google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc=
+google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/main.go
+++ b/main.go
@@ -0,0 +1,251 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/nats-io/nats.go"
+
+	"git.daviestechlabs.io/daviestechlabs/handler-base/clients"
+	"git.daviestechlabs.io/daviestechlabs/handler-base/config"
+	"git.daviestechlabs.io/daviestechlabs/handler-base/handler"
+	"git.daviestechlabs.io/daviestechlabs/handler-base/messages"
+	"git.daviestechlabs.io/daviestechlabs/handler-base/natsutil"
+	"google.golang.org/protobuf/proto"
+)
+
+func main() {
+	cfg := config.Load()
+	cfg.ServiceName = "chat-handler"
+	cfg.NATSQueueGroup = "chat-handlers"
+
+	// Chat-specific settings
+	ragTopK := getEnvInt("RAG_TOP_K", 10)
+	ragRerankTopK := getEnvInt("RAG_RERANK_TOP_K", 5)
+	ragCollection := getEnv("RAG_COLLECTION", "documents")
+	includeSources := getEnvBool("INCLUDE_SOURCES", true)
+	enableTTS := getEnvBool("ENABLE_TTS", false)
+	ttsLanguage := getEnv("TTS_LANGUAGE", "en")
+
+	// Service clients
+	timeout := 60 * time.Second
+	embeddings := clients.NewEmbeddingsClient(cfg.EmbeddingsURL(), timeout, "")
+	reranker := clients.NewRerankerClient(cfg.RerankerURL(), timeout)
+	llm := clients.NewLLMClient(cfg.LLMURL(), timeout)
+	milvus := clients.NewMilvusClient(cfg.MilvusHost, cfg.MilvusPort, ragCollection)
+
+	var tts *clients.TTSClient
+	if enableTTS {
+		tts = clients.NewTTSClient(cfg.TTSURL(), timeout, ttsLanguage)
+	}
+
+	h := handler.New("ai.chat.user.*.message", cfg)
+
+	h.OnTypedMessage(func(ctx context.Context, msg *nats.Msg) (proto.Message, error) {
+		var req messages.ChatRequest
+		if err := natsutil.Decode(msg.Data, &req); err != nil {
+			slog.Error("decode failed", "error", err)
+			return &messages.ErrorResponse{Error: true, Message: err.Error(), Type: "DecodeError"}, nil
+		}
+
+		query := messages.EffectiveQuery(&req)
+		requestID := req.RequestId
+		if requestID == "" {
+			requestID = "unknown"
+		}
+		userID := req.UserId
+		if userID == "" {
+			userID = "unknown"
+		}
+		enableRAG := req.EnableRag
+		if !enableRAG && req.Premium {
+			enableRAG = true
+		}
+		enableReranker := req.EnableReranker
+		if !enableReranker && enableRAG {
+			enableReranker = true
+		}
+		topK := req.TopK
+		if topK == 0 {
+			topK = int32(ragTopK)
+		}
+		collection := req.Collection
+		if collection == "" {
+			collection = ragCollection
+		}
+		reqEnableTTS := req.EnableTts || enableTTS
+		systemPrompt := req.SystemPrompt
+		responseSubject := req.ResponseSubject
+		if responseSubject == "" {
+			responseSubject = fmt.Sprintf("ai.chat.response.%s", requestID)
+		}
+
+		slog.Info("processing request", "request_id", requestID, "query_len", len(query))
+
+		contextText := ""
+		var ragSources []string
+		usedRAG := false
+
+		// RAG pipeline
+		if enableRAG {
+			// 1. Embed query
+			embedding, err := embeddings.EmbedSingle(ctx, query)
+			if err != nil {
+				slog.Error("embedding failed", "error", err)
+			} else {
+				// 2. Search Milvus
+				_ = milvus
+				_ = collection
+				_ = topK
+				_ = embedding
+				// NOTE: Milvus search uses the gRPC SDK (requires milvus-sdk-go)
+				// For now, we pass through without search; Milvus client will be
+				// connected when the SDK is integrated.
+				// documents := milvus.Search(ctx, embedding, topK)
+
+				var documents []map[string]any // placeholder for Milvus results
+
+				// 3. Rerank
+				if enableReranker && len(documents) > 0 {
+					texts := make([]string, len(documents))
+					for i, d := range documents {
+						if t, ok := d["text"].(string); ok {
+							texts[i] = t
+						}
+					}
+					reranked, err := reranker.Rerank(ctx, query, texts, ragRerankTopK)
+					if err != nil {
+						slog.Error("rerank failed", "error", err)
+					} else {
+						documents = make([]map[string]any, len(reranked))
+						for i, r := range reranked {
+							documents[i] = map[string]any{"document": r.Document, "score": r.Score}
+						}
+					}
+				}
+
+				// 4. Build context
+				if len(documents) > 0 {
+					var parts []string
+					for i, d := range documents {
+						text := ""
+						if t, ok := d["document"].(string); ok {
+							text = t
+						}
+						parts = append(parts, fmt.Sprintf("[%d] %s", i+1, text))
+					}
+					contextText = strings.Join(parts, "\n\n")
+
+					for _, d := range documents {
+						if len(ragSources) >= 3 {
+							break
+						}
+						src := ""
+						if s, ok := d["source"].(string); ok {
+							src = s
+						} else if s, ok := d["document"].(string); ok && len(s) > 80 {
+							src = s[:80]
+						}
+						ragSources = append(ragSources, src)
+					}
+					usedRAG = true
+				}
+			}
+		}
+
+		// 5. Generate LLM response (streaming when requested)
+		var responseText string
+		var err error
+		if req.EnableStreaming {
+			streamSubject := fmt.Sprintf("ai.chat.response.stream.%s", requestID)
+			responseText, err = llm.StreamGenerate(ctx, query, contextText, systemPrompt, func(token string) {
+				_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
+					RequestId: requestID,
+					Type:      "chunk",
+					Content:   token,
+					Timestamp: messages.Timestamp(),
+				})
+			})
+			_ = h.NATS.Publish(streamSubject, &messages.ChatStreamChunk{
+				RequestId: requestID,
+				Type:      "done",
+				Done:      true,
+				Timestamp: messages.Timestamp(),
+			})
+		} else {
+			responseText, err = llm.Generate(ctx, query, contextText, systemPrompt)
+		}
+		if err != nil {
+			slog.Error("LLM generation failed", "error", err)
+			return &messages.ChatResponse{
+				UserId:  userID,
+				Success: false,
+				Error:   err.Error(),
+			}, nil
+		}
+
+		// 6. Optional TTS — audio as raw bytes (no base64)
+		var audio []byte
+		if reqEnableTTS && tts != nil {
+			audioBytes, err := tts.Synthesize(ctx, responseText, ttsLanguage, "")
+			if err != nil {
+				slog.Error("TTS failed", "error", err)
+			} else {
+				audio = audioBytes
+			}
+		}
+
+		result := &messages.ChatResponse{
+			UserId:       userID,
+			Response:     responseText,
+			ResponseText: responseText,
+			UsedRag:      usedRAG,
+			Success:      true,
+			Audio:        audio,
+		}
+		if includeSources {
+			result.RagSources = ragSources
+		}
+
+		// Publish to the response subject the frontend is waiting on
+		_ = h.NATS.Publish(responseSubject, result)
+
+		slog.Info("completed request", "request_id", requestID, "rag", usedRAG)
+		return result, nil
+	})
+
+	if err := h.Run(); err != nil {
+		slog.Error("handler failed", "error", err)
+		os.Exit(1)
+	}
+}
+
+// Helpers
+
+func getEnv(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
+
+func getEnvInt(key string, fallback int) int {
+	if v := os.Getenv(key); v != "" {
+		if i, err := strconv.Atoi(v); err == nil {
+			return i
+		}
+	}
+	return fallback
+}
+
+func getEnvBool(key string, fallback bool) bool {
+	if v := os.Getenv(key); v != "" {
+		return strings.EqualFold(v, "true") || v == "1"
+	}
+	return fallback
+}
--- a/main_test.go
+++ b/main_test.go
@@ -0,0 +1,96 @@
+package main
+
+import (
+	"os"
+	"testing"
+
+	"git.daviestechlabs.io/daviestechlabs/handler-base/messages"
+	"google.golang.org/protobuf/proto"
+)
+
+func TestChatRequestDecode(t *testing.T) {
+	// Verify a proto-encoded struct round-trips cleanly.
+	original := &messages.ChatRequest{
+		RequestId: "req-1",
+		UserId:    "user-1",
+		Message:   "hello",
+		Premium:   true,
+		TopK:      10,
+	}
+	data, err := proto.Marshal(original)
+	if err != nil {
+		t.Fatal(err)
+	}
+	var req messages.ChatRequest
+	if err := proto.Unmarshal(data, &req); err != nil {
+		t.Fatal(err)
+	}
+	if req.RequestId != "req-1" {
+		t.Errorf("RequestID = %q", req.RequestId)
+	}
+	if messages.EffectiveQuery(&req) != "hello" {
+		t.Errorf("EffectiveQuery = %q", messages.EffectiveQuery(&req))
+	}
+	if !req.Premium {
+		t.Error("Premium should be true")
+	}
+	if req.TopK != 10 {
+		t.Errorf("TopK = %d", req.TopK)
+	}
+}
+
+func TestChatResponseRoundtrip(t *testing.T) {
+	resp := &messages.ChatResponse{
+		UserId:   "user-1",
+		Response: "answer",
+		Success:  true,
+		Audio:    []byte{0x01, 0x02, 0x03},
+	}
+	data, err := proto.Marshal(resp)
+	if err != nil {
+		t.Fatal(err)
+	}
+	var decoded messages.ChatResponse
+	if err := proto.Unmarshal(data, &decoded); err != nil {
+		t.Fatal(err)
+	}
+	if decoded.UserId != "user-1" || !decoded.Success {
+		t.Errorf("decoded: UserId=%s, Success=%v", decoded.UserId, decoded.Success)
+	}
+	if len(decoded.Audio) != 3 {
+		t.Errorf("audio len = %d", len(decoded.Audio))
+	}
+}
+
+func TestGetEnvHelpers(t *testing.T) {
+	t.Setenv("CHAT_TEST", "hello")
+	if got := getEnv("CHAT_TEST", "x"); got != "hello" {
+		t.Errorf("getEnv = %q", got)
+	}
+	if got := getEnv("NO_SUCH_VAR", "x"); got != "x" {
+		t.Errorf("getEnv fallback = %q", got)
+	}
+
+	t.Setenv("CHAT_PORT", "9090")
+	if got := getEnvInt("CHAT_PORT", 0); got != 9090 {
+		t.Errorf("getEnvInt = %d", got)
+	}
+	if got := getEnvInt("NO_SUCH_VAR", 80); got != 80 {
+		t.Errorf("getEnvInt fallback = %d", got)
+	}
+
+	t.Setenv("CHAT_FLAG", "true")
+	if got := getEnvBool("CHAT_FLAG", false); !got {
+		t.Error("getEnvBool should be true")
+	}
+	if got := getEnvBool("NO_SUCH_VAR", false); got {
+		t.Error("getEnvBool fallback should be false")
+	}
+}
+
+func TestMainBinaryBuilds(t *testing.T) {
+	// Verify the binary exists after build
+	if _, err := os.Stat("main.go"); err != nil {
+		t.Skip("main.go not found")
+	}
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,43 +0,0 @@
-[project]
-name = "chat-handler"
-version = "1.0.0"
-description = "Text chat pipeline with RAG - Query → Embeddings → Milvus → Rerank → LLM"
-readme = "README.md"
-requires-python = ">=3.11"
-license = { text = "MIT" }
-authors = [{ name = "Davies Tech Labs" }]
-
-dependencies = [
-    "handler-base @ git+https://git.daviestechlabs.io/daviestechlabs/handler-base.git",
-]
-
-[project.optional-dependencies]
-dev = [
-    "pytest>=8.0.0",
-    "pytest-asyncio>=0.23.0",
-    "ruff>=0.1.0",
-]
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.metadata]
-allow-direct-references = true
-
-[tool.hatch.build.targets.wheel]
-packages = ["."]
-only-include = ["chat_handler.py"]
-
-[tool.ruff]
-line-length = 100
-target-version = "py311"
-
-[tool.pytest.ini_options]
-asyncio_mode = "auto"
-testpaths = ["tests"]
-python_files = ["test_*.py"]
-python_classes = ["Test*"]
-python_functions = ["test_*"]
-addopts = "-v --tb=short"
-filterwarnings = ["ignore::DeprecationWarning"]
--- a/tests/init.py
+++ b/tests/init.py
@@ -1 +0,0 @@
-# Chat Handler Tests
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,94 +0,0 @@
-"""
-Pytest configuration and fixtures for chat-handler tests.
-"""
-
-import asyncio
-import os
-from unittest.mock import MagicMock
-
-import pytest
-
-# Set test environment variables before importing
-os.environ.setdefault("NATS_URL", "nats://localhost:4222")
-os.environ.setdefault("REDIS_URL", "redis://localhost:6379")
-os.environ.setdefault("MILVUS_HOST", "localhost")
-os.environ.setdefault("OTEL_ENABLED", "false")
-os.environ.setdefault("MLFLOW_ENABLED", "false")
-
-
-@pytest.fixture(scope="session")
-def event_loop():
-    """Create event loop for async tests."""
-    loop = asyncio.new_event_loop()
-    yield loop
-    loop.close()
-
-
-@pytest.fixture
-def sample_embedding():
-    """Sample embedding vector."""
-    return [0.1] * 1024
-
-
-@pytest.fixture
-def sample_documents():
-    """Sample search results."""
-    return [
-        {"text": "Machine learning is a subset of AI.", "score": 0.95},
-        {"text": "Deep learning uses neural networks.", "score": 0.90},
-        {"text": "AI enables intelligent automation.", "score": 0.85},
-    ]
-
-
-@pytest.fixture
-def sample_reranked():
-    """Sample reranked results."""
-    return [
-        {"document": "Machine learning is a subset of AI.", "score": 0.98},
-        {"document": "Deep learning uses neural networks.", "score": 0.85},
-    ]
-
-
-@pytest.fixture
-def mock_nats_message():
-    """Create a mock NATS message."""
-    msg = MagicMock()
-    msg.subject = "ai.chat.user.test-user-1.message"
-    msg.reply = "ai.chat.response.test-123"
-    return msg
-
-
-@pytest.fixture
-def mock_chat_request():
-    """Sample chat request payload."""
-    return {
-        "request_id": "test-request-123",
-        "user_id": "test-user-1",
-        "username": "testuser",
-        "message": "What is machine learning?",
-        "premium": True,
-        "enable_rag": True,
-        "enable_reranker": True,
-        "enable_streaming": False,
-        "collection": "test_collection",
-        "enable_tts": False,
-        "system_prompt": None,
-    }
-
-
-@pytest.fixture
-def mock_chat_request_with_tts():
-    """Sample chat request with TTS enabled."""
-    return {
-        "request_id": "test-request-456",
-        "user_id": "test-user-2",
-        "username": "testuser2",
-        "message": "Tell me about AI",
-        "premium": True,
-        "enable_rag": True,
-        "enable_reranker": True,
-        "enable_streaming": False,
-        "collection": "documents",
-        "enable_tts": True,
-        "system_prompt": "You are a helpful assistant.",
-    }
--- a/tests/test_chat_handler.py
+++ b/tests/test_chat_handler.py
@@ -1,273 +0,0 @@
-"""
-Unit tests for ChatHandler.
-"""
-
-import pytest
-from unittest.mock import AsyncMock, patch
-
-from chat_handler import ChatHandler, ChatSettings
-
-
-class TestChatSettings:
-    """Tests for ChatSettings configuration."""
-
-    def test_default_settings(self):
-        """Test default settings values."""
-        settings = ChatSettings()
-
-        assert settings.service_name == "chat-handler"
-        assert settings.rag_top_k == 10
-        assert settings.rag_rerank_top_k == 5
-        assert settings.rag_collection == "documents"
-        assert settings.include_sources is True
-        assert settings.enable_tts is False
-        assert settings.tts_language == "en"
-
-    def test_custom_settings(self):
-        """Test custom settings."""
-        settings = ChatSettings(
-            rag_top_k=20,
-            rag_collection="custom_docs",
-            enable_tts=True,
-        )
-
-        assert settings.rag_top_k == 20
-        assert settings.rag_collection == "custom_docs"
-        assert settings.enable_tts is True
-
-
-class TestChatHandler:
-    """Tests for ChatHandler."""
-
-    @pytest.fixture
-    def handler(self):
-        """Create handler with mocked clients."""
-        with (
-            patch("chat_handler.EmbeddingsClient"),
-            patch("chat_handler.RerankerClient"),
-            patch("chat_handler.LLMClient"),
-            patch("chat_handler.TTSClient"),
-            patch("chat_handler.MilvusClient"),
-        ):
-            handler = ChatHandler()
-
-            # Setup mock clients
-            handler.embeddings = AsyncMock()
-            handler.reranker = AsyncMock()
-            handler.llm = AsyncMock()
-            handler.milvus = AsyncMock()
-            handler.tts = None  # TTS disabled by default
-            handler.nats = AsyncMock()
-
-            yield handler
-
-    @pytest.fixture
-    def handler_with_tts(self):
-        """Create handler with TTS enabled."""
-        with (
-            patch("chat_handler.EmbeddingsClient"),
-            patch("chat_handler.RerankerClient"),
-            patch("chat_handler.LLMClient"),
-            patch("chat_handler.TTSClient"),
-            patch("chat_handler.MilvusClient"),
-        ):
-            handler = ChatHandler()
-            handler.chat_settings.enable_tts = True
-
-            # Setup mock clients
-            handler.embeddings = AsyncMock()
-            handler.reranker = AsyncMock()
-            handler.llm = AsyncMock()
-            handler.milvus = AsyncMock()
-            handler.tts = AsyncMock()
-            handler.nats = AsyncMock()
-
-            yield handler
-
-    def test_init(self, handler):
-        """Test handler initialization."""
-        assert handler.subject == "ai.chat.user.*.message"
-        assert handler.queue_group == "chat-handlers"
-        assert handler.chat_settings.service_name == "chat-handler"
-
-    @pytest.mark.asyncio
-    async def test_handle_message_success(
-        self,
-        handler,
-        mock_nats_message,
-        mock_chat_request,
-        sample_embedding,
-        sample_documents,
-        sample_reranked,
-    ):
-        """Test successful chat request handling."""
-        # Setup mocks
-        handler.embeddings.embed_single.return_value = sample_embedding
-        handler.milvus.search_with_texts.return_value = sample_documents
-        handler.reranker.rerank.return_value = sample_reranked
-        handler.llm.generate.return_value = "Machine learning is a subset of AI that..."
-
-        # Execute
-        result = await handler.handle_message(mock_nats_message, mock_chat_request)
-
-        # Verify
-        assert result["user_id"] == "test-user-1"
-        assert result["success"] is True
-        assert "response" in result
-        assert result["response"] == "Machine learning is a subset of AI that..."
-        assert result["response_text"] == result["response"]
-        assert result["used_rag"] is True
-        assert isinstance(result["rag_sources"], list)
-
-        # Verify RAG pipeline was called (enable_rag=True in fixture)
-        handler.embeddings.embed_single.assert_called_once()
-        handler.milvus.search_with_texts.assert_called_once()
-        handler.reranker.rerank.assert_called_once()
-        handler.llm.generate.assert_called_once()
-
-    @pytest.mark.asyncio
-    async def test_handle_message_without_sources(
-        self,
-        handler,
-        mock_nats_message,
-        mock_chat_request,
-        sample_embedding,
-        sample_documents,
-        sample_reranked,
-    ):
-        """Test response without sources when disabled."""
-        handler.chat_settings.include_sources = False
-
-        handler.embeddings.embed_single.return_value = sample_embedding
-        handler.milvus.search_with_texts.return_value = sample_documents
-        handler.reranker.rerank.return_value = sample_reranked
-        handler.llm.generate.return_value = "Response text"
-
-        result = await handler.handle_message(mock_nats_message, mock_chat_request)
-
-        # New response format doesn't have a separate "sources" key;
-        # rag_sources is always present (may be empty)
-        assert "rag_sources" in result
-
-    @pytest.mark.asyncio
-    async def test_handle_message_with_tts(
-        self,
-        handler_with_tts,
-        mock_nats_message,
-        mock_chat_request_with_tts,
-        sample_embedding,
-        sample_documents,
-        sample_reranked,
-    ):
-        """Test response with TTS audio."""
-        handler = handler_with_tts
-
-        handler.embeddings.embed_single.return_value = sample_embedding
-        handler.milvus.search_with_texts.return_value = sample_documents
-        handler.reranker.rerank.return_value = sample_reranked
-        handler.llm.generate.return_value = "AI response"
-        handler.tts.synthesize.return_value = b"audio_bytes"
-
-        result = await handler.handle_message(mock_nats_message, mock_chat_request_with_tts)
-
-        assert "audio" in result
-        handler.tts.synthesize.assert_called_once()
-
-    @pytest.mark.asyncio
-    async def test_handle_message_with_custom_system_prompt(
-        self,
-        handler,
-        mock_nats_message,
-        sample_embedding,
-        sample_documents,
-        sample_reranked,
-    ):
-        """Test LLM is called with custom system prompt."""
-        request = {
-            "request_id": "test-123",
-            "user_id": "user-42",
-            "message": "Hello",
-            "premium": True,
-            "enable_rag": True,
-            "system_prompt": "You are a pirate. Respond like one.",
-        }
-
-        handler.embeddings.embed_single.return_value = sample_embedding
-        handler.milvus.search_with_texts.return_value = sample_documents
-        handler.reranker.rerank.return_value = sample_reranked
-        handler.llm.generate.return_value = "Ahoy!"
-
-        await handler.handle_message(mock_nats_message, request)
-
-        # Verify system_prompt was passed to LLM
-        handler.llm.generate.assert_called_once()
-        call_kwargs = handler.llm.generate.call_args.kwargs
-        assert call_kwargs.get("system_prompt") == "You are a pirate. Respond like one."
-
-    def test_build_context(self, handler):
-        """Test context building with numbered sources."""
-        documents = [
-            {"document": "First doc content"},
-            {"document": "Second doc content"},
-        ]
-
-        context = handler._build_context(documents)
-
-        assert "[1]" in context
-        assert "[2]" in context
-        assert "First doc content" in context
-        assert "Second doc content" in context
-
-    @pytest.mark.asyncio
-    async def test_setup_initializes_clients(self):
-        """Test that setup initializes all required clients."""
-        with (
-            patch("chat_handler.EmbeddingsClient") as emb_cls,
-            patch("chat_handler.RerankerClient") as rer_cls,
-            patch("chat_handler.LLMClient") as llm_cls,
-            patch("chat_handler.TTSClient") as tts_cls,
-            patch("chat_handler.MilvusClient") as mil_cls,
-        ):
-            mil_cls.return_value.connect = AsyncMock()
-
-            handler = ChatHandler()
-            await handler.setup()
-
-            emb_cls.assert_called_once()
-            rer_cls.assert_called_once()
-            llm_cls.assert_called_once()
-            mil_cls.assert_called_once()
-            # TTS should not be initialized when disabled
-            tts_cls.assert_not_called()
-
-    @pytest.mark.asyncio
-    async def test_teardown_closes_clients(self, handler):
-        """Test that teardown closes all clients."""
-        await handler.teardown()
-
-        handler.embeddings.close.assert_called_once()
-        handler.reranker.close.assert_called_once()
-        handler.llm.close.assert_called_once()
-        handler.milvus.close.assert_called_once()
-
-    @pytest.mark.asyncio
-    async def test_publishes_to_response_subject(
-        self,
-        handler,
-        mock_nats_message,
-        mock_chat_request,
-        sample_embedding,
-        sample_documents,
-        sample_reranked,
-    ):
-        """Test that result is published to response subject."""
-        handler.embeddings.embed_single.return_value = sample_embedding
-        handler.milvus.search_with_texts.return_value = sample_documents
-        handler.reranker.rerank.return_value = sample_reranked
-        handler.llm.generate.return_value = "Response"
-
-        await handler.handle_message(mock_nats_message, mock_chat_request)
-
-        handler.nats.publish.assert_called_once()
-        call_args = handler.nats.publish.call_args
-        assert "ai.chat.response.test-request-123" in str(call_args)
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Billy D.	a298568ef0	fix: use type=raw for Docker tags to preserve v prefix Some checks failed CI / Docker Build & Push (push) Failing after 3m3s Details CI / Notify (push) Successful in 2s Details CI / Lint (push) Successful in 2m54s Details CI / Test (push) Successful in 3m2s Details CI / Release (push) Successful in 1m15s Details docker/metadata-action type=semver strips the v prefix, causing tag mismatch between git tags (v0.1.3) and Docker tags (0.1.3). Switch to type=raw to pass through the version as-is.	2026-02-22 09:58:40 -05:00
Billy D.	4c3eb4952a	fix: switch Docker build to plain docker build/push with insecure registry Some checks failed CI / Lint (push) Successful in 2m50s Details CI / Release (push) Successful in 1m32s Details CI / Test (push) Successful in 2m51s Details CI / Docker Build & Push (push) Failing after 3m1s Details CI / Notify (push) Successful in 2s Details - Drop buildx (setup-buildx-action, build-push-action) - Use insecure HTTP registry with SIGHUP daemon reload - Use org-level PAT secrets for registry auth	2026-02-21 22:37:55 -05:00
Billy D.	4de3639df0	fix: use docker login CLI instead of login-action for Gitea compat Some checks failed CI / Lint (push) Successful in 2m18s Details CI / Release (push) Successful in 1m20s Details CI / Docker Build & Push (push) Failing after 5m43s Details CI / Test (push) Successful in 3m20s Details CI / Notify (push) Has been cancelled Details docker/login-action@v3 fails with 'Username and password required' on Gitea Actions — secrets not passed to action with: inputs. Switch to direct docker login CLI which reliably interpolates secrets in run: steps.	2026-02-21 19:34:27 -05:00
Billy D.	f6623d5473	fix: switch Docker registry to HTTPS endpoint with login-action Some checks failed CI / Release (push) Successful in 1m17s Details CI / Docker Build & Push (push) Failing after 8m21s Details CI / Test (push) Successful in 3m11s Details CI / Lint (push) Successful in 2m18s Details CI / Notify (push) Successful in 2s Details - Replace gitea-http.gitea.svc.cluster.local:3000 with registry.lab.daviestechlabs.io - Use docker/login-action@v3 for Gitea registry auth (proper buildx integration) - Remove manual base64 auth to ~/.docker/config.json (not picked up by buildkit) - Remove insecure registry daemon.json config and Docker restart - Remove buildkitd insecure registry config - Remove cache-from/cache-to type=gha (not supported on Gitea Actions) Fixes 401 Unauthorized: reqPackageAccess on Docker push	2026-02-21 18:05:40 -05:00
Billy D.	f61488a868	style: gofmt + fix copylocks lint warning Some checks failed CI / Test (push) Successful in 2m49s Details CI / Lint (push) Successful in 3m16s Details CI / Release (push) Successful in 2m8s Details CI / Docker Build & Push (push) Failing after 6m13s Details CI / Notify (push) Successful in 1s Details	2026-02-21 15:36:46 -05:00
Billy D.	e2176331c8	feat: migrate from msgpack to protobuf (handler-base v1.0.0) Some checks failed CI / Lint (push) Failing after 2m49s Details CI / Test (push) Successful in 3m36s Details CI / Notify (push) Has been cancelled Details CI / Docker Build & Push (push) Has been cancelled Details CI / Release (push) Has been cancelled Details - Replace msgpack encoding with protobuf wire format - Update field names to proto convention (UserId, RequestId, EnableRag, etc.) - Use messages.EffectiveQuery() standalone function - Cast TopK to int32 for proto compatibility - Rewrite tests for proto round-trips	2026-02-21 15:30:04 -05:00
Billy D.	87d0545d2c	feat: replace fake streaming with real SSE StreamGenerate Some checks failed CI / Lint (push) Successful in 3m0s Details CI / Test (push) Successful in 3m23s Details CI / Docker Build & Push (push) Failing after 4m55s Details CI / Release (push) Successful in 1m4s Details CI / Notify (push) Successful in 1s Details Use handler-base StreamGenerate() to publish real token-by-token ChatStreamChunk messages to NATS as they arrive from Ray Serve, instead of calling Generate() and splitting into 4-word chunks. Add 8 streaming tests: happy path, system prompt, RAG context, nil callback, timeout, HTTP error, context canceled, fallback.	2026-02-21 09:23:57 -05:00
Billy D.	7678d911fe	chore: bump handler-base to v0.1.5, add netrc secret mount to Dockerfile Some checks failed CI / Release (push) Successful in 1m11s Details CI / Lint (push) Successful in 3m55s Details CI / Test (push) Successful in 3m36s Details CI / Docker Build & Push (push) Failing after 8m10s Details CI / Notify (push) Successful in 1s Details	2026-02-20 18:15:47 -05:00
Billy D.	7d72400070	ci: retrigger build Some checks failed CI / Test (push) Successful in 2m51s Details CI / Release (push) Successful in 1m3s Details CI / Docker Build & Push (push) Failing after 28s Details CI / Lint (push) Successful in 3m27s Details CI / Notify (push) Successful in 1s Details	2026-02-20 10:05:21 -05:00
Billy D.	451e600985	fix: add GOPRIVATE and git auth for private handler-base module Some checks failed CI / Lint (push) Successful in 2m32s Details CI / Test (push) Successful in 2m54s Details CI / Release (push) Successful in 1m1s Details CI / Docker Build & Push (push) Failing after 6m40s Details CI / Notify (push) Successful in 1s Details - Set GOPRIVATE=git.daviestechlabs.io to bypass public Go module proxy - Configure git URL insteadOf with DISPATCH_TOKEN for private repo access	2026-02-20 09:22:35 -05:00
Billy D.	4b7b434eb7	fix: rename GITEA_TOKEN to DISPATCH_TOKEN Some checks failed CI / Lint (push) Has been cancelled Details CI / Test (push) Has been cancelled Details CI / Release (push) Has been cancelled Details CI / Docker Build & Push (push) Has been cancelled Details CI / Notify (push) Has been cancelled Details	2026-02-20 09:10:20 -05:00
Billy D.	808f41bc90	ci: add handler-base auto-update workflow, remove old Python CI Some checks failed CI / Lint (push) Has been cancelled Details CI / Test (push) Has been cancelled Details CI / Release (push) Has been cancelled Details CI / Docker Build & Push (push) Has been cancelled Details CI / Notify (push) Has been cancelled Details	2026-02-20 09:05:58 -05:00
Billy D.	2fa3668e8a	fix: use tagged handler-base v0.1.3, remove local replace directive Some checks failed CI / Docker Build & Push (push) Has been cancelled Details CI / Notify (push) Has been cancelled Details CI / Lint (push) Failing after 57s Details CI / Test (push) Failing after 1m34s Details CI / Release (push) Has been cancelled Details	2026-02-20 09:00:45 -05:00
Billy D.	b48fbb424d	fix: resolve golangci-lint errcheck warnings Some checks failed CI / Lint (push) Failing after 57s Details CI / Test (push) Failing after 1m22s Details CI / Release (push) Has been cancelled Details CI / Docker Build & Push (push) Has been cancelled Details CI / Notify (push) Has been cancelled Details - Add error checks for unchecked return values (errcheck) - Remove unused struct fields (unused) - Fix gofmt formatting issues	2026-02-20 08:45:24 -05:00
Billy Davies	c27d192705	Merge pull request 'feature/go-handler-refactor' (#1 ) from feature/go-handler-refactor into main Some checks failed CI / Lint (push) Failing after 57s Details CI / Test (push) Failing after 1m24s Details CI / Notify (push) Successful in 1s Details CI / Release (push) Has been skipped Details CI / Docker Build & Push (push) Has been skipped Details Reviewed-on: #1	2026-02-20 12:33:32 +00:00
Billy D.	4175e2070c	feat: migrate to typed messages, drop base64 Some checks failed CI / Lint (pull_request) Failing after 58s Details CI / Test (pull_request) Failing after 1m22s Details CI / Notify (pull_request) Successful in 1s Details CI / Release (pull_request) Has been skipped Details CI / Docker Build & Push (pull_request) Has been skipped Details - Switch OnMessage → OnTypedMessage with natsutil.Decode[messages.ChatRequest] - Return messages.ChatResponse / messages.ChatStreamChunk (not map[string]any) - Audio as raw []byte in msgpack (25% wire savings vs base64) - Remove strVal/boolVal/intVal helpers - Add .dockerignore, GOAMD64=v3 in Dockerfile - Update tests for typed structs (9 tests pass)	2026-02-20 07:10:43 -05:00
Billy D.	609b44de83	feat: add e2e tests + benchmarks, fix config API - e2e_test.go: full pipeline tests (LLM-only, RAG, TTS, timeout) - main.go: fix config field->method references (EmbeddingsURL() etc.) - Benchmarks: LLMOnly 136µs/op, RAGFlow 496µs/op	2026-02-20 06:45:21 -05:00
Billy D.	adcdb87b9a	feat: rewrite chat-handler in Go Replace Python chat handler with Go for smaller container images. Uses handler-base Go module for NATS, health, telemetry, and service clients. - RAG pipeline: embed → Milvus → rerank → LLM - Streaming response chunks - Optional TTS synthesis - Custom response_subject support for companions-frontend	2026-02-19 17:58:52 -05:00