"""
pipeline.py — Orchestrates the full ingestion flow for a single file.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import asyncpg
|
|
|
|
from chunker import chunk_document
|
|
from embedder import get_embedder
|
|
from indexer import document_needs_reindex, upsert_document
|
|
from parser import parse_document
|
|
from settings import Settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _sha256(text: str) -> str:
|
|
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
|
|
|
|
|
async def ingest_file(
    file_path: Path,
    settings: Settings,
    conn: asyncpg.Connection,
) -> bool:
    """
    Full ingestion pipeline for a single Markdown file.

    Steps: existence/extension guards → content-hash idempotency check →
    parse → chunk → embed → store.

    Args:
        file_path: Path to the candidate file; must live under the vault root.
        settings: Runtime configuration (vault path, chunking parameters,
            embedding provider/model).
        conn: Open asyncpg connection used for the idempotency check and
            the final upsert.

    Returns:
        True if the file was (re)indexed, False if skipped (missing file,
        non-Markdown extension, unchanged content, or no chunks produced).

    Raises:
        ValueError: If the produced embedding dimension does not match the
            embedder's declared dimensionality, or (from ``Path.relative_to``)
            if *file_path* lies outside the vault root.
    """
    vault_root = Path(settings.vault_path)

    if not file_path.exists():
        logger.warning('File not found, skipping: %s', file_path)
        return False

    # Only Markdown files are indexed; everything else is silently skipped.
    # (Fixed: was the non-idiomatic `not ... == '.md'` form.)
    if file_path.suffix.lower() != '.md':
        return False

    # errors='replace' so a stray invalid byte can't abort the whole ingest;
    # the replacement characters also flow into the hash, keeping it stable.
    raw_text = file_path.read_text(encoding='utf-8', errors='replace')
    content_hash = _sha256(raw_text)
    relative_path = str(file_path.relative_to(vault_root))

    # Idempotency check: skip when the stored hash matches current content.
    if not await document_needs_reindex(conn, relative_path, content_hash):
        logger.debug('Skipping unchanged file: %s', relative_path)
        return False

    logger.info('Ingesting %s', relative_path)

    # Parse
    doc = parse_document(file_path, vault_root)

    # Chunk
    chunks = chunk_document(
        doc.content_text,
        target_tokens=settings.chunk_size,
        overlap_tokens=settings.chunk_overlap,
    )

    if not chunks:
        logger.warning('No chunks generated for %s', relative_path)
        return False

    # Embed
    embedder = get_embedder(
        provider=settings.embedding_provider,
        ollama_url=settings.ollama_url,
        model=settings.embedding_model,
    )
    texts = [c.content for c in chunks]
    embeddings = embedder.embed_batch(texts)

    # Validate embedding dimension consistency before touching the database,
    # so a misconfigured model can't write malformed vectors.
    if embeddings and len(embeddings[0]) != embedder.dimensions:
        logger.error(
            'Embedding dimension mismatch: expected %d, got %d',
            embedder.dimensions,
            len(embeddings[0]),
        )
        raise ValueError('Embedding dimension mismatch')

    # Store
    doc_id = await upsert_document(conn, doc, chunks, embeddings)
    logger.info('Indexed %s → %s (%d chunks)', relative_path, doc_id, len(chunks))
    return True
|