Fix indexer: handle date serialization and pgvector format

3 weeks ago · 773199bcb6
parent 90961cb62e
commit 773199bcb6
1 changed file with 13 additions and 2 deletions
--- a/services/ingestion-worker/indexer.py
+++ b/services/ingestion-worker/indexer.py
@@ -7,6 +7,7 @@ from __future__ import annotations
 import hashlib
 import json
 import logging
+from datetime import date, datetime
 from typing import Any

 import asyncpg
@@ -21,6 +22,14 @@ def sha256(text: str) -> str:
    return hashlib.sha256(text.encode('utf-8')).hexdigest()


+class DateTimeEncoder(json.JSONEncoder):
+    """JSON encoder that emits date/datetime objects as ISO 8601 strings."""
+    def default(self, obj):
+        if isinstance(obj, (date, datetime)):  # datetime subclasses date; both take the same path
+            return obj.isoformat()
+        return super().default(obj)  # base implementation raises TypeError for unsupported types
+
+
 async def upsert_document(
    conn: asyncpg.Connection,
    doc: ParsedDocument,
@@ -56,7 +65,7 @@ async def upsert_document(
            doc.title,
            doc.content_raw,
            content_hash,
-            json.dumps(doc.frontmatter),
+            json.dumps(doc.frontmatter, cls=DateTimeEncoder),
            doc.tags,
            doc.aliases,
            doc.word_count,
@@ -71,12 +80,14 @@ async def upsert_document(
        # ---- Insert chunks + embeddings ----
        chunk_records = []
        for chunk, embedding in zip(chunks, embeddings):
+            # Convert embedding list to pgvector string format
+            embedding_str = '[' + ','.join(str(x) for x in embedding) + ']'
            chunk_records.append((
                row['id'],
                chunk.chunk_index,
                chunk.content,
                chunk.token_count,
-                embedding,
+                embedding_str,
                json.dumps(chunk.metadata),
            ))