diff --git a/services/ingestion-worker/indexer.py b/services/ingestion-worker/indexer.py
index a439707..90f35a4 100644
--- a/services/ingestion-worker/indexer.py
+++ b/services/ingestion-worker/indexer.py
@@ -7,6 +7,7 @@ from __future__ import annotations
 import hashlib
 import json
 import logging
+from datetime import date, datetime
 from typing import Any
 
 import asyncpg
@@ -21,6 +22,14 @@ def sha256(text: str) -> str:
     return hashlib.sha256(text.encode('utf-8')).hexdigest()
 
 
+class DateTimeEncoder(json.JSONEncoder):
+    """JSON encoder that handles date/datetime objects."""
+    def default(self, obj):
+        if isinstance(obj, (date, datetime)):
+            return obj.isoformat()
+        return super().default(obj)
+
+
 async def upsert_document(
     conn: asyncpg.Connection,
     doc: ParsedDocument,
@@ -56,7 +65,7 @@ async def upsert_document(
         doc.title,
         doc.content_raw,
         content_hash,
-        json.dumps(doc.frontmatter),
+        json.dumps(doc.frontmatter, cls=DateTimeEncoder),
         doc.tags,
         doc.aliases,
         doc.word_count,
@@ -71,12 +80,14 @@ async def upsert_document(
     # ---- Insert chunks + embeddings ----
     chunk_records = []
     for chunk, embedding in zip(chunks, embeddings):
+        # Convert embedding list to pgvector string format
+        embedding_str = '[' + ','.join(str(x) for x in embedding) + ']'
         chunk_records.append((
             row['id'],
             chunk.chunk_index,
             chunk.content,
             chunk.token_count,
-            embedding,
+            embedding_str,
             json.dumps(chunk.metadata),
         ))
 