Fix indexer: handle date serialization and pgvector format

main
Clawd 3 weeks ago
parent 90961cb62e
commit 773199bcb6

@ -7,6 +7,7 @@ from __future__ import annotations
import hashlib import hashlib
import json import json
import logging import logging
from datetime import date, datetime
from typing import Any from typing import Any
import asyncpg import asyncpg
@ -21,6 +22,14 @@ def sha256(text: str) -> str:
return hashlib.sha256(text.encode('utf-8')).hexdigest() return hashlib.sha256(text.encode('utf-8')).hexdigest()
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``date``/``datetime`` values as ISO strings.

    Pass it as ``cls=DateTimeEncoder`` to ``json.dumps`` so that ``date`` and
    ``datetime`` instances are emitted via ``isoformat()`` instead of making
    the stock encoder raise ``TypeError``.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serializable stand-in for *obj*.

        Args:
            obj: A value the base encoder could not serialize on its own.

        Returns:
            ``obj.isoformat()`` when *obj* is a ``date`` or ``datetime``.

        Raises:
            TypeError: Raised by the base class for any other unsupported
                type.
        """
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        # Anything else is left to the base implementation, which raises
        # TypeError for types it does not know how to encode.
        return super().default(obj)
async def upsert_document( async def upsert_document(
conn: asyncpg.Connection, conn: asyncpg.Connection,
doc: ParsedDocument, doc: ParsedDocument,
@ -56,7 +65,7 @@ async def upsert_document(
doc.title, doc.title,
doc.content_raw, doc.content_raw,
content_hash, content_hash,
json.dumps(doc.frontmatter), json.dumps(doc.frontmatter, cls=DateTimeEncoder),
doc.tags, doc.tags,
doc.aliases, doc.aliases,
doc.word_count, doc.word_count,
@ -71,12 +80,14 @@ async def upsert_document(
# ---- Insert chunks + embeddings ---- # ---- Insert chunks + embeddings ----
chunk_records = [] chunk_records = []
for chunk, embedding in zip(chunks, embeddings): for chunk, embedding in zip(chunks, embeddings):
# Convert embedding list to pgvector string format
embedding_str = '[' + ','.join(str(x) for x in embedding) + ']'
chunk_records.append(( chunk_records.append((
row['id'], row['id'],
chunk.chunk_index, chunk.chunk_index,
chunk.content, chunk.content,
chunk.token_count, chunk.token_count,
embedding, embedding_str,
json.dumps(chunk.metadata), json.dumps(chunk.metadata),
)) ))

Loading…
Cancel
Save

Powered by TurnKey Linux.