|
|
|
|
@ -7,6 +7,7 @@ from __future__ import annotations
|
|
|
|
|
import hashlib
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
from datetime import date, datetime
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
import asyncpg
|
|
|
|
|
@ -21,6 +22,14 @@ def sha256(text: str) -> str:
|
|
|
|
|
return hashlib.sha256(text.encode('utf-8')).hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that handles date/datetime objects.

    Pass it as ``cls=DateTimeEncoder`` to ``json.dumps`` so that ``date`` and
    ``datetime`` values are written as ISO 8601 strings instead of making
    serialization fail.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serializable stand-in for *obj*.

        The ``json`` machinery calls this hook only for objects it cannot
        serialize natively.

        Args:
            obj: The value the encoder could not serialize on its own.

        Returns:
            ``obj.isoformat()`` when *obj* is a ``date`` or ``datetime``.

        Raises:
            TypeError: Raised by the base class for any other unsupported type.
        """
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        # Anything else is delegated to the base implementation so that
        # unsupported types still surface as the standard TypeError.
        return super().default(obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def upsert_document(
|
|
|
|
|
conn: asyncpg.Connection,
|
|
|
|
|
doc: ParsedDocument,
|
|
|
|
|
@ -56,7 +65,7 @@ async def upsert_document(
|
|
|
|
|
doc.title,
|
|
|
|
|
doc.content_raw,
|
|
|
|
|
content_hash,
|
|
|
|
|
json.dumps(doc.frontmatter),
|
|
|
|
|
json.dumps(doc.frontmatter, cls=DateTimeEncoder),
|
|
|
|
|
doc.tags,
|
|
|
|
|
doc.aliases,
|
|
|
|
|
doc.word_count,
|
|
|
|
|
@ -71,12 +80,14 @@ async def upsert_document(
|
|
|
|
|
# ---- Insert chunks + embeddings ----
|
|
|
|
|
chunk_records = []
|
|
|
|
|
for chunk, embedding in zip(chunks, embeddings):
|
|
|
|
|
# Convert embedding list to pgvector string format
|
|
|
|
|
embedding_str = '[' + ','.join(str(x) for x in embedding) + ']'
|
|
|
|
|
chunk_records.append((
|
|
|
|
|
row['id'],
|
|
|
|
|
chunk.chunk_index,
|
|
|
|
|
chunk.content,
|
|
|
|
|
chunk.token_count,
|
|
|
|
|
embedding,
|
|
|
|
|
embedding_str,
|
|
|
|
|
json.dumps(chunk.metadata),
|
|
|
|
|
))
|
|
|
|
|
|
|
|
|
|
|