You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.6 KiB
81 lines
2.6 KiB
"""
|
|
summarization/agent.py — Summarization Agent: generates summaries for long documents.
|
|
"""
|
|
|
|
from __future__ import annotations

import json
import logging
import re

import httpx

from base_agent import BaseAgent
|
|
|
|
logger = logging.getLogger('agent.summarization')

# Prompt template for one summarization request. ``{title}`` and ``{content}``
# are ``str.format`` replacement fields; the template deliberately ends on
# "Summary:" with no trailing newline.
SUMMARY_PROMPT = """You are a knowledge management assistant.
Write a concise 2-4 sentence summary of the following document.
The summary should capture the main ideas and be useful for quick reference.
Respond with only the summary, no preamble.

Title: {title}

Content:
{content}

Summary:"""
|
|
|
|
|
|
class SummarizationAgent(BaseAgent):
    """Agent that stores LLM-generated summaries in document frontmatter.

    Each run selects up to 10 rows from ``documents`` with ``word_count > 500``
    whose frontmatter has no non-empty ``summary`` key, requests a summary for
    each from ``<ollama_url>/api/generate``, and writes the result back under
    ``frontmatter['summary']``.
    """

    agent_type = 'summarization'

    async def process(self, job_id: str, payload: dict) -> dict:
        """Summarize one batch of long documents that lack a summary.

        Args:
            job_id: Identifier of the job being run; not used by this agent.
            payload: Job payload; not used by this agent.

        Returns:
            ``{'documents_summarized': n}``, where ``n`` counts the documents
            whose frontmatter was updated during this call.
        """
        ollama_url = self.settings.ollama_url
        model = self.settings.chat_model

        # NOTE(review): the pooled connection stays checked out while the
        # per-document HTTP calls run (60 s timeout each) — confirm the pool
        # is sized for that.
        async with self.pool.acquire() as conn:
            # Long documents that don't have a summary in frontmatter
            docs = await conn.fetch(
                """
                SELECT id::text, title, content, frontmatter
                FROM documents
                WHERE word_count > 500
                  AND (frontmatter->>'summary' IS NULL OR frontmatter->>'summary' = '')
                LIMIT 10
                """
            )

            summarized = 0
            for doc in docs:
                doc_id = doc['id']
                title = doc['title'] or ''
                # Only the first 4000 characters are sent to the model.
                content = (doc['content'] or '')[:4000]

                # Best-effort per document: one failure is logged and must not
                # abort the rest of the batch.
                try:
                    summary = await self._generate_summary(
                        title, content, ollama_url, model
                    )
                    if summary:
                        fm = self._frontmatter_as_dict(doc['frontmatter'])
                        fm['summary'] = summary
                        await conn.execute(
                            "UPDATE documents SET frontmatter = $2::jsonb WHERE id = $1::uuid",
                            doc_id, json.dumps(fm),
                        )
                        summarized += 1
                        logger.debug('Summarized: %s', title)
                except Exception as exc:
                    logger.warning('Failed to summarize %s: %s', doc_id, exc)

        return {'documents_summarized': summarized}

    @staticmethod
    def _frontmatter_as_dict(raw) -> dict:
        """Return a fresh dict built from a ``frontmatter`` column value.

        The value may already be a mapping, or it may be JSON text: the driver
        (presumably asyncpg — the pool setup is not visible here) hands
        json/jsonb columns back as ``str`` unless a type codec is registered.
        Text is decoded first so that ``dict()`` is never applied to a raw
        string.

        Args:
            raw: Column value; ``None``, JSON text, or a mapping.

        Returns:
            A new dict; empty when ``raw`` is ``None``, empty, or JSON ``null``.

        Raises:
            ValueError: If the text is not valid JSON or the decoded value
                cannot be converted to a dict.
            TypeError: If the decoded value cannot be converted to a dict.
        """
        if isinstance(raw, (str, bytes, bytearray)):
            raw = json.loads(raw) if raw else None
        return dict(raw or {})

    async def _generate_summary(
        self, title: str, content: str, ollama_url: str, model: str
    ) -> str:
        """Request a summary for one document from the generate endpoint.

        Args:
            title: Document title inserted into the prompt.
            content: Document text inserted into the prompt.
            ollama_url: Base URL of the server; a trailing ``/`` is tolerated.
            model: Model name sent in the request body.

        Returns:
            The stripped ``response`` field of the JSON reply, or ``''`` when
            that field is missing or null.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            httpx.HTTPError: On transport failures such as timeouts.
        """
        prompt = SUMMARY_PROMPT.format(title=title, content=content)
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                f'{ollama_url.rstrip("/")}/api/generate',
                json={'model': model, 'prompt': prompt, 'stream': False},
            )
            resp.raise_for_status()
            # `or ''` also covers an explicit JSON null, which .get()'s
            # default would not.
            return (resp.json().get('response') or '').strip()