-- second-brain/infra/database/schema.sql

-- AI Second Brain — PostgreSQL Schema
-- Requires: PostgreSQL 14+ with pgvector extension

-- Enable extensions
-- Each statement is a no-op when the extension is already installed.
-- NOTE(review): no statement in this file calls a uuid-ossp function (all id
-- defaults use gen_random_uuid()); presumably kept for other code — confirm.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- pgvector: supplies the VECTOR column type and the vector index used on chunks.
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;  -- for fuzzy text search (gin_trgm_ops on entities.name)

-- ---------------------------------------------------------------------------
-- DOCUMENTS
-- Represents a single Markdown file in the vault.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS documents (
    -- Surrogate key, generated by the database.
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Path of the Markdown file relative to the vault; one row per path.
    path TEXT NOT NULL UNIQUE,
    title TEXT,
    -- Complete raw Markdown of the file.
    content TEXT NOT NULL,
    -- SHA-256 of the content, used for change detection.
    content_hash TEXT NOT NULL,
    frontmatter JSONB NOT NULL DEFAULT '{}',
    tags TEXT[] NOT NULL DEFAULT '{}',
    aliases TEXT[] NOT NULL DEFAULT '{}',
    word_count INTEGER,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Refreshed on every row UPDATE by trig_documents_updated_at.
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    indexed_at TIMESTAMPTZ,
    -- Weighted full-text vector; filled in by trig_documents_fts.
    fts_vector TSVECTOR
);

-- Secondary indexes for documents.
--
-- There is deliberately no separate index on path: the UNIQUE constraint on
-- documents.path already maintains a btree index on that column, so
-- idx_documents_path was an exact duplicate that only added write and storage
-- overhead. Drop it where an earlier run of this script created it.
DROP INDEX IF EXISTS idx_documents_path;

-- GIN indexes over the array, tsvector and JSONB columns.
CREATE INDEX IF NOT EXISTS idx_documents_tags        ON documents USING GIN (tags);
CREATE INDEX IF NOT EXISTS idx_documents_aliases     ON documents USING GIN (aliases);
CREATE INDEX IF NOT EXISTS idx_documents_fts         ON documents USING GIN (fts_vector);
CREATE INDEX IF NOT EXISTS idx_documents_frontmatter ON documents USING GIN (frontmatter);

-- Btree ordered newest-first on the modification timestamp.
CREATE INDEX IF NOT EXISTS idx_documents_updated     ON documents (updated_at DESC);

-- Trigger function: rebuilds NEW.fts_vector from the row being written.
-- Weights: title = A, tags (joined with spaces) = B, content = C, all parsed
-- with the 'english' text search configuration. NULL inputs count as empty text.
CREATE OR REPLACE FUNCTION documents_fts_trigger()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
DECLARE
    title_vec TSVECTOR;
    tags_vec  TSVECTOR;
    body_vec  TSVECTOR;
BEGIN
    title_vec := setweight(to_tsvector('english', coalesce(NEW.title, '')), 'A');
    tags_vec  := setweight(to_tsvector('english', coalesce(array_to_string(NEW.tags, ' '), '')), 'B');
    body_vec  := setweight(to_tsvector('english', coalesce(NEW.content, '')), 'C');

    -- Same left-to-right concatenation order as before: title, tags, content.
    NEW.fts_vector := title_vec || tags_vec || body_vec;
    RETURN NEW;
END;
$$;

-- (Re)attach the full-text trigger; dropping first keeps the script re-runnable.
-- The UPDATE branch is limited to the columns documents_fts_trigger() reads
-- (title, tags, content) plus fts_vector itself. Updates that name none of
-- them in their SET list no longer re-parse the whole document body, while a
-- direct write to fts_vector is still replaced by the derived value.
DROP TRIGGER IF EXISTS trig_documents_fts ON documents;
CREATE TRIGGER trig_documents_fts
    BEFORE INSERT OR UPDATE OF title, tags, content, fts_vector ON documents
    FOR EACH ROW EXECUTE FUNCTION documents_fts_trigger();

-- Generic row-level trigger function: stamps NEW.updated_at with now() and
-- returns the row so the write proceeds.
CREATE OR REPLACE FUNCTION set_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := now();
    RETURN NEW;
END;
$$;

-- Attach it to documents for every row UPDATE (inserts use the column default).
-- Drop-then-create keeps the script re-runnable.
DROP TRIGGER IF EXISTS trig_documents_updated_at ON documents;
CREATE TRIGGER trig_documents_updated_at
    BEFORE UPDATE
    ON documents
    FOR EACH ROW
    EXECUTE FUNCTION set_updated_at();

-- ---------------------------------------------------------------------------
-- CHUNKS
-- Sliding-window text chunks from documents, each with an embedding vector.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Owning document; its chunks are deleted together with it.
    document_id UUID NOT NULL REFERENCES documents (id) ON DELETE CASCADE,
    -- Ordinal of the chunk; unique per document (see the constraint below).
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    token_count INTEGER,
    -- 768 dimensions, the nomic-embed-text size; the column is nullable.
    embedding VECTOR(768),
    -- Free-form extras: heading path, page, etc.
    metadata JSONB NOT NULL DEFAULT '{}',
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (document_id, chunk_index)
);

-- Btree on the owning document.
-- NOTE(review): UNIQUE (document_id, chunk_index) already provides a btree led
-- by document_id, so this index may be redundant — confirm before dropping.
CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks (document_id);

-- Approximate nearest-neighbour index on the embedding (cosine distance).
-- HNSW requires pgvector >= 0.5.0. A bare CREATE INDEX ... USING hnsw errors
-- out on older builds, so probe pg_am for the access method and fall back to
-- an IVFFlat index when hnsw is not registered.
DO $$
BEGIN
    IF EXISTS (SELECT 1 FROM pg_am WHERE amname = 'hnsw') THEN
        CREATE INDEX IF NOT EXISTS idx_chunks_embedding_hnsw
            ON chunks USING hnsw (embedding vector_cosine_ops)
            WITH (m = 16, ef_construction = 64);
    ELSE
        -- lists = 100 is pgvector's default.
        -- NOTE(review): IVFFlat recall depends on the data present at build
        -- time; consider a REINDEX once the table is populated.
        CREATE INDEX IF NOT EXISTS idx_chunks_embedding_ivfflat
            ON chunks USING ivfflat (embedding vector_cosine_ops)
            WITH (lists = 100);
    END IF;
END
$$;

-- ---------------------------------------------------------------------------
-- ENTITIES
-- Named entities extracted from documents (optional NER layer).
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS entities (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Document the entity was extracted from; rows go away with the document.
    document_id UUID NOT NULL REFERENCES documents (id) ON DELETE CASCADE,
    name TEXT NOT NULL,
    -- Category label: PERSON, ORG, CONCEPT, PLACE, etc.
    entity_type TEXT NOT NULL,
    -- Sentence surrounding the mention.
    context TEXT,
    confidence FLOAT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Btree indexes on the owning document, the name and the entity type.
CREATE INDEX IF NOT EXISTS idx_entities_document_id
    ON entities (document_id);
CREATE INDEX IF NOT EXISTS idx_entities_name
    ON entities (name);
CREATE INDEX IF NOT EXISTS idx_entities_type
    ON entities (entity_type);
-- Trigram GIN index on name (operator class comes from pg_trgm).
CREATE INDEX IF NOT EXISTS idx_entities_name_trgm
    ON entities USING GIN (name gin_trgm_ops);

-- ---------------------------------------------------------------------------
-- RELATIONS
-- WikiLink / explicit relations between documents.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS relations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Document containing the link; the edge is deleted with it.
    source_doc_id UUID NOT NULL REFERENCES documents (id) ON DELETE CASCADE,
    -- Link target exactly as written; it may not resolve to a document.
    target_path TEXT NOT NULL,
    -- Resolved target, if any; reset to NULL when that document is deleted.
    target_doc_id UUID REFERENCES documents (id) ON DELETE SET NULL,
    -- One of: wikilink | tag | explicit | ai-inferred
    relation_type TEXT NOT NULL DEFAULT 'wikilink',
    -- Optional human-readable label for the edge.
    label TEXT,
    -- Text surrounding the link.
    context TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Btree indexes on both ends of the edge (source id, resolved target id, raw
-- target path) and on the relation type.
CREATE INDEX IF NOT EXISTS idx_relations_source
    ON relations (source_doc_id);
CREATE INDEX IF NOT EXISTS idx_relations_target_id
    ON relations (target_doc_id);
CREATE INDEX IF NOT EXISTS idx_relations_target_path
    ON relations (target_path);
CREATE INDEX IF NOT EXISTS idx_relations_type
    ON relations (relation_type);

-- ---------------------------------------------------------------------------
-- AGENT JOBS
-- Persistent job queue consumed by AI agents.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS agent_jobs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- One of: ingestion | linking | tagging | summarization | maintenance
    agent_type TEXT NOT NULL,
    -- One of: pending | running | done | failed | cancelled
    status TEXT NOT NULL DEFAULT 'pending',
    -- 1 (highest) .. 10 (lowest)
    priority INTEGER NOT NULL DEFAULT 5,
    payload JSONB NOT NULL DEFAULT '{}',
    result JSONB,
    error TEXT,
    retry_count INTEGER NOT NULL DEFAULT 0,
    max_retries INTEGER NOT NULL DEFAULT 3,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    -- Defaults to the insertion time.
    scheduled_for TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Btree indexes on job state and on agent type.
CREATE INDEX IF NOT EXISTS idx_agent_jobs_status
    ON agent_jobs (status);
CREATE INDEX IF NOT EXISTS idx_agent_jobs_type
    ON agent_jobs (agent_type);
-- Partial index: covers only rows whose status is 'pending', ordered by their
-- scheduled time.
CREATE INDEX IF NOT EXISTS idx_agent_jobs_scheduled
    ON agent_jobs (scheduled_for ASC)
    WHERE status = 'pending';

-- ---------------------------------------------------------------------------
-- AGENT LOGS
-- Structured log entries written by agents.
-- ---------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS agent_logs (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Optional link to the job; set to NULL if the job row is deleted, so the
    -- log entry itself is kept.
    job_id UUID REFERENCES agent_jobs (id) ON DELETE SET NULL,
    agent_type TEXT NOT NULL,
    -- One of: debug | info | warning | error
    level TEXT NOT NULL DEFAULT 'info',
    message TEXT NOT NULL,
    metadata JSONB NOT NULL DEFAULT '{}',
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Btree indexes on the owning job, on creation time (newest first) and on level.
CREATE INDEX IF NOT EXISTS idx_agent_logs_job_id
    ON agent_logs (job_id);
CREATE INDEX IF NOT EXISTS idx_agent_logs_created
    ON agent_logs (created_at DESC);
CREATE INDEX IF NOT EXISTS idx_agent_logs_level
    ON agent_logs (level);

-- ---------------------------------------------------------------------------
-- SYSTEM CONFIG
-- Runtime key-value configuration, editable by agents and admins.
-- ---------------------------------------------------------------------------
-- One row per setting: key is the setting name, value its JSONB-encoded value.
CREATE TABLE IF NOT EXISTS system_config (
    key             TEXT PRIMARY KEY,
    value           JSONB NOT NULL,
    description     TEXT,
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Keep updated_at accurate when a setting is edited. Without a trigger the
-- column would keep its insert-time value after an UPDATE. This reuses
-- set_updated_at(), the function that already maintains documents.updated_at.
-- Drop-then-create keeps the script re-runnable.
DROP TRIGGER IF EXISTS trig_system_config_updated_at ON system_config;
CREATE TRIGGER trig_system_config_updated_at
    BEFORE UPDATE ON system_config
    FOR EACH ROW EXECUTE FUNCTION set_updated_at();

-- Default settings. Keys that already exist are left untouched, so re-running
-- the script never overwrites a value.
INSERT INTO system_config (key, value, description)
VALUES
    -- Models
    ('embedding_model', '"nomic-embed-text"', 'Ollama model for embeddings'),
    ('chat_model', '"mistral"', 'Ollama model for chat/generation'),
    -- Chunking
    ('chunk_size', '700', 'Target tokens per chunk'),
    ('chunk_overlap', '70', 'Overlap tokens between chunks'),
    -- Search
    ('search_top_k', '10', 'Default number of search results'),
    ('search_threshold', '0.65', 'Minimum cosine similarity score'),
    ('rerank_enabled', 'false', 'Enable cross-encoder reranking'),
    -- Automation
    ('auto_tag', 'true', 'Auto-tag documents via LLM'),
    ('auto_summarize', 'true', 'Auto-summarize long documents')
ON CONFLICT (key) DO NOTHING;