You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

135 lines
4.2 KiB

"""
parser.py — Markdown vault document parser.
Extracts:
- YAML frontmatter (title, tags, aliases, date, custom fields)
- Plain text content (Markdown stripped)
- WikiLinks [[target|alias]] and #tags
- Word count
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import frontmatter # python-frontmatter
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class ParsedDocument:
    """Structured result of parsing one Markdown vault file."""
    path: str                    # path relative to the vault root
    title: str                   # frontmatter 'title', else first H1, else file stem
    content_raw: str             # original markdown
    content_text: str            # plain text (markdown stripped)
    frontmatter: dict[str, Any]  # full YAML frontmatter mapping (all fields)
    tags: list[str]              # frontmatter + inline #tags, lower-cased, de-duplicated
    aliases: list[str]           # frontmatter aliases, normalised to list[str]
    wikilinks: list[str]         # resolved link targets
    word_count: int              # whitespace-split token count of content_text
# ---------------------------------------------------------------------------
# Regexes
# ---------------------------------------------------------------------------
_WIKILINK_RE = re.compile(r'\[\[([^\[\]|]+)(?:\|[^\[\]]+)?\]\]')
_INLINE_TAG_RE = re.compile(r'(?<!\w)#([\w/-]+)')
_HEADING_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)
_MARKDOWN_LINK_RE = re.compile(r'!?\[([^\]]*)\]\([^\)]*\)')
_CODE_BLOCK_RE = re.compile(r'```[\s\S]*?```|`[^`]+`', re.MULTILINE)
_HTML_RE = re.compile(r'<[^>]+>')
_HORIZONTAL_RULE_RE = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE)
def _strip_markdown(text: str) -> str:
"""Convert Markdown to plain text (lightweight, no external deps)."""
# Remove code blocks first (preserve whitespace context)
text = _CODE_BLOCK_RE.sub(' ', text)
# Remove headings marker characters
text = _HEADING_RE.sub('', text)
# Replace Markdown links with their label
text = _MARKDOWN_LINK_RE.sub(r'\1', text)
# Replace WikiLinks with their display text (or target)
text = _WIKILINK_RE.sub(lambda m: m.group(1).split('/')[-1], text)
# Remove HTML tags
text = _HTML_RE.sub(' ', text)
# Remove horizontal rules
text = _HORIZONTAL_RULE_RE.sub('', text)
# Normalise whitespace
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------
def parse_document(file_path: Path, vault_root: Path) -> ParsedDocument:
    """
    Parse a single Markdown file and return a ``ParsedDocument``.

    Args:
        file_path: Absolute path to the Markdown file.
        vault_root: Absolute path to the vault root (used to compute the
            relative path; ``file_path`` must live under it).

    Raises:
        ValueError: If ``file_path`` is not located under ``vault_root``.
    """
    # errors='replace' keeps parsing alive on files with broken encodings.
    raw_text = file_path.read_text(encoding='utf-8', errors='replace')
    relative_path = str(file_path.relative_to(vault_root))

    # Split YAML frontmatter from the Markdown body.
    post = frontmatter.loads(raw_text)
    fm: dict[str, Any] = dict(post.metadata)
    body: str = post.content

    # ---- Title ----
    # YAML may deliver a non-string scalar (e.g. ``title: 2024`` parses as an
    # int, ``title: 2024-01-01`` as a date); coerce to str so the dataclass
    # field type holds. None / '' fall through to the fallbacks below.
    raw_title = fm.get('title')
    title: str = str(raw_title).strip() if raw_title is not None else ''
    if not title:
        # Fall back to the first H1 heading, then to the file stem.
        h1 = re.search(r'^#\s+(.+)$', body, re.MULTILINE)
        title = h1.group(1).strip() if h1 else file_path.stem

    # ---- Tags ----
    # Frontmatter tags + inline #tags, lower-cased, '#' prefix stripped,
    # de-duplicated with first-seen order preserved (dict.fromkeys).
    fm_tags: list[str] = _normalise_list(fm.get('tags', []))
    inline_tags: list[str] = _INLINE_TAG_RE.findall(body)
    tags = list(dict.fromkeys([t.lower().lstrip('#') for t in fm_tags + inline_tags]))

    # ---- Aliases ----
    aliases = _normalise_list(fm.get('aliases', []))

    # ---- WikiLinks: unique targets, first-seen order ----
    wikilinks = list(dict.fromkeys(_WIKILINK_RE.findall(body)))

    # ---- Plain text + word count ----
    content_text = _strip_markdown(body)
    word_count = len(content_text.split())

    return ParsedDocument(
        path=relative_path,
        title=title,
        content_raw=raw_text,
        content_text=content_text,
        frontmatter=fm,
        tags=tags,
        aliases=aliases,
        wikilinks=wikilinks,
        word_count=word_count,
    )
def _normalise_list(value: Any) -> list[str]:
"""Accept str, list[str], or None and return list[str]."""
if not value:
return []
if isinstance(value, str):
return [value]
return [str(v) for v in value]

Powered by TurnKey Linux.