""" parser.py — Markdown vault document parser. Extracts: - YAML frontmatter (title, tags, aliases, date, custom fields) - Plain text content (Markdown stripped) - WikiLinks [[target|alias]] and #tags - Word count """ from __future__ import annotations import re from dataclasses import dataclass, field from pathlib import Path from typing import Any import frontmatter # python-frontmatter # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class ParsedDocument: path: str title: str content_raw: str # original markdown content_text: str # plain text (markdown stripped) frontmatter: dict[str, Any] tags: list[str] aliases: list[str] wikilinks: list[str] # resolved link targets word_count: int # --------------------------------------------------------------------------- # Regexes # --------------------------------------------------------------------------- _WIKILINK_RE = re.compile(r'\[\[([^\[\]|]+)(?:\|[^\[\]]+)?\]\]') _INLINE_TAG_RE = re.compile(r'(?]+>') _HORIZONTAL_RULE_RE = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE) def _strip_markdown(text: str) -> str: """Convert Markdown to plain text (lightweight, no external deps).""" # Remove code blocks first (preserve whitespace context) text = _CODE_BLOCK_RE.sub(' ', text) # Remove headings marker characters text = _HEADING_RE.sub('', text) # Replace Markdown links with their label text = _MARKDOWN_LINK_RE.sub(r'\1', text) # Replace WikiLinks with their display text (or target) text = _WIKILINK_RE.sub(lambda m: m.group(1).split('/')[-1], text) # Remove HTML tags text = _HTML_RE.sub(' ', text) # Remove horizontal rules text = _HORIZONTAL_RULE_RE.sub('', text) # Normalise whitespace text = re.sub(r'\n{3,}', '\n\n', text) return text.strip() # --------------------------------------------------------------------------- # Parser # 
def parse_document(file_path: Path, vault_root: Path) -> ParsedDocument:
    """
    Parse a single Markdown file and return a ``ParsedDocument``.

    Args:
        file_path: Absolute path to the Markdown file.
        vault_root: Absolute path to the vault root (used to compute the
            vault-relative path stored on the result).

    Raises:
        ValueError: If ``file_path`` is not located under ``vault_root``
            (propagated from ``Path.relative_to``).
    """
    # errors='replace' so a stray non-UTF-8 byte cannot abort a vault scan.
    raw_text = file_path.read_text(encoding='utf-8', errors='replace')
    relative_path = str(file_path.relative_to(vault_root))

    # Split YAML frontmatter from the Markdown body.
    post = frontmatter.loads(raw_text)
    fm: dict[str, Any] = dict(post.metadata)
    body: str = post.content

    # ---- Title ----
    # Coerce to str: YAML may yield non-string scalars (e.g. `title: 42`)
    # or None for an empty `title:` key; the dataclass field is str.
    raw_title = fm.get('title', '')
    title: str = str(raw_title) if raw_title else ''
    if not title:
        # Fall back to the first H1 heading, then to the file stem.
        h1 = re.search(r'^#\s+(.+)$', body, re.MULTILINE)
        title = h1.group(1).strip() if h1 else file_path.stem

    # ---- Tags ----
    # Merge frontmatter tags with inline #tags; lowercase, strip any
    # leading '#', and deduplicate while preserving first-seen order.
    fm_tags: list[str] = _normalise_list(fm.get('tags', []))
    inline_tags: list[str] = _INLINE_TAG_RE.findall(body)
    tags = list(dict.fromkeys(
        t.lower().lstrip('#') for t in fm_tags + inline_tags
    ))

    # ---- Aliases ----
    aliases = _normalise_list(fm.get('aliases', []))

    # ---- WikiLinks ----
    # Targets only (the regex does not capture |alias text), deduplicated
    # in first-seen order.
    wikilinks = list(dict.fromkeys(_WIKILINK_RE.findall(body)))

    # ---- Plain text ----
    content_text = _strip_markdown(body)
    word_count = len(content_text.split())

    return ParsedDocument(
        path=relative_path,
        title=title,
        content_raw=raw_text,
        content_text=content_text,
        frontmatter=fm,
        tags=tags,
        aliases=aliases,
        wikilinks=wikilinks,
        word_count=word_count,
    )


def _normalise_list(value: Any) -> list[str]:
    """Accept str, list, or None and return list[str] (falsy -> [])."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    return [str(v) for v in value]