You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
135 lines
4.2 KiB
135 lines
4.2 KiB
"""
|
|
parser.py — Markdown vault document parser.
|
|
|
|
Extracts:
|
|
- YAML frontmatter (title, tags, aliases, date, custom fields)
|
|
- Plain text content (Markdown stripped)
|
|
- WikiLinks [[target|alias]] and #tags
|
|
- Word count
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import frontmatter # python-frontmatter
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@dataclass
class ParsedDocument:
    """Structured result of parsing one Markdown file.

    All fields are required and have no defaults; instances are built by
    ``parse_document`` with keyword arguments.
    """

    # File location as a string (parse_document stores the path relative
    # to the vault root).
    path: str
    # Human-readable document title.
    title: str
    # Original Markdown text as read from the file.
    content_raw: str
    # Plain-text rendering of the body with Markdown syntax stripped.
    content_text: str
    # Frontmatter metadata as a plain dict (values may be of any type).
    frontmatter: dict[str, Any]
    # Tag names without a leading '#'.
    tags: list[str]
    # Alternative names declared for the document.
    aliases: list[str]
    # Targets of the [[WikiLinks]] found in the document.
    wikilinks: list[str]
    # Number of whitespace-separated tokens in ``content_text``.
    word_count: int
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Regexes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# [[target]] or [[target|alias]]. Group 1 is the target (no brackets or
# pipe); the optional "|alias" part is matched but not captured.
_WIKILINK_RE = re.compile(r'\[\[([^\[\]|]+)(?:\|[^\[\]]+)?\]\]')

# "#tag" where the '#' is not preceded by a word character. Group 1 is the
# tag name, made of word characters, '/' and '-'.
_INLINE_TAG_RE = re.compile(r'(?<!\w)#([\w/-]+)')

# The heading marker only: one to six '#' at the start of a line plus the
# whitespace that follows. The heading text itself is not part of the match.
_HEADING_RE = re.compile(r'^#{1,6}\s+', flags=re.MULTILINE)

# [label](url), optionally prefixed with '!' (image syntax). Group 1 is the
# label / alt text.
_MARKDOWN_LINK_RE = re.compile(r'!?\[([^\]]*)\]\([^\)]*\)')

# Triple-backtick fenced blocks (shortest match, may span lines) or
# single-backtick inline code spans.
_CODE_BLOCK_RE = re.compile(r'```[\s\S]*?```|`[^`]+`', flags=re.MULTILINE)

# Anything that looks like an HTML tag: '<' ... '>'.
_HTML_RE = re.compile(r'<[^>]+>')

# A line made only of three or more '-', '*' or '_' characters (in any mix),
# optionally followed by whitespace.
_HORIZONTAL_RULE_RE = re.compile(r'^[-*_]{3,}\s*$', flags=re.MULTILINE)
|
|
|
|
|
|
def _wikilink_display_text(match: re.Match[str]) -> str:
    """Return the text a WikiLink shows to the reader.

    ``[[target|alias]]`` yields ``alias``; a link without an alias yields the
    last ``/``-separated segment of its target.
    """
    # The alias is not a capture group of _WIKILINK_RE, so recover it from the
    # full match with the surrounding "[[" and "]]" removed.
    inner = match.group(0)[2:-2]
    _target, _pipe, alias = inner.partition('|')
    alias = alias.strip()
    if alias:
        return alias
    return match.group(1).split('/')[-1]


def _strip_markdown(text: str) -> str:
    """Convert Markdown to plain text (lightweight, no external deps).

    Args:
        text: Markdown source (frontmatter already removed by the caller).

    Returns:
        The text with code, heading markers, link syntax, HTML tags and
        horizontal rules removed, runs of blank lines collapsed, and leading
        and trailing whitespace stripped. Emphasis markers and list bullets
        are left untouched.
    """
    # Code goes first so that '#', '[[...]]' or '<...>' inside code is never
    # mistaken for markup; a space is left behind to keep neighbours apart.
    text = _CODE_BLOCK_RE.sub(' ', text)
    # Drop only the leading '#' markers; the heading text stays.
    text = _HEADING_RE.sub('', text)
    # [label](url) and ![alt](src) become their label / alt text.
    text = _MARKDOWN_LINK_RE.sub(r'\1', text)
    # [[target|alias]] becomes the alias, [[dir/target]] becomes "target".
    text = _WIKILINK_RE.sub(_wikilink_display_text, text)
    # HTML tags are replaced by a space so adjacent words do not fuse.
    text = _HTML_RE.sub(' ', text)
    text = _HORIZONTAL_RULE_RE.sub('', text)
    # Collapse three or more consecutive newlines into one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parser
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Fenced code only. Inline code spans are kept for the title search so that a
# heading such as "# Using `foo`" still yields its full text.
_FENCED_CODE_RE = re.compile(r'```[\s\S]*?```')


def parse_document(file_path: Path, vault_root: Path) -> ParsedDocument:
    """
    Parse a single Markdown file and return a ``ParsedDocument``.

    Args:
        file_path: Absolute path to the Markdown file.
        vault_root: Absolute path to the vault root (used to compute relative path).

    Returns:
        A ``ParsedDocument`` holding the relative path, title, raw text,
        stripped plain text, frontmatter dict, tags, aliases, WikiLink
        targets and word count.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If ``file_path`` is not located under ``vault_root``.
    """
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order
    # mark; a BOM ahead of the opening '---' can otherwise prevent the
    # frontmatter delimiter from being detected.
    raw_text = file_path.read_text(encoding='utf-8-sig', errors='replace')
    relative_path = str(file_path.relative_to(vault_root))

    # Parse frontmatter + body
    post = frontmatter.loads(raw_text)
    fm: dict[str, Any] = dict(post.metadata)
    body: str = post.content

    # '#tag' and '[[link]]' inside code are literal text, not markup, so tags
    # and links are harvested from the body with code removed.
    body_without_code = _CODE_BLOCK_RE.sub(' ', body)

    # ---- Title ----
    # YAML may deliver a non-string title (e.g. "title: 2024" is an int);
    # coerce it so the field always holds a str.
    raw_title = fm.get('title')
    title = '' if raw_title is None else str(raw_title).strip()
    if not title:
        # Fall back to the first H1 heading, ignoring '# ...' lines that are
        # really comments inside fenced code, then to the file name.
        h1 = re.search(r'^#\s+(.+)$', _FENCED_CODE_RE.sub('', body), re.MULTILINE)
        title = h1.group(1).strip() if h1 else file_path.stem

    # ---- Tags ----
    fm_tags: list[str] = _normalise_list(fm.get('tags', []))
    inline_tags: list[str] = _INLINE_TAG_RE.findall(body_without_code)
    cleaned_tags = (t.strip().lstrip('#').lower() for t in fm_tags + inline_tags)
    # dict.fromkeys de-duplicates while keeping first-seen order; tags that
    # are empty after cleaning (e.g. a bare '#') are dropped.
    tags = list(dict.fromkeys(t for t in cleaned_tags if t))

    # ---- Aliases ----
    aliases = _normalise_list(fm.get('aliases', []))

    # ---- WikiLinks ----
    link_targets = (t.strip() for t in _WIKILINK_RE.findall(body_without_code))
    wikilinks = list(dict.fromkeys(t for t in link_targets if t))

    # ---- Plain text ----
    content_text = _strip_markdown(body)
    word_count = len(content_text.split())

    return ParsedDocument(
        path=relative_path,
        title=title,
        content_raw=raw_text,
        content_text=content_text,
        frontmatter=fm,
        tags=tags,
        aliases=aliases,
        wikilinks=wikilinks,
        word_count=word_count,
    )
|
|
|
|
|
|
def _normalise_list(value: Any) -> list[str]:
    """Coerce a frontmatter value into a list of strings.

    Args:
        value: ``None``, a single string, a single scalar (e.g. an int or a
            date produced by the YAML parser), or an iterable of values.

    Returns:
        ``[]`` for any falsy value, a one-element list for a string or other
        non-iterable scalar, otherwise each item of the iterable converted
        with ``str``. ``None`` items (empty YAML list entries) are skipped.
    """
    if not value:
        return []
    if isinstance(value, str):
        # A string is iterable, but it is one entry, not a list of characters.
        return [value]
    try:
        items = iter(value)
    except TypeError:
        # Non-iterable scalar such as "tags: 2024" parsed as an int.
        return [str(value)]
    return [str(item) for item in items if item is not None]
|