You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

135 lines
4.2 KiB

"""
parser.py — Markdown vault document parser.
Extracts:
- YAML frontmatter (title, tags, aliases, date, custom fields)
- Plain text content (Markdown stripped)
- WikiLinks [[target|alias]] and #tags
- Word count
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import frontmatter # python-frontmatter
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class ParsedDocument:
    """Structured result of parsing one Markdown vault file."""
    path: str                    # path relative to the vault root
    title: str                   # frontmatter 'title', else first H1, else file stem
    content_raw: str             # original markdown
    content_text: str            # plain text (markdown stripped)
    frontmatter: dict[str, Any]  # full YAML frontmatter mapping (all fields)
    tags: list[str]              # frontmatter + inline #tags, lower-cased, de-duplicated
    aliases: list[str]           # frontmatter aliases, normalised to list[str]
    wikilinks: list[str]         # resolved link targets
    word_count: int              # whitespace-split token count of content_text
# ---------------------------------------------------------------------------
# Regexes
# ---------------------------------------------------------------------------
_WIKILINK_RE = re.compile(r'\[\[([^\[\]|]+)(?:\|[^\[\]]+)?\]\]')
_INLINE_TAG_RE = re.compile(r'(?<!\w)#([\w/-]+)')
_HEADING_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)
_MARKDOWN_LINK_RE = re.compile(r'!?\[([^\]]*)\]\([^\)]*\)')
_CODE_BLOCK_RE = re.compile(r'```[\s\S]*?```|`[^`]+`', re.MULTILINE)
_HTML_RE = re.compile(r'<[^>]+>')
_HORIZONTAL_RULE_RE = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE)
def _strip_markdown(text: str) -> str:
"""Convert Markdown to plain text (lightweight, no external deps)."""
# Remove code blocks first (preserve whitespace context)
text = _CODE_BLOCK_RE.sub(' ', text)
# Remove headings marker characters
text = _HEADING_RE.sub('', text)
# Replace Markdown links with their label
text = _MARKDOWN_LINK_RE.sub(r'\1', text)
# Replace WikiLinks with their display text (or target)
text = _WIKILINK_RE.sub(lambda m: m.group(1).split('/')[-1], text)
# Remove HTML tags
text = _HTML_RE.sub(' ', text)
# Remove horizontal rules
text = _HORIZONTAL_RULE_RE.sub('', text)
# Normalise whitespace
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------
def parse_document(file_path: Path, vault_root: Path) -> ParsedDocument:
    """
    Parse a single Markdown file and return a ``ParsedDocument``.

    Args:
        file_path: Absolute path to the Markdown file.
        vault_root: Absolute path to the vault root (used to compute the
            relative path; ``file_path`` must live under it).

    Raises:
        ValueError: If ``file_path`` is not located under ``vault_root``.
    """
    # errors='replace' keeps parsing alive on files with broken encodings.
    raw_text = file_path.read_text(encoding='utf-8', errors='replace')
    relative_path = str(file_path.relative_to(vault_root))

    # Split YAML frontmatter from the Markdown body.
    post = frontmatter.loads(raw_text)
    fm: dict[str, Any] = dict(post.metadata)
    body: str = post.content

    # ---- Title ----
    # YAML may deliver a non-string scalar (e.g. ``title: 2024`` parses as an
    # int, ``title: 2024-01-01`` as a date); coerce to str so the dataclass
    # field type holds. None / '' fall through to the fallbacks below.
    raw_title = fm.get('title')
    title: str = str(raw_title).strip() if raw_title is not None else ''
    if not title:
        # Fall back to the first H1 heading, then to the file stem.
        h1 = re.search(r'^#\s+(.+)$', body, re.MULTILINE)
        title = h1.group(1).strip() if h1 else file_path.stem

    # ---- Tags ----
    # Frontmatter tags + inline #tags, lower-cased, '#' prefix stripped,
    # de-duplicated with first-seen order preserved (dict.fromkeys).
    fm_tags: list[str] = _normalise_list(fm.get('tags', []))
    inline_tags: list[str] = _INLINE_TAG_RE.findall(body)
    tags = list(dict.fromkeys([t.lower().lstrip('#') for t in fm_tags + inline_tags]))

    # ---- Aliases ----
    aliases = _normalise_list(fm.get('aliases', []))

    # ---- WikiLinks: unique targets, first-seen order ----
    wikilinks = list(dict.fromkeys(_WIKILINK_RE.findall(body)))

    # ---- Plain text + word count ----
    content_text = _strip_markdown(body)
    word_count = len(content_text.split())

    return ParsedDocument(
        path=relative_path,
        title=title,
        content_raw=raw_text,
        content_text=content_text,
        frontmatter=fm,
        tags=tags,
        aliases=aliases,
        wikilinks=wikilinks,
        word_count=word_count,
    )
def _normalise_list(value: Any) -> list[str]:
"""Accept str, list[str], or None and return list[str]."""
if not value:
return []
if isinstance(value, str):
return [value]
return [str(v) for v in value]

Powered by TurnKey Linux.