almet/obsidian_pelican.py

## obsidian_pelican.py
"""A pelican plugin to read Obsidian files and import them as pelican Articles.

This reads the tags written as hashtags and renders them as pelican tags
instead (they won't be present in the output).

It also adds the title to the output and sets a default dummy date.
"""

from pathlib import Path

from itertools import chain
import os
import re
from pelican import signals
from pelican.readers import MarkdownReader
from pelican.contents import Tag
from pelican.utils import pelican_open
from pelican.utils import get_date, slugify

from markdown import Markdown

# Lookup tables filled in by ``populate_files_and_articles``:
#   ARTICLE_PATHS: markdown file name without extension -> containing directory
#   FILE_PATHS:    attachment file name (with extension) -> containing directory
ARTICLE_PATHS = {}
FILE_PATHS = {}

# Obsidian wiki link: ``[[target]]`` or ``[[target|label]]``.
# The label is matched lazily (``.+?``); a greedy match would swallow
# everything up to the *last* ``]]`` on the line and merge several links
# into one.
link = r'\[\[\s*(?P<filename>[^|\]]+)(\|\s*(?P<linkname>.+?))?\]\]'
# Same syntax prefixed with ``!`` denotes an embedded file.
file_re = re.compile(r'!' + link)
link_re = re.compile(link)
# Inline hashtag: ``#`` immediately followed by word characters.
tag_re = re.compile(r'#([\w]+)')


"""
# Test cases
[[my link]]
[[ my work ]]
[[ my work | is finished ]]

![[ a file.jpg ]]
![[file.jpg]]
"""


def get_file_and_linkname(match):
    """Extract the link target and its display label from a wiki-link match.

    Args:
        match: A regex match exposing the named groups ``filename`` and
            ``linkname`` (``linkname`` may be ``None`` when no alias is given).

    Returns:
        A ``(filename, linkname)`` tuple of stripped strings. When the link
        has no alias, or only a blank one, the filename is used as the label.
    """
    groups = match.groupdict()
    filename = groups['filename'].strip()
    # Strip *before* falling back, so a whitespace-only alias ("[[note| ]]")
    # does not end up as an empty label.
    linkname = (groups['linkname'] or '').strip() or filename
    return filename, linkname


class ObsidianMarkdownReader(MarkdownReader):
    """Markdown reader that accepts Obsidian-style notes.

    Wiki links (``[[note]]``, ``[[note|label]]``) and embeds (``![[file]]``)
    are rewritten into markdown links using the ``{filename}`` / ``{static}``
    prefixes, and inline ``#hashtags`` are removed from the text and returned
    as tags in the metadata.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base reader and make sure the needed extensions are on."""
        super().__init__(*args, **kwargs)
        # ``self.settings`` is the dict handed in by the caller, so appending
        # unconditionally would add a duplicate entry every time a reader is
        # constructed from the same settings. Only add what is missing.
        extensions = self.settings["MARKDOWN"].setdefault("extensions", [])
        for extension in ("markdown.extensions.toc", "sane_lists"):
            if extension not in extensions:
                extensions.append(extension)

    def replace_obsidian_links(self, text):
        """Rewrite Obsidian wiki links and embeds found in *text*.

        Targets are looked up in ``ARTICLE_PATHS`` / ``FILE_PATHS``. A link to
        an unknown article is reduced to its label; an embed of an unknown
        file is removed entirely.

        Returns:
            The text with every ``[[...]]`` and ``![[...]]`` replaced.
        """

        def link_replacement(match):
            filename, linkname = get_file_and_linkname(match)
            path = ARTICLE_PATHS.get(filename)
            if not path:
                # Unknown target: keep the label as plain text.
                return linkname
            return '[{linkname}]({{filename}}{path}{filename}.md)'.format(
                linkname=linkname, path=path, filename=filename
            )

        def file_replacement(match):
            filename, linkname = get_file_and_linkname(match)
            path = FILE_PATHS.get(filename)
            if not path:
                # don't show it at all since it will be broken
                return ''
            return '![{linkname}]({{static}}{path}{filename})'.format(
                linkname=linkname, path=path, filename=filename
            )

        # Embeds must go first: ``link_re`` would otherwise match the
        # ``[[...]]`` part of ``![[...]]`` and leave a stray ``!`` behind.
        text = file_re.sub(file_replacement, text)
        text = link_re.sub(link_replacement, text)
        return text

    def replace_tags(self, text):
        """Return *text* with every ``#hashtag`` removed.

        NOTE(review): the pattern matches any ``#`` directly followed by word
        characters, so URL fragments or ``#`` inside code are removed too.
        """
        return tag_re.sub('', text)

    def get_tags(self, text):
        """Return the list of hashtag names (without ``#``) found in *text*."""
        return tag_re.findall(text)

    def read(self, source_path):
        """Parse content and metadata of markdown files

        It also changes the links to the acceptable format for pelican

        Args:
            source_path: Path of the markdown file to read.

        Returns:
            A ``(content, metadata)`` tuple: the rendered HTML and the
            metadata dict (tags, title, date, slug, category, ...).
        """
        # Imported here because the module does not import ``datetime`` at
        # the top level; without it the ``read_on`` branch raises NameError.
        from datetime import datetime

        self._source_path = source_path
        self._md = Markdown(**self.settings['MARKDOWN'])

        with pelican_open(source_path) as text:
            text = self.replace_obsidian_links(text)
            tags = self.get_tags(text)
            text = self.replace_tags(text)
            content = self._md.convert(text)

        if hasattr(self._md, "Meta"):
            metadata = self._parse_metadata(self._md.Meta)
        else:
            metadata = {}

        if tags:
            metadata['tags'] = [Tag(tag, self.settings) for tag in tags]

        # Add the TOC to the metadata, but only when its rendered form is
        # longer than 300 characters (presumably to skip near-empty TOCs).
        if len(self._md.toc) > 300:
            metadata["table_of_contents"] = self._md.toc

        # Get the title from the first heading and drop that heading from
        # the body so it is not rendered twice.
        if "title" not in metadata and self._md.toc_tokens:
            first_title = self._md.toc_tokens[0]
            metadata["title"] = first_title["name"]
            content = content.replace(
                '<h1 id="{id}">{name}</h1>'.format(**first_title), ""
            )

        # Date precedence: ``read_on`` metadata, then a ``YYYY-MM-DD-``
        # filename prefix, then a fixed dummy date.
        parts = os.path.splitext(os.path.basename(source_path))[0].split("-")
        metadata["date"] = get_date("2023-01-30")
        if "read_on" in metadata:
            metadata["date"] = datetime.strptime(metadata["read_on"], "%B %Y")
        elif len(parts) >= 3:
            try:
                metadata["date"] = get_date("-".join(parts[:3]))
            except ValueError:
                # A hyphenated name that is not a date (``my-great-note``):
                # keep the dummy date instead of failing the whole article.
                pass

        # Without a title there is nothing to slugify; leave the slug unset
        # rather than raising KeyError here.
        if "slug" not in metadata and "title" in metadata:
            metadata["slug"] = slugify(
                metadata["title"], self.settings.get("SLUG_REGEX_SUBSTITUTIONS", [])
            )

        # The category is the name of the directory containing the file.
        category = os.path.basename(
            os.path.abspath(os.path.join(source_path, os.pardir))
        )
        metadata["category"] = self.process_metadata("category", category)

        return content, metadata


def _site_relative_dir(file_path, base_path):
    """Return the directory of *file_path* relative to *base_path* as ``/dir/``.

    Files located directly in *base_path* yield ``/``. Forward slashes are
    always used, whatever the OS separator.
    """
    relative = file_path.parent.relative_to(base_path).as_posix()
    return '/' if relative == '.' else '/{}/'.format(relative)


def populate_files_and_articles(article_generator):
    """Index every article and attachment under the generator's content path.

    Fills the module-level ``ARTICLE_PATHS`` (markdown file name without
    extension -> directory) and ``FILE_PATHS`` (attachment file name ->
    directory). Directories are stored relative to ``article_generator.path``
    with a leading and trailing ``/``.

    Entries are keyed by bare file name, so two files sharing a name in
    different folders overwrite each other (the last one globbed wins).

    Args:
        article_generator: Object whose ``path`` attribute is the content root.
    """
    base_path = Path(article_generator.path)

    # ``relative_to`` strips the base as a path prefix; a plain
    # ``str.replace`` would also delete later occurrences of the base string
    # (e.g. with a relative base such as ``c`` and a folder ``c/pics``).
    for article in base_path.glob('**/*.md'):
        ARTICLE_PATHS[article.stem] = _site_relative_dir(article, base_path)

    for extension in ('png', 'jpg', 'svg', 'apkg', 'gif'):
        for attachment in base_path.glob('**/*.{}'.format(extension)):
            FILE_PATHS[attachment.name] = _site_relative_dir(attachment, base_path)


def modify_reader(article_generator):
    """Install the Obsidian-aware reader for ``md`` files on this generator.

    The article/attachment index is built first, since the reader looks
    targets up in it when rewriting wiki links.
    """
    populate_files_and_articles(article_generator)
    obsidian_reader = ObsidianMarkdownReader(article_generator.settings)
    article_generator.readers.readers['md'] = obsidian_reader


def modify_metadata(article_generator, metadata):
    """Strip ``#`` from tag names so tags can be written the Obsidian way.

    Tags whose name contains no ``#`` are left untouched (their ``name`` is
    not reassigned). *article_generator* is unused; it is part of the signal
    handler signature.
    """
    tags = metadata.get('tags', [])
    for tag in tags:
        if '#' not in tag.name:
            continue
        tag.name = tag.name.replace('#', '')


def register():
    """Plugin entry point: hook the handlers onto the article generator signals."""
    hooks = (
        (signals.article_generator_context, modify_metadata),
        (signals.article_generator_init, modify_reader),
    )
    for signal, handler in hooks:
        signal.connect(handler)
	"""A pelican plugin to read Obsidian files and import them as pelican Articles.

	This reads the tags written as hashtags and renders them as pelican tags
	instead (they won't be present in the output).

	It also adds the title to the output and sets a default dummy date.
	"""

	from pathlib import Path

	from itertools import chain
	import os
	import re
	from pelican import signals
	from pelican.readers import MarkdownReader
	from pelican.contents import Tag
	from pelican.utils import pelican_open
	from pelican.utils import get_date, slugify

	from markdown import Markdown

	# Lookup tables filled in by ``populate_files_and_articles``:
	#   ARTICLE_PATHS: markdown file name without extension -> containing directory
	#   FILE_PATHS:    attachment file name (with extension) -> containing directory
	ARTICLE_PATHS = {}
	FILE_PATHS = {}

	# Obsidian wiki link: ``[[target]]`` or ``[[target|label]]``.
	# Restored from a mangled pattern (``\s`` had lost its ``*`` and the pipe
	# had become ``\\|``, i.e. a literal backslash plus alternation).
	# The label is matched lazily so several links on one line stay separate.
	link = r'\[\[\s*(?P<filename>[^|\]]+)(\|\s*(?P<linkname>.+?))?\]\]'
	# Same syntax prefixed with ``!`` denotes an embedded file.
	file_re = re.compile(r'!' + link)
	link_re = re.compile(link)
	# Inline hashtag: ``#`` immediately followed by word characters.
	tag_re = re.compile(r'#([\w]+)')


	"""
	# Test cases
	[[my link]]
	[[ my work ]]
	[[ my work | is finished ]]

	![[ a file.jpg ]]
	![[file.jpg]]
	"""


	def get_file_and_linkname(match):
	    """Extract the link target and its display label from a wiki-link match.

	    Args:
	        match: A regex match exposing the named groups ``filename`` and
	            ``linkname`` (``linkname`` may be ``None`` when no alias is given).

	    Returns:
	        A ``(filename, linkname)`` tuple of stripped strings. When the link
	        has no alias, or only a blank one, the filename is used as the label.
	    """
	    groups = match.groupdict()
	    filename = groups['filename'].strip()
	    # Strip *before* falling back, so a whitespace-only alias ("[[note| ]]")
	    # does not end up as an empty label.
	    linkname = (groups['linkname'] or '').strip() or filename
	    return filename, linkname


	class ObsidianMarkdownReader(MarkdownReader):
	    """Markdown reader that accepts Obsidian-style notes.

	    Wiki links (``[[note]]``, ``[[note|label]]``) and embeds (``![[file]]``)
	    are rewritten into markdown links using the ``{filename}`` / ``{static}``
	    prefixes, and inline ``#hashtags`` are removed from the text and returned
	    as tags in the metadata.
	    """

	    def __init__(self, *args, **kwargs):
	        """Initialise the base reader and make sure the needed extensions are on."""
	        super().__init__(*args, **kwargs)
	        # ``self.settings`` is the dict handed in by the caller, so appending
	        # unconditionally would add a duplicate entry every time a reader is
	        # constructed from the same settings. Only add what is missing.
	        extensions = self.settings["MARKDOWN"].setdefault("extensions", [])
	        for extension in ("markdown.extensions.toc", "sane_lists"):
	            if extension not in extensions:
	                extensions.append(extension)

	    def replace_obsidian_links(self, text):
	        """Rewrite Obsidian wiki links and embeds found in *text*.

	        Targets are looked up in ``ARTICLE_PATHS`` / ``FILE_PATHS``. A link to
	        an unknown article is reduced to its label; an embed of an unknown
	        file is removed entirely.

	        Returns:
	            The text with every ``[[...]]`` and ``![[...]]`` replaced.
	        """

	        def link_replacement(match):
	            filename, linkname = get_file_and_linkname(match)
	            path = ARTICLE_PATHS.get(filename)
	            if not path:
	                # Unknown target: keep the label as plain text.
	                return linkname
	            return '[{linkname}]({{filename}}{path}{filename}.md)'.format(
	                linkname=linkname, path=path, filename=filename
	            )

	        def file_replacement(match):
	            filename, linkname = get_file_and_linkname(match)
	            path = FILE_PATHS.get(filename)
	            if not path:
	                # don't show it at all since it will be broken
	                return ''
	            return '![{linkname}]({{static}}{path}{filename})'.format(
	                linkname=linkname, path=path, filename=filename
	            )

	        # Embeds must go first: ``link_re`` would otherwise match the
	        # ``[[...]]`` part of ``![[...]]`` and leave a stray ``!`` behind.
	        text = file_re.sub(file_replacement, text)
	        text = link_re.sub(link_replacement, text)
	        return text

	    def replace_tags(self, text):
	        """Return *text* with every ``#hashtag`` removed.

	        NOTE(review): the pattern matches any ``#`` directly followed by word
	        characters, so URL fragments or ``#`` inside code are removed too.
	        """
	        return tag_re.sub('', text)

	    def get_tags(self, text):
	        """Return the list of hashtag names (without ``#``) found in *text*."""
	        return tag_re.findall(text)

	    def read(self, source_path):
	        """Parse content and metadata of markdown files

	        It also changes the links to the acceptable format for pelican

	        Args:
	            source_path: Path of the markdown file to read.

	        Returns:
	            A ``(content, metadata)`` tuple: the rendered HTML and the
	            metadata dict (tags, title, date, slug, category, ...).
	        """
	        # Imported here because the module does not import ``datetime`` at
	        # the top level; without it the ``read_on`` branch raises NameError.
	        from datetime import datetime

	        self._source_path = source_path
	        self._md = Markdown(**self.settings['MARKDOWN'])

	        with pelican_open(source_path) as text:
	            text = self.replace_obsidian_links(text)
	            tags = self.get_tags(text)
	            text = self.replace_tags(text)
	            content = self._md.convert(text)

	        if hasattr(self._md, "Meta"):
	            metadata = self._parse_metadata(self._md.Meta)
	        else:
	            metadata = {}

	        if tags:
	            metadata['tags'] = [Tag(tag, self.settings) for tag in tags]

	        # Add the TOC to the metadata, but only when its rendered form is
	        # longer than 300 characters (presumably to skip near-empty TOCs).
	        if len(self._md.toc) > 300:
	            metadata["table_of_contents"] = self._md.toc

	        # Get the title from the first heading and drop that heading from
	        # the body so it is not rendered twice.
	        if "title" not in metadata and self._md.toc_tokens:
	            first_title = self._md.toc_tokens[0]
	            metadata["title"] = first_title["name"]
	            content = content.replace(
	                '<h1 id="{id}">{name}</h1>'.format(**first_title), ""
	            )

	        # Date precedence: ``read_on`` metadata, then a ``YYYY-MM-DD-``
	        # filename prefix, then a fixed dummy date.
	        parts = os.path.splitext(os.path.basename(source_path))[0].split("-")
	        metadata["date"] = get_date("2023-01-30")
	        if "read_on" in metadata:
	            metadata["date"] = datetime.strptime(metadata["read_on"], "%B %Y")
	        elif len(parts) >= 3:
	            try:
	                metadata["date"] = get_date("-".join(parts[:3]))
	            except ValueError:
	                # A hyphenated name that is not a date (``my-great-note``):
	                # keep the dummy date instead of failing the whole article.
	                pass

	        # Without a title there is nothing to slugify; leave the slug unset
	        # rather than raising KeyError here.
	        if "slug" not in metadata and "title" in metadata:
	            metadata["slug"] = slugify(
	                metadata["title"], self.settings.get("SLUG_REGEX_SUBSTITUTIONS", [])
	            )

	        # The category is the name of the directory containing the file.
	        category = os.path.basename(
	            os.path.abspath(os.path.join(source_path, os.pardir))
	        )
	        metadata["category"] = self.process_metadata("category", category)

	        return content, metadata


	def _site_relative_dir(file_path, base_path):
	    """Return the directory of *file_path* relative to *base_path* as ``/dir/``.

	    Files located directly in *base_path* yield ``/``. Forward slashes are
	    always used, whatever the OS separator.
	    """
	    relative = file_path.parent.relative_to(base_path).as_posix()
	    return '/' if relative == '.' else '/{}/'.format(relative)


	def populate_files_and_articles(article_generator):
	    """Index every article and attachment under the generator's content path.

	    Fills the module-level ``ARTICLE_PATHS`` (markdown file name without
	    extension -> directory) and ``FILE_PATHS`` (attachment file name ->
	    directory). Directories are stored relative to ``article_generator.path``
	    with a leading and trailing ``/``.

	    Entries are keyed by bare file name, so two files sharing a name in
	    different folders overwrite each other (the last one globbed wins).

	    Args:
	        article_generator: Object whose ``path`` attribute is the content root.
	    """
	    base_path = Path(article_generator.path)

	    # Recursive globs restored (``**/*.ext``); the mangled ``*/.ext`` form
	    # only matched files literally named ``.ext`` one level down.
	    # ``relative_to`` strips the base as a path prefix; a plain
	    # ``str.replace`` would also delete later occurrences of the base string.
	    for article in base_path.glob('**/*.md'):
	        ARTICLE_PATHS[article.stem] = _site_relative_dir(article, base_path)

	    for extension in ('png', 'jpg', 'svg', 'apkg', 'gif'):
	        for attachment in base_path.glob('**/*.{}'.format(extension)):
	            FILE_PATHS[attachment.name] = _site_relative_dir(attachment, base_path)


	def modify_reader(article_generator):
	    """Install the Obsidian-aware reader for ``md`` files on this generator.

	    The article/attachment index is built first, since the reader looks
	    targets up in it when rewriting wiki links.
	    """
	    populate_files_and_articles(article_generator)
	    obsidian_reader = ObsidianMarkdownReader(article_generator.settings)
	    article_generator.readers.readers['md'] = obsidian_reader


	def modify_metadata(article_generator, metadata):
	    """Strip ``#`` from tag names so tags can be written the Obsidian way.

	    Tags whose name contains no ``#`` are left untouched (their ``name`` is
	    not reassigned). *article_generator* is unused; it is part of the signal
	    handler signature.
	    """
	    tags = metadata.get('tags', [])
	    for tag in tags:
	        if '#' not in tag.name:
	            continue
	        tag.name = tag.name.replace('#', '')


	def register():
	    """Plugin entry point: hook the handlers onto the article generator signals."""
	    signals.article_generator_context.connect(modify_metadata)
	    signals.article_generator_init.connect(modify_reader)