# BjornFJohansson/wikidpad-to-obsidian.py

## wikidpad-to-obsidian.py
from pathlib import Path
import urllib
import re, os
from urllib.parse import urlparse
from string import punctuation
pages = sorted(Path('.').glob('*.md'))

# Sometimes wp creates pages with ~ (tilde) in the file name. Rename these to
# the part of the name before the first tilde, unless that page already exists.
tildepages = [p for p in pages if "~" in str(p)]
for tildepage in tildepages:
    a, b = str(tildepage).split("~", maxsplit=1)
    np = Path(f"{a}.md")
    if not np.exists():
        tildepage.rename(np)

# Renaming leaves stale paths in `pages`; list the directory again so that the
# steps below never try to open a file that no longer exists.
pages = sorted(Path('.').glob('*.md'))

# Some wp files have unicode and some punctuation percent-quoted in their
# names. Unquote those names; subpages ("/" in the unquoted name) are renamed
# with a pipe | character in place of the slash.
for page in pages:
    if not page.exists():
        # Stale entry: the file was already renamed by an earlier step.
        continue
    uq = urllib.parse.unquote(str(page), encoding='cp1252')
    if not str(page) == uq:
        nm = uq.replace("/", "|")
        # Path.rename() silently replaces an existing file on POSIX, so leave
        # the quoted file in place rather than overwrite another page.
        if not Path(nm).exists():
            page.rename(nm)

# Renaming leaves stale paths in `pages`; list the directory again so that the
# steps below never try to open a file that no longer exists.
pages = sorted(Path('.').glob('*.md'))


# Strip page name from first line, remove if only punctuation remains
i = 0  # number of pages rewritten by this step
for page in pages:
    # Drop the BOM only if there is one; a blind [1:] would eat the first real
    # character of any page that was saved without a BOM.
    lines = page.read_text(encoding='utf-8').lstrip("\ufeff").splitlines()
    if not lines:
        # Empty page: there is no first line to clean up.
        continue
    firstline, *rest = lines
    newfirstline = firstline.strip("# ").replace(page.stem, "")
    if not set(newfirstline) - set(punctuation):
        newfirstline = ""
    if firstline != newfirstline:
        newpagetext = newfirstline + "\n" + "\n".join(rest)
        # Write with the encoding the page was read with, not the platform
        # default.
        page.write_text(newpagetext, encoding='utf-8')
        i+=1
# 7175 pages
# 7175 pages


# remove [alias:...] and replace with obsidian alias:
# https://help.obsidian.md/Linking+notes+and+files/Aliases
i = 0  # number of pages rewritten by this step
# [^\]\n]+ stops at the first closing bracket, so other bracketed text that
# follows the tag on the same line is not swallowed along with it.
regxal = re.compile(r"\[alias:([^\]\n]+)\]")
# NOTE(review): the first page is skipped, as in the original - confirm intent.
for page in pages[1:]:
    # Drop the BOM only if there is one; a blind [1:] would eat the first real
    # character of any page that was saved without a BOM.
    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
    # Only the first alias tag on a page is converted.
    matchobj = re.search(regxal, pagetext)
    if matchobj:
        # Aliases are separated by ";", with or without a following space.
        aliases = [a.strip() for a in matchobj.group(1).split(";") if a.strip()]
        nb = f"---\naliases: {', '.join(aliases)}\n---\n\n"
        newpagetext = nb + pagetext[:matchobj.start()]+pagetext[matchobj.end():]
        # Write with the encoding the page was read with, not the platform
        # default.
        page.write_text(newpagetext, encoding='utf-8')
        i+=1


# Replace absolute file links with obsidian md style links:
# file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md
# [file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md]
# >>>====>
# [yeast-colony-pcr.md](<file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md>)
# This could be improved to handle images by prepending a ! for some links
#
# Group 1 is the URL; surrounding [ ] are consumed if present. Whitespace
# after a bare link is not part of the match, so the text that follows stays
# separated from the new link, and a link at the very end of a page is found
# too. The lookbehind skips URLs that already sit inside a <...> link target.
regxfl = re.compile(r"(?<!<)\[?(file:/[^\s\]]+)\]?")
i=0
def repl_file_links(matchobj):
    """Return a markdown link for a matched ``file:`` URL.

    ``matchobj`` is a regex match whose group 1 holds the URL. The link text
    is the last path component of the URL and the link target is the URL
    wrapped in angle brackets.

    A URL ending in "/" has an empty basename, which would give a link with
    no visible text, so trailing slashes are ignored when picking the text
    and the whole URL is used if nothing else is left.
    """
    url = matchobj.group(1)
    fn = os.path.basename(url.rstrip("/")) or url
    return f"[{fn}](<{url}>)"
# Rewrite every page that contains at least one file link; `i` counts them.
for page in pages:
    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
    newpagetext, r = re.subn(regxfl, repl_file_links, pagetext)
    if r:
        # Write with the encoding the page was read with, not the platform
        # default.
        page.write_text(newpagetext, encoding='utf-8')
        i+=1

# Search and replace all *defined* [wikiwords]: a bracketed word becomes an
# obsidian [[link]] when a page with that name exists. Words without brackets
# are not touched by this step.
#
# One pattern plus a set lookup replaces one compiled pattern per page. The
# per-page patterns needed pages x pages substitutions and were built from
# unescaped page names, so a name containing regex metacharacters such as
# "+" or "(" either failed to compile or matched the wrong text.
stems = {p.stem for p in pages}
# The lookarounds skip words that are already inside [[double brackets]].
preg = re.compile(r"(?<!\[)\[([^\[\]\n]+)\](?!\])")

from tqdm import tqdm
for page in tqdm(pages):
    pagetext = page.read_text(encoding='utf-8')
    newpagetext = preg.sub(
        lambda m: f"[[{m.group(1)}]]" if m.group(1) in stems else m.group(0),
        pagetext,
    )
    if newpagetext != pagetext:
        # Write with the encoding the page was read with, not the platform
        # default.
        page.write_text(newpagetext, encoding='utf-8')
	from pathlib import Path
	import urllib
	import re, os
	from urllib.parse import urlparse
	from string import punctuation
	pages = sorted(Path('.').glob('*.md'))

	# Sometimes wp creates pages with ~ (tilde) in the file name. Rename these to
	# the part of the name before the first tilde, unless that page already exists.
	tildepages = [p for p in pages if "~" in str(p)]
	for tildepage in tildepages:
	    a, b = str(tildepage).split("~", maxsplit=1)
	    np = Path(f"{a}.md")
	    if not np.exists():
	        tildepage.rename(np)

	# Renaming leaves stale paths in `pages`; list the directory again so that the
	# steps below never try to open a file that no longer exists.
	pages = sorted(Path('.').glob('*.md'))

	# Some wp files have unicode and some punctuation percent-quoted in their
	# names. Unquote those names; subpages ("/" in the unquoted name) are renamed
	# with a pipe | character in place of the slash.
	for page in pages:
	    if not page.exists():
	        # Stale entry: the file was already renamed by an earlier step.
	        continue
	    uq = urllib.parse.unquote(str(page), encoding='cp1252')
	    if not str(page) == uq:
	        nm = uq.replace("/", "|")
	        # Path.rename() silently replaces an existing file on POSIX, so leave
	        # the quoted file in place rather than overwrite another page.
	        if not Path(nm).exists():
	            page.rename(nm)

	# Renaming leaves stale paths in `pages`; list the directory again so that the
	# steps below never try to open a file that no longer exists.
	pages = sorted(Path('.').glob('*.md'))


	# Strip page name from first line, remove if only punctuation remains
	i = 0  # number of pages rewritten by this step
	for page in pages:
	    # Drop the BOM only if there is one; a blind [1:] would eat the first real
	    # character of any page that was saved without a BOM.
	    lines = page.read_text(encoding='utf-8').lstrip("\ufeff").splitlines()
	    if not lines:
	        # Empty page: there is no first line to clean up.
	        continue
	    firstline, *rest = lines
	    newfirstline = firstline.strip("# ").replace(page.stem, "")
	    if not set(newfirstline) - set(punctuation):
	        newfirstline = ""
	    if firstline != newfirstline:
	        newpagetext = newfirstline + "\n" + "\n".join(rest)
	        # Write with the encoding the page was read with, not the platform
	        # default.
	        page.write_text(newpagetext, encoding='utf-8')
	        i+=1
	# 7175 pages



	# remove [alias:...] and replace with obsidian alias:
	# https://help.obsidian.md/Linking+notes+and+files/Aliases
	i = 0  # number of pages rewritten by this step
	# [^\]\n]+ stops at the first closing bracket, so other bracketed text that
	# follows the tag on the same line is not swallowed along with it.
	regxal = re.compile(r"\[alias:([^\]\n]+)\]")
	# NOTE(review): the first page is skipped, as in the original - confirm intent.
	for page in pages[1:]:
	    # Drop the BOM only if there is one; a blind [1:] would eat the first real
	    # character of any page that was saved without a BOM.
	    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
	    # Only the first alias tag on a page is converted.
	    matchobj = re.search(regxal, pagetext)
	    if matchobj:
	        # Aliases are separated by ";", with or without a following space.
	        aliases = [a.strip() for a in matchobj.group(1).split(";") if a.strip()]
	        nb = f"---\naliases: {', '.join(aliases)}\n---\n\n"
	        newpagetext = nb + pagetext[:matchobj.start()]+pagetext[matchobj.end():]
	        # Write with the encoding the page was read with, not the platform
	        # default.
	        page.write_text(newpagetext, encoding='utf-8')
	        i+=1


	# Replace absolute file links with obsidian md style links:
	# file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md
	# [file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md]
	# >>>====>
	# [yeast-colony-pcr.md](<file:/home/bjorn/Desktop/mecwiki/yeast-colony-pcr.md>)
	# This could be improved to handle images by prepending a ! for some links
	#
	# Group 1 is the URL; surrounding [ ] are consumed if present. Whitespace
	# after a bare link is not part of the match, so the text that follows stays
	# separated from the new link, and a link at the very end of a page is found
	# too. The lookbehind skips URLs that already sit inside a <...> link target.
	regxfl = re.compile(r"(?<!<)\[?(file:/[^\s\]]+)\]?")
	i=0
	def repl_file_links(matchobj):
	    """Return a markdown link for a matched ``file:`` URL.

	    ``matchobj`` is a regex match whose group 1 holds the URL. The link text
	    is the last path component of the URL and the link target is the URL
	    wrapped in angle brackets.

	    A URL ending in "/" has an empty basename, which would give a link with
	    no visible text, so trailing slashes are ignored when picking the text
	    and the whole URL is used if nothing else is left.
	    """
	    url = matchobj.group(1)
	    fn = os.path.basename(url.rstrip("/")) or url
	    return f"[{fn}](<{url}>)"
	# Rewrite every page that contains at least one file link; `i` counts them.
	for page in pages:
	    pagetext = page.read_text(encoding='utf-8').lstrip("\ufeff")
	    newpagetext, r = re.subn(regxfl, repl_file_links, pagetext)
	    if r:
	        # Write with the encoding the page was read with, not the platform
	        # default.
	        page.write_text(newpagetext, encoding='utf-8')
	        i+=1

	# Search and replace all defined [wikiwords]: a bracketed word becomes an
	# obsidian [[link]] when a page with that name exists. Words without brackets
	# are not touched by this step.
	#
	# One pattern plus a set lookup replaces one compiled pattern per page. The
	# per-page patterns needed pages x pages substitutions and were built from
	# unescaped page names, so a name containing regex metacharacters such as
	# "+" or "(" either failed to compile or matched the wrong text.
	stems = {p.stem for p in pages}
	# The lookarounds skip words that are already inside [[double brackets]].
	preg = re.compile(r"(?<!\[)\[([^\[\]\n]+)\](?!\])")

	from tqdm import tqdm
	for page in tqdm(pages):
	    pagetext = page.read_text(encoding='utf-8')
	    newpagetext = preg.sub(
	        lambda m: f"[[{m.group(1)}]]" if m.group(1) in stems else m.group(0),
	        pagetext,
	    )
	    if newpagetext != pagetext:
	        # Write with the encoding the page was read with, not the platform
	        # default.
	        page.write_text(newpagetext, encoding='utf-8')