Created
April 17, 2023 19:54
-
-
Save james2doyle/95657149dfd386db2cfd4abb7e0e69f4 to your computer and use it in GitHub Desktop.
Linkoln parses wikilinks out of a markdown document, and searches the world wide web to find a hyperlink for each one. https://animaomnium.github.io/keep-stuff-linkable/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# Linkoln by Anima Omnium
# Dedicated to the Public Domain
# https://animaomnium.github.io/keep-stuff-linkable/
# Example input:
# [[programming language:Rust]] is a [[systems programming language]] bootstrapped from [[rust prehistory|OCaml]].
# Example output:
# [Rust][1] is a [systems programming language][2] bootstrapped from [OCaml][3].
# [1]: https://www.rust-lang.org
# [2]: https://en.wikipedia.org/wiki/System_programming_language
# [3]: https://github.com/graydon/rust-prehistory
# Just standard library for portability
import sys
import time
import urllib.parse
import urllib.request
# Input from file, output to stdout
# Suggested usage:
# python linkoln.py INPUT.md > OUTPUT.md
# Read input file name
if len(sys.argv) != 2:
    # Usage goes to stderr: stdout carries the generated document, which the
    # suggested usage redirects into a file.
    print("Usage: linkoln FILE", file=sys.stderr)
    # sys.exit rather than the bare `exit` helper, which only exists when
    # the `site` module has been loaded.
    sys.exit(1)

# Read file
FILE = sys.argv[1]
with open(FILE, "r") as fin:
    INPUT = fin.read()
# Link numbering start: number given to the first link reference
OFFSET = 1

# Ignore wikilinks in code, headings, frontmatter.
# Each pair is (opener, closer). The scanner takes the first opener that
# matches, so the "```" fence must be listed before the single backtick.
IGNORE = [
    ("```", "```"),
    ("#", "\n"),
    ("`", "`"),
    ("+++", "+++"),
]

# Syntax for links
LINK_OPEN = "[["
LINK_CLOSE = "]]"
LINK_QUERY = "|"
LINK_CONTEXT = ":"

# Parser state enum
S_IGNORE = 0
S_SCANIN = 1
S_EATING = 2
# Initialize parser
state = S_SCANIN  # current parser state (one of the S_* values)
rem = INPUT       # remaining, not yet consumed input text
closing = ""      # closer that ends the ignored region being passed through
inside = ""       # text collected so far for the wikilink being read
colophon = []     # (number, wikilink text) entries in order of appearance
def skip(r, amt):
    """Return r with its first amt characters dropped, echoing nothing."""
    return r[amt:]
def eat(r, amt):
    """Echo the first amt characters of r to stdout and return the rest.

    Nothing is appended to the echoed text (no trailing newline).
    """
    print(r[:amt], end="")
    return skip(r, amt)
def check(r, against):
    """Return True when r begins with the string against."""
    return r.startswith(against)
def extract(inside):
    """Split the text between the wikilink brackets into (link, text).

    link is the search query and text is the visible label:
      "query|label"   -> ("query", "label")
      "context:label" -> ("context label", "label")
      "label"         -> ("label", "label")

    Only the first separator splits, so any further "|" or ":" stays in the
    label instead of raising ValueError on unpacking.
    """
    (link, text) = (inside, inside)
    if LINK_QUERY in inside:
        (link, text) = inside.split(LINK_QUERY, 1)
    elif LINK_CONTEXT in inside:
        (link, text) = inside.split(LINK_CONTEXT, 1)
        # The context only narrows the search; the label stays in the query
        link = f"{link} {text}"
    return (link, text)
def emit_link(entry):
    """Echo the inline reference-style link for one wikilink.

    entry is a (number, wikilink text) pair. Prints "[label][number]"
    with no trailing newline.
    """
    (num, inside) = entry
    (_, label) = extract(inside)
    print(f"[{label}][{num}]", end="")
def emit_entry(entry):
    """Echo the link reference definition line for one wikilink.

    entry is a (number, wikilink text) pair. The query part of the wikilink
    is looked up with google_it and printed as "[number]: result".
    """
    (num, inside) = entry
    (query, _) = extract(inside)
    target = google_it(query)
    print(f"[{num}]: {target}")
def google_it(query):
    """Locate a link matching the given query.

    Fetches the DuckDuckGo lite results page for query and scrapes the text
    that follows the first "link-text" marker. Returns "https://" plus that
    text, or "ERROR: <query>" when the request or the scraping fails, so the
    failure is left in the output for a human to fix.
    """
    # Dumbest most fragile hack ever
    quoted = urllib.parse.quote(query, safe="")
    # The query string has to start with "?"; with "&q=" the search terms
    # were sent as part of the path.
    # NOTE(review): confirm this endpoint against the live service.
    url = f"https://lite.duckduckgo.com/lite/?q={quoted}"
    # Don't hammer friends at DuckDuckGo
    time.sleep(0.5)
    try:
        # Timeout so that one stalled request cannot hang the whole run
        with urllib.request.urlopen(url, timeout=10) as response:
            contents = response.read()
        # Parsing html is easy
        top_result = contents.split(b"link-text")[1]
        top_link = top_result.split(b">")[1].split(b"<")[0]
        return "https://" + top_link.decode("utf-8").strip()
    except Exception:
        # Best effort on purpose, but Exception (not a bare except) keeps
        # Ctrl-C and SystemExit working.
        # Leave for human to fix
        return f"ERROR: {query}"
# State machine driving loop: consume `rem` from the front until it is empty
while rem != "":
    # Scanning for next link or comment
    if state == S_SCANIN:
        # Named opener/closer so the builtin open() is not shadowed
        for (opener, closer) in IGNORE:
            if check(rem, opener):
                rem = eat(rem, len(opener))
                closing = closer
                state = S_IGNORE
                break
        if state == S_IGNORE:
            continue
        if check(rem, LINK_OPEN):
            # The brackets themselves are dropped, not echoed
            rem = skip(rem, len(LINK_OPEN))
            inside = ""
            state = S_EATING
            continue
        # Ordinary character: pass it through unchanged
        rem = eat(rem, 1)
    # Eating contents of wikilink
    elif state == S_EATING:
        if check(rem, LINK_CLOSE):
            rem = skip(rem, len(LINK_CLOSE))
            entry = (len(colophon) + OFFSET, inside)
            emit_link(entry)
            colophon.append(entry)
            state = S_SCANIN
        else:
            inside = inside + rem[:1]
            rem = skip(rem, 1)
    # Ignoring contents of comments
    elif state == S_IGNORE:
        if check(rem, closing):
            rem = eat(rem, len(closing))
            state = S_SCANIN
        else:
            rem = eat(rem, 1)
    # Frick your computer is on fire
    else:
        # Raised explicitly: `assert false` was a NameError, and asserts are
        # stripped under -O.
        raise AssertionError("Invalid state")

# Input ended inside an unterminated wikilink: echo what was swallowed so
# the tail of the document is not silently lost.
if state == S_EATING:
    print(LINK_OPEN + inside, end="")
# Google all the queries: end the document body with a newline, then print
# one reference line per collected wikilink, in order of appearance.
print()
for entry in colophon:
    emit_entry(entry)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment