james2doyle/Linkoln.py

## Linkoln.py
#!/usr/bin/python3

# Linkoln by Anima Omnium
# Dedicated to the Public Domain
# https://animaomnium.github.io/keep-stuff-linkable/
# [[programming language:Rust]] is a [[systems programming language]] bootstrapped from [[rust prehistory|OCaml]].
# [Rust][1] is a [systems programming language][2] bootstrapped from [OCaml][3].
# [1]: https://www.rust-lang.org
# [2]: https://en.wikipedia.org/wiki/System_programming_language
# [3]: https://github.com/graydon/rust-prehistory

# Just standard library for portability
import sys
import time
import urllib.parse
import urllib.request

# Input from file, output to stdout
# Suggested usage:
# python linkoln.py INPUT.md > OUTPUT.md

# Read input file name. The usage message goes to stderr because stdout
# carries the converted document (it is redirected into OUTPUT.md in the
# suggested usage) and must not be polluted with diagnostics.
if len(sys.argv) != 2:
  print("Usage: linkoln FILE", file=sys.stderr)
  # sys.exit rather than the bare `exit`, which only exists when the site
  # module has been loaded
  sys.exit(1)

# Read the whole file into memory; the parser below works on one string
FILE = sys.argv[1]
with open(FILE, "r") as fin:
  INPUT = fin.read()

# Number given to the first reference; later ones count up from here
OFFSET = 1

# (opener, closer) pairs for regions that are copied through untouched:
# fenced code, headings, inline code, +++ frontmatter.
# The scanner tries these in order and stops at the first match, so "```"
# has to come before "`".
IGNORE = [
  ("```", "```"),
  ("#", "\n"),
  ("`", "`"),
  ("+++", "+++"),
]

# Wikilink delimiters, and the separators recognised inside a wikilink
LINK_OPEN, LINK_CLOSE = "[[", "]]"
LINK_QUERY, LINK_CONTEXT = "|", ":"

# Parser states
S_IGNORE, S_SCANIN, S_EATING = range(3)

# Parser starts in scanning mode over the full input, nothing collected yet
state = S_SCANIN
rem = INPUT
closing = inside = ""
colophon = []

# Drop the first amt characters of r and hand back whatever follows them
def skip(r, amt):
  remainder = r[amt:]
  return remainder

# Echo the first amt characters of r to stdout (no newline is added), then
# return the part of r that comes after them
def eat(r, amt):
  echoed = r[:amt]
  print(echoed, end="")
  return skip(r, amt)

# True when the string r begins with the string `against`
def check(r, against):
  return r.startswith(against)

# Parse the text between [[ and ]] into a (link, text) pair: `link` is the
# search query used to resolve the reference, `text` is what gets displayed.
#   "query|text"   -> ("query", "text")
#   "context:text" -> ("context text", "text")
#   "text"         -> ("text", "text")
# LINK_QUERY takes precedence over LINK_CONTEXT. Only the first separator
# splits, so a link holding several "|" or ":" keeps the extras in `text`
# instead of raising ValueError on tuple unpacking.
def extract(inside):
  if LINK_QUERY in inside:
    link, _, text = inside.partition(LINK_QUERY)
    return (link, text)
  if LINK_CONTEXT in inside:
    context, _, text = inside.partition(LINK_CONTEXT)
    return (f"{context} {text}", text)
  return (inside, inside)

# Print the inline half of a reference-style link, "[text][num]", for an
# entry of the form (num, raw wikilink contents). No newline is appended.
def emit_link(entry):
  number, raw = entry
  label = extract(raw)[1]
  print(f"[{label}][{number}]", end="")

# Print the definition half of a reference-style link, "[num]: target", for
# an entry of the form (num, raw wikilink contents). The target is whatever
# google_it returns for the entry's query.
def emit_entry(entry):
  number, raw = entry
  query = extract(raw)[0]
  target = google_it(query)
  print(f"[{number}]: {target}")

# Locate link matching given query by scraping the DuckDuckGo Lite results
# page. Returns "https://" plus the text that follows the first "link-text"
# marker in the page, or the placeholder "ERROR: <query>" when the request
# or the scrape fails.
def google_it(query):
  # Dumbest most fragile hack ever
  quoted = urllib.parse.quote(query, safe='')
  # Don't hammer friends at DuckDuckGo
  time.sleep(0.5)
  try:
    # The query string must start with "?"; the timeout keeps one stalled
    # connection from hanging the whole run
    url = f"https://lite.duckduckgo.com/lite/?q={quoted}"
    with urllib.request.urlopen(url, timeout=10) as response:
      contents = response.read()
    # Parsing html is easy
    top_result = contents.split(b"link-text")[1]
    top_link = top_result.split(b">")[1].split(b"<")[0]
    # Decoding stays inside the try so a bad byte also yields the placeholder
    return "https://" + top_link.decode("utf-8").strip()
  except Exception:
    # Leave for human to fix. Exception (not a bare except) so that Ctrl-C
    # still aborts the run.
    return f"ERROR: {query}"

# State machine driving loop: walks the input front to back, echoing it to
# stdout. Ignored regions pass through untouched; each [[wikilink]] is
# replaced by a numbered reference link and recorded in `colophon`.
# NOTE(review): ignore openers match at any position, so a "#" in running
# text (not only at the start of a heading) suppresses wikilinks up to the
# end of that line.
while rem != "":
  # Scanning for next link or comment
  if state == S_SCANIN:
    # Named opener/closer rather than open/close: at module level the loop
    # variable would otherwise rebind the builtin open()
    for (opener, closer) in IGNORE:
      if check(rem, opener):
        rem = eat(rem, len(opener))
        closing = closer
        state = S_IGNORE
        break
    else:
      # No ignore region starts here: either a wikilink opens or one plain
      # character is copied through
      if check(rem, LINK_OPEN):
        rem = skip(rem, len(LINK_OPEN))
        inside = ""
        state = S_EATING
      else:
        rem = eat(rem, 1)

  # Eating contents of wikilink
  elif state == S_EATING:
    if check(rem, LINK_CLOSE):
      rem = skip(rem, len(LINK_CLOSE))
      entry = (len(colophon) + OFFSET, inside)
      emit_link(entry)
      colophon.append(entry)
      state = S_SCANIN
    else:
      inside = inside + rem[:1]
      rem = skip(rem, 1)

  # Ignoring contents of comments
  elif state == S_IGNORE:
    if check(rem, closing):
      rem = eat(rem, len(closing))
      state = S_SCANIN
    else:
      rem = eat(rem, 1)

  # Frick your computer is on fire
  else:
    # Explicit raise: survives `python -O`, unlike an assert statement
    raise AssertionError("Invalid state")

# Input ended inside an unterminated wikilink: put the swallowed text back
# instead of silently dropping it from the output
if state == S_EATING:
  print(LINK_OPEN + inside, end="")

# Google all the queries
print()
for entry in colophon:
  emit_entry(entry)
	#!/usr/bin/python3

	# Linkoln by Anima Omnium
	# Dedicated to the Public Domain
	# https://animaomnium.github.io/keep-stuff-linkable/
	# [[programming language:Rust]] is a [[systems programming language]] bootstrapped from [[rust prehistory|OCaml]].
	# [Rust][1] is a [systems programming language][2] bootstrapped from [OCaml][3].
	# [1]: https://www.rust-lang.org
	# [2]: https://en.wikipedia.org/wiki/System_programming_language
	# [3]: https://github.com/graydon/rust-prehistory

	# Just standard library for portability
	import sys
	import urllib.request
	import time

	# Input from file, output to stdout
	# Suggested usage:
	# python linkoln.py INPUT.md > OUTPUT.md

	# Read input file name. The usage message goes to stderr because stdout
	# carries the converted document (it is redirected into OUTPUT.md in the
	# suggested usage) and must not be polluted with diagnostics.
	if len(sys.argv) != 2:
	  print("Usage: linkoln FILE", file=sys.stderr)
	  # sys.exit rather than the bare `exit`, which only exists when the site
	  # module has been loaded
	  sys.exit(1)

	# Read the whole file into memory; the parser below works on one string
	FILE = sys.argv[1]
	with open(FILE, "r") as fin:
	  INPUT = fin.read()

	# Link numbering start
	OFFSET = 1

	# Ignore wikilinks in code, headings, frontmatter.
	# Each entry is an (opener, closer) pair; the scanner tries them in order
	# and stops at the first match, so "```" has to come before "`".
	IGNORE = [
	  ("```", "```"),
	  ("#", "\n"),
	  ("`", "`"),
	  ("+++", "+++"),
	]

	# Syntax for links
	LINK_OPEN = "[["
	LINK_CLOSE = "]]"
	# A plain pipe: a backslash in front of it would become part of the string
	# and the separator would never match "[[query|text]]"
	LINK_QUERY = "|"
	LINK_CONTEXT = ":"

	# Parser state enum
	S_IGNORE = 0
	S_SCANIN = 1
	S_EATING = 2

	# Initialize parser
	state = S_SCANIN
	rem = INPUT
	closing = ""
	inside = ""
	colophon = []

	# Drop the first amt characters of r and hand back whatever follows them
	def skip(r, amt):
	  return r[amt:]

	# Echo the first amt characters of r to stdout (no newline is added), then
	# return the part of r that comes after them
	def eat(r, amt):
	  print(r[:amt], end="")
	  return skip(r, amt)

	# True when the string r begins with the string `against`
	def check(r, against):
	  return r.startswith(against)

	# Parse the text between [[ and ]] into a (link, text) pair: `link` is the
	# search query used to resolve the reference, `text` is what gets displayed.
	#   "query|text"   -> ("query", "text")
	#   "context:text" -> ("context text", "text")
	#   "text"         -> ("text", "text")
	# LINK_QUERY takes precedence over LINK_CONTEXT. Only the first separator
	# splits, so a link holding several "|" or ":" keeps the extras in `text`
	# instead of raising ValueError on tuple unpacking.
	def extract(inside):
	  if LINK_QUERY in inside:
	    link, _, text = inside.partition(LINK_QUERY)
	    return (link, text)
	  if LINK_CONTEXT in inside:
	    context, _, text = inside.partition(LINK_CONTEXT)
	    return (f"{context} {text}", text)
	  return (inside, inside)

	# Print the inline half of a reference-style link, "[text][num]", for an
	# entry of the form (num, raw wikilink contents). No newline is appended.
	def emit_link(entry):
	  (num, inside) = entry
	  (_, inside) = extract(inside)
	  print(f"[{inside}][{num}]", end="")

	# Print the definition half of a reference-style link, "[num]: target", for
	# an entry of the form (num, raw wikilink contents). The target is whatever
	# google_it returns for the entry's query.
	def emit_entry(entry):
	  (num, inside) = entry
	  (inside, _) = extract(inside)
	  inside = google_it(inside)
	  print(f"[{num}]: {inside}")

	# Locate link matching given query by scraping the DuckDuckGo Lite results
	# page. Returns "https://" plus the text that follows the first "link-text"
	# marker in the page, or the placeholder "ERROR: <query>" when the request
	# or the scrape fails.
	def google_it(query):
	  # Dumbest most fragile hack ever
	  quoted = urllib.parse.quote(query, safe='')
	  # Don't hammer friends at DuckDuckGo
	  time.sleep(0.5)
	  try:
	    # The query string must start with "?"; the timeout keeps one stalled
	    # connection from hanging the whole run
	    url = f"https://lite.duckduckgo.com/lite/?q={quoted}"
	    with urllib.request.urlopen(url, timeout=10) as response:
	      contents = response.read()
	    # Parsing html is easy
	    top_result = contents.split(b"link-text")[1]
	    top_link = top_result.split(b">")[1].split(b"<")[0]
	    # Decoding stays inside the try so a bad byte also yields the placeholder
	    return "https://" + top_link.decode("utf-8").strip()
	  except Exception:
	    # Leave for human to fix. Exception (not a bare except) so that Ctrl-C
	    # still aborts the run.
	    return f"ERROR: {query}"

	# State machine driving loop: walks the input front to back, echoing it to
	# stdout. Ignored regions pass through untouched; each [[wikilink]] is
	# replaced by a numbered reference link and recorded in `colophon`.
	# NOTE(review): ignore openers match at any position, so a "#" in running
	# text (not only at the start of a heading) suppresses wikilinks up to the
	# end of that line.
	while rem != "":
	  # Scanning for next link or comment
	  if state == S_SCANIN:
	    # Named opener/closer rather than open/close: at module level the loop
	    # variable would otherwise rebind the builtin open()
	    for (opener, closer) in IGNORE:
	      if check(rem, opener):
	        rem = eat(rem, len(opener))
	        closing = closer
	        state = S_IGNORE
	        break
	    else:
	      # No ignore region starts here: either a wikilink opens or one plain
	      # character is copied through
	      if check(rem, LINK_OPEN):
	        rem = skip(rem, len(LINK_OPEN))
	        inside = ""
	        state = S_EATING
	      else:
	        rem = eat(rem, 1)

	  # Eating contents of wikilink
	  elif state == S_EATING:
	    if check(rem, LINK_CLOSE):
	      rem = skip(rem, len(LINK_CLOSE))
	      entry = (len(colophon) + OFFSET, inside)
	      emit_link(entry)
	      colophon.append(entry)
	      state = S_SCANIN
	    else:
	      inside = inside + rem[:1]
	      rem = skip(rem, 1)

	  # Ignoring contents of comments
	  elif state == S_IGNORE:
	    if check(rem, closing):
	      rem = eat(rem, len(closing))
	      state = S_SCANIN
	    else:
	      rem = eat(rem, 1)

	  # Frick your computer is on fire
	  else:
	    # Explicit raise: survives `python -O`, unlike an assert statement
	    raise AssertionError("Invalid state")

	# Input ended inside an unterminated wikilink: put the swallowed text back
	# instead of silently dropping it from the output
	if state == S_EATING:
	  print(LINK_OPEN + inside, end="")

	# Google all the queries
	print()
	for entry in colophon:
	  emit_entry(entry)