Skip to content

Instantly share code, notes, and snippets.

@pebbie
Last active August 29, 2015 13:56
Show Gist options
  • Save pebbie/9262557 to your computer and use it in GitHub Desktop.
Save pebbie/9262557 to your computer and use it in GitHub Desktop.
"""
file: html2wikitext.py
author: Peb Ruswono Aryan (28.02.2014)
desc:
script to convert HTML extracted by [perundangan](http://github.com/petrabarus/perundangan)
into the wikitext used on http://hukum.pebbie.org
"""
import re
import sys
# Flags for paired-tag rules: case-insensitive, and DOTALL so that an element
# whose content spans several lines is still matched as one unit.
_TAG_FLAGS = re.I | re.S

# Ordered (compiled pattern, replacement) rules that reduce an HTML document
# to plain text. Order matters: containers are unwrapped outside-in, <head>
# (and any script/style element) is dropped with its content, and the final
# catch-all removes whatever single tags are left.
#
# Notes on the patterns:
#   * ``\b`` after the tag name keeps ``<p`` from matching ``<pre>`` etc.
#   * ``[^>]*`` skips attributes; a dot inside an attribute value is allowed.
#   * The catch-all only matches ``<`` followed by a letter, ``/``, ``!`` or
#     ``?`` and stops at the first ``>``, so text between two tags on the same
#     line is kept and a bare "x < 5" in the text is left alone.
replacer = [
    (re.compile(r"<html\b[^>]*>(.*?)</html>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<head\b[^>]*>(.*?)</head>", _TAG_FLAGS), r""),
    (re.compile(r"<(script|style)\b[^>]*>.*?</\1>", _TAG_FLAGS), r""),
    (re.compile(r"<body\b[^>]*>(.*?)</body>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<p\b[^>]*>(.*?)</p>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<small\b[^>]*>(.*?)</small>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<tbody\b[^>]*>(.*?)</tbody>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<thead\b[^>]*>(.*?)</thead>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<table\b[^>]*>(.*?)</table>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<td\b[^>]*>(.*?)</td>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<tr\b[^>]*>(.*?)</tr>", _TAG_FLAGS), r"\1"),
    (re.compile(r"<center\b[^>]*>(.*?)</center>", _TAG_FLAGS), r"\n\1\n"),
    (re.compile(r"<div\b[^>]*>(.*?)</div>", _TAG_FLAGS), r"\n\1"),
    (re.compile(r"<font\b[^>]*>(.*?)</font>", _TAG_FLAGS), r"\1"),
    # Void elements: rules and line breaks become newlines, images vanish.
    (re.compile(r"<hr\b[^>]*>", re.I), "\n"),
    (re.compile(r"<img\b[^>]*>", re.I), ""),
    (re.compile(r"<br\b[^>]*>", re.I), "\n"),
    # Catch-all: strip any remaining tag, comment opener or doctype.
    (re.compile(r"<[/!?a-zA-Z][^>]*>"), ""),
]
def get_text(src):
    """Return *src* with every rule in the module-level ``replacer`` applied.

    The rules are (compiled pattern, replacement) pairs and are applied in
    list order, each one to the result of the previous substitution.
    """
    text = src
    for pattern, replacement in replacer:
        text = pattern.sub(replacement, text)
    return text
# Cross-reference rules, tried in this order (most specific first).
# Each entry is (pattern, link template, tail template):
#   * the pattern requires at least one leading space and at least one
#     trailing separator (comma, full stop or space);
#   * the link template is the wikitext link; "${pp}" stands for the caller's
#     anchor prefix;
#   * the tail template is what replaces the consumed trailing separators --
#     either a single space or the last separator that was matched.
_LINK_RULES = [
    (re.compile(r"[ ]+Peraturan Pemerintah Nomor (\d+) Tahun (\d+)([,. ])+"),
     r"[[PP/\1/\2|Peraturan Pemerintah Nomor \1 Tahun \2]]", " "),
    (re.compile(r"[ ]+Pasal (\d+) ayat \((\d+)\) huruf (\w+)([,. ])+"),
     r"[[#Pasal\1Ayat\2_\3|Pasal \1 ayat (\2) huruf \3]]", " "),
    (re.compile(r"[ ]+Pasal (\d+) ayat \((\d+)\)([,. ])+"),
     r"[[#Pasal\1Ayat\2|Pasal \1 ayat (\2)]]", " "),
    (re.compile(r"[ ]+ayat \((\d+)\) huruf (\w+)([,. ])+"),
     r"[[#${pp}Ayat\1_\2|ayat (\1) huruf \2]]", " "),
    (re.compile(r"[ ]+ayat \((\d+)\)([,. ])+"),
     r"[[#${pp}Ayat\1|ayat (\1)]]", r"\2"),
    (re.compile(r"[ ]+Pasal (\d+)([,. ])+"),
     r"[[#Pasal\1|Pasal \1]]", r"\2"),
    (re.compile(r"[ ]+huruf (\w+)([,. ])+"),
     r"[[#${pp}\1|huruf (\1)]]", r"\2"),
]

# Matches the temporary markers that stand in for finished links.
_LINK_MARKER = re.compile("\x00(\\d+)\x00")


def linkify(segment, prefix=""):
    """Turn textual references in *segment* into wikitext links.

    Recognised references are "Peraturan Pemerintah Nomor N Tahun Y",
    "Pasal N", "ayat (N)" and "huruf x", alone or in the combinations listed
    in ``_LINK_RULES``. A reference is only recognised when it is preceded by
    a space and followed by a space, comma or full stop.

    Args:
        segment: the line of text to process.
        prefix: anchor prefix substituted for "${pp}" in relative references
            ("ayat ..." / "huruf ..." without an explicit "Pasal").

    Returns:
        The text with references replaced by ``[[target|label]]`` links.
    """
    links = []

    for pattern, link_template, tail_template in _LINK_RULES:
        def protect(match, link_template=link_template,
                    tail_template=tail_template):
            # Park the finished link behind an opaque marker so that later,
            # more general rules cannot match inside its label (which would
            # produce a link nested inside another link).
            links.append(match.expand(link_template))
            return " \x00%d\x00%s" % (len(links) - 1,
                                      match.expand(tail_template))

        segment = pattern.sub(protect, segment)

    def restore(match):
        index = int(match.group(1))
        # Leave anything that merely looks like a marker (NUL bytes that were
        # already in the input) untouched.
        if index < len(links):
            return links[index]
        return match.group(0)

    segment = _LINK_MARKER.sub(restore, segment)
    if "${pp}" in segment:
        segment = segment.replace("${pp}", prefix)
    return segment
# Closing tag of the per-Pasal <ol> that wraps its ayat items. Its trailing
# newline distinguishes it from the closer of a nested list, so it can be
# recognised on the stack of pending closers.
_AYAT_LIST_CLOSE = "</ol>\n"


def _flush(stack, output):
    """Move every pending entry from *stack* to *output*, last pushed first."""
    while stack:
        output.append(stack.pop())


def _heading(level, parts):
    """Build a wikitext heading of *level* from *parts* joined by " <br/>"."""
    marker = "=" * level
    return marker + " <br/>".join(parts) + marker


def _emit_ayat(line, segmentid, output, stack):
    """Emit the <li> for one "(N) ..." ayat line.

    If the line ends with ":" the item is left open and a nested <ol> is
    started for the enumeration that follows; the matching closers are pushed
    on *stack*. Returns the id prefix for the nested items in that case,
    otherwise None.
    """
    ayatid = line[1:line.index(")")]
    item = "<li id=\"%sAyat%s\">%s" % (segmentid, ayatid,
                                       linkify(line, segmentid))
    if line.endswith(":"):
        output.append(item)
        stack.append("</li>")
        output.append("<ol style=\"list-style-type:none;\">")
        stack.append("</ol>")
        return "%sAyat%s_" % (segmentid, ayatid)
    output.append(item + "</li>")
    return None


def wikify(src):
    """Convert the plain text of a regulation into wikitext.

    The text is processed line by line with a small state machine:

    * lines starting with "UNDANG"/"PERATURAN" open the level-1 title,
      "BAB" a level-2 heading, "bagian" a level-3 heading; following lines
      are appended to the heading (joined with " <br/>") until a blank line;
    * a line starting with "pasal" becomes a level-3 heading and sets the
      anchor id ("Pasal" + the second word) used for what follows;
    * inside a Pasal, lines starting with "(" are ayat items of an <ol>;
      a line ending with ":" opens a nested list whose following lines
      become <li> items; other text is wrapped in a <span>;
    * after a blank line, a line that is "PENJELASAN" or starts with
      "TAMBAHAN" ends the conversion;
    * anything else is copied through stripped.

    Args:
        src: the document text, lines separated by "\\n".

    Returns:
        The wikitext, always ending with "[[Category:Peraturan]]".
    """
    output = []
    buffer = []      # lines of the heading currently being collected
    stack = []       # pending closers / heading, emitted by _flush
    state = "START"
    nempty = 0       # consecutive blank lines seen
    segmentid = ""   # anchor id of the current Pasal
    fragment = ""    # id prefix for items of the current nested list
    title_hints = ["UNDANG", "PERATURAN"]

    for line in src.split("\n"):
        stripped = line.strip()
        ul = stripped.upper()
        ll = stripped.lower()
        is_caps = bool(ul) and stripped == ul

        if state != "JUDUL" and any(hint in line and ul.startswith(hint)
                                    for hint in title_hints):
            _flush(stack, output)
            state = "JUDUL"
            buffer = [line]
            stack = [_heading(1, buffer)]
        elif state == "JUDUL" and is_caps:
            buffer.append(line)
            stack = [_heading(1, buffer) + "\n"]
        elif state != "BAB" and "BAB" in line and ul.startswith("BAB"):
            _flush(stack, output)
            state = "BAB"
            buffer = [line]
            stack = [_heading(2, buffer)]
        elif state == "BAB" and is_caps:
            buffer.append(line)
            stack = [_heading(2, buffer)]
        elif state != "BAGIAN" and ll.startswith("bagian"):
            _flush(stack, output)
            state = "BAGIAN"
            buffer = [line]
            stack = [_heading(3, buffer)]
        elif state == "BAGIAN" and ul:
            buffer.append(line)
            stack = [_heading(3, buffer)]
        elif ll.startswith("pasal"):
            state = "PASAL"
            _flush(stack, output)
            output.append(_heading(3, [line]))
            # Second whitespace-separated word is the article number; a bare
            # "Pasal" line yields just "Pasal" instead of raising.
            words = line.split()
            segmentid = "Pasal" + (words[1] if len(words) > 1 else "")
        elif not stripped:
            if state != "EMPTY":
                # Blank lines directly after a Pasal heading or inside a
                # list do not end the current element.
                if state in ("PASAL", "LIST"):
                    continue
                nempty = 1
                _flush(stack, output)
            else:
                nempty += 1
            state = "EMPTY"
            if nempty > 2:
                output.append("\n")
                nempty = 0
        elif state == "PASAL":
            if ll.startswith("("):
                # First ayat of this Pasal: open the wrapping list and push
                # its closer first so it is emitted after any nested closers.
                output.append("<ol style=\"list-style-type:none;"
                              "margin-left:0;\" id=\"%s\">" % segmentid)
                stack.append(_AYAT_LIST_CLOSE)
                nested = _emit_ayat(line, segmentid, output, stack)
                if nested is None:
                    state = "AYAT"
                else:
                    state = "LIST"
                    fragment = nested
            else:
                state = "ISI"
                output.append("<span id=\"%s\">" % (segmentid))
                output.append(linkify(stripped, segmentid))
                stack.append("</span>\n")
                if line.endswith(":"):
                    state = "LIST"
                    fragment = "%s_" % segmentid
                    output.append("<ol style=\"list-style-type:none;\">")
                    stack.append("</ol>")
        elif state == "AYAT" and ll.startswith("("):
            nested = _emit_ayat(line, segmentid, output, stack)
            if nested is not None:
                state = "LIST"
                fragment = nested
        elif state == "LIST":
            if ll.startswith("("):
                # A new ayat ends the nested list only; keep the wrapping
                # ayat list open so this item stays inside it.
                while stack and stack[-1] != _AYAT_LIST_CLOSE:
                    output.append(stack.pop())
                nested = _emit_ayat(line, segmentid, output, stack)
                if nested is None:
                    state = "AYAT"
                else:
                    state = "LIST"
                    fragment = nested
            else:
                itemid = line.split(".")[0].strip()
                output.append("<li id=\"%s%s\">%s</li>"
                              % (fragment, itemid, linkify(line, fragment)))
        elif state == "EMPTY" and (ul == "PENJELASAN"
                                   or ul.startswith("TAMBAHAN")):
            break
        else:
            output.append(stripped)

    # Emit whatever is still pending (open tags, a heading on the last line).
    _flush(stack, output)
    output.append("[[Category:Peraturan]]")
    return "\n".join(output)
if __name__ == "__main__":
    # Usage: html2wikitext.py INPUT_HTML [OUTPUT_FILE]
    # Reads the HTML file, strips the markup and converts it to wikitext.
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        with open(input_file) as f:
            txt = f.read()
        txt = get_text(txt)
        txt = wikify(txt)
        if len(sys.argv) > 2:
            output_file = sys.argv[2]
            with open(output_file, "w") as f:
                f.write(txt)
        else:
            # No output file given: print the result instead of silently
            # discarding the conversion.
            sys.stdout.write(txt + "\n")
    else:
        sys.stderr.write("usage: %s INPUT_HTML [OUTPUT_FILE]\n" % sys.argv[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment