# wiki2text: gist by @GoogilyBoogily, created May 8, 2015.

import streams, parsexml, re, strutils, unicode, parseopt2

# Wikitext handling
# -----------------

# This regex matches anywhere in the text that there *might* be wiki syntax
# that we have to clean up.
let ANYTHING_INTERESTING_RE: Regex = re"[*#:;|!{['_]"

# We skip the contents of these HTML tags entirely, and they don't nest
# inside each other.
const SKIP_SPANS = [
  "cite", "ref", "hiero", "gallery", "timeline", "noinclude",
  "caption", "references", "img", "source", "math"
]

# This regex is for matching and skipping over simple wikitext formatting.
# Here's the breakdown of the patterns we're matching:
#
# '''? = Bold and italic formatting (two or three apostrophes)
# ^#\s*redirect.*$ = Redirect syntax (matched case-insensitively)
# ^[ *#:;]+ = Bullets and indentation markers at the start of a line
# ^__.*__$ = Table-of-contents directives
# ^[|!].*$ = Table detritus
#
# "Table detritus" might require some explanation. Tables, delimited by {|
# and |}, are something that we skip separately in filterWikitext. But
# because MediaWiki is insane like this, some tables are made using syntax
# that uses a template for the beginning of the table and |} syntax for the
# end.
#
# Because we don't know what's in templates, when this happens, we end up
# just seeing the inside and end of the table as if it were text. Usually,
# though, these lines begin with the cell separator |, so we can just filter
# those out.
let FORMATTING_RE: Regex = re(r"('''?|^#\s*redirect.*$|^[ *#:;]+|^[|!].*$|^__.*__$)", {reMultiLine, reIgnoreCase})
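
# A small self-check of FORMATTING_RE (an addition for illustration, not
# part of the original gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    # Bold markers match as just the apostrophes...
    assert "'''bold'''".matchLen(FORMATTING_RE) == 3
    # ...while redirects and table detritus consume the whole line.
    let redirect = "#REDIRECT [[Nim]]"
    assert redirect.matchLen(FORMATTING_RE) == redirect.len
    let detritus = "|- class=\"wikitable\""
    assert detritus.matchLen(FORMATTING_RE) == detritus.len
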
# This regex matches sequences of more than one blank line.
let BLANK_LINE_RE: Regex = re"\n\s*\n\s*\n"

# This regex matches a single word-separating character: an ASCII control or
# punctuation character (every ASCII byte except letters, digits, and the
# apostrophe), or a UTF-8-encoded character from U+0080-U+00BF, the General
# Punctuation blocks (U+2000-U+206F), or CJK punctuation (U+3000-U+301F).
# An apostrophe on either side of the separator is swallowed along with it.
let WORD_SEPARATOR_RE: Regex = re"'?([\x01-\x26\x28-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]|(\xc2[\x80-\xbf])|(\xe2\x80.)|(\xe2\x81[\x80-\xaf])|(\xe3\x80[\x80-\x9f]))'?"

# This regex matches self-closing <ref> tags with attributes, which
# filterHTML normalizes before parsing.
let EMPTY_REF_RE: Regex = re(r"<ref [^>]+/\s*>", {reIgnoreCase})

const FAKE_FILENAME = "<wikipage>"

proc skipNestedChars(text: string, pos: var int, open: char, close: char) =
  ## Move our position 'pos' forward in the text, to skip a number of
  ## matching instances of the characters 'open' and 'close'.
  ##
  ## Precondition: text[pos] == open
  ## Postcondition: pos will increase by at least 1
  pos += 1
  var count = 1
  while count > 0 and pos < text.len:
    let nextPos: int = text.find({open, close}, pos)
    if nextPos == -1:
      # We can't find any more closing characters in the text.
      # Jump to the end and abort.
      pos = text.len
      return
    else:
      let nextChar: char = text[nextPos]
      if nextChar == open:
        count += 1
      else:
        count -= 1
      pos = nextPos + 1
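
# A small self-check of skipNestedChars (an addition for illustration, not
# part of the original gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    var pos = 0
    skipNestedChars("{{foo {{bar}} baz}} rest", pos, '{', '}')
    # pos lands just past the outermost closing braces.
    assert pos == 19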

# Convert Unicode text to lowercase.
# I hope this eventually ends up in the standard library.
proc unicodeLower(text: string): string =
  result = ""
  for rune in runes(text):
    result.add(rune.toLower.toUTF8)

# forward declaration
proc filterWikitext(text: string): string

proc extractInternalLink(linkText: string): string =
  # Links with colons might be special MediaWiki syntax. Just throw them
  # all away.
  if linkText.contains(':'):
    return ""
  let contents: string = filterWikitext(linkText[2 .. ^3])
  let lastPart: int = contents.rfind('|') + 1
  return contents[lastPart .. ^1]
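
# A self-check of extractInternalLink (an addition, not part of the
# original gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    # An internal link displays the text after the last pipe.
    assert extractInternalLink("[[Astronomy|the study of stars]]") ==
      "the study of stars"
    # Links containing colons, such as category links, are discarded.
    assert extractInternalLink("[[Category:Physics]]") == ""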

proc extractExternalLink(linkText: string): string =
  let spacePos = linkText.find(' ')
  if spacePos == -1:
    return ""
  else:
    return filterWikitext(linkText[spacePos + 1 .. ^2])
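
# A self-check of extractExternalLink (an addition, not part of the
# original gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    # The displayed text of an external link follows the first space.
    assert extractExternalLink("[http://example.com an example]") ==
      "an example"
    # A bare URL in brackets has no displayed text.
    assert extractExternalLink("[http://example.com]") == ""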

proc filterLink(text: string, pos: var int): string =
  let startPos: int = pos
  # No matter what, move pos to the end of the link
  skipNestedChars(text, pos, '[', ']')
  # Figure out what we skipped. If it's an ugly pseudo-link, return
  # nothing.
  if text.continuesWith("[[", startPos):
    # Get the displayed text out of the internal link.
    return extractInternalLink(text[startPos .. <pos])
  else:
    # Get the displayed text out of the external link.
    return extractExternalLink(text[startPos .. <pos])
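
# A self-check of filterLink's dispatching (an addition, not part of the
# original gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    let text = "[http://example.com an example] and more text"
    var pos = 0
    # filterLink returns the displayed text and moves pos past the link.
    assert filterLink(text, pos) == "an example"
    assert pos == 31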

var tstream: StringStream = newStringStream()

proc filterHTML(origText: string): string =
  ## Scan through a Wiki page as HTML (Wikitext can have HTML tags mixed
  ## into it). Remove HTML tags, as well as the content of certain tags
  ## listed in SKIP_SPANS.
  let text = origText.replace(EMPTY_REF_RE, "<ref />")
  var xml: XmlParser
  # Quickly copy the text into the StringStream object
  shallowCopy(tstream.data, text)
  tstream.setPosition(0)
  result = newStringOfCap(text.len)
  xml.open(tstream, FAKE_FILENAME, options={reportWhitespace})
  while true:
    xml.next()
    case xml.kind
    of xmlElementStart, xmlElementOpen:
      if SKIP_SPANS.contains(xml.elementName):
        let skipTo: string = xml.elementName
        while true:
          xml.next()
          if xml.kind == xmlElementEnd and xml.elementName == skipTo:
            break
          elif xml.kind == xmlEof:
            break
    of xmlCharData, xmlWhitespace:
      result.add(xml.charData)
    of xmlEof:
      break
    else:
      discard
  # return result implicitly
  xml.close
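
# A self-check of filterHTML (an addition, not part of the original gist;
# compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    # Tags are stripped; the contents of SKIP_SPANS tags are dropped.
    assert filterHTML("One<ref>a footnote</ref> two <b>three</b>.") ==
      "One two three."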

proc filterWikitext(text: string): string =
  ## Given the complete wikitext of an article, filter it for the part
  ## that's meant to be read as plain text.
  # This method works by building a 'result' string incrementally, and
  # advancing an index called 'pos' through the text as it goes. Some
  # of the procedures this relies on will also advance 'pos' themselves.
  result = newStringOfCap(text.len)
  var pos = 0
  while pos < text.len:
    # Skip to the next character that could be wiki syntax.
    var found: int = text.find(ANYTHING_INTERESTING_RE, pos)
    if found == -1:
      found = text.len
    # Add everything up until then to the string.
    if found > pos:
      result.add(text[pos .. <found])
    # Figure out what's here and deal with it.
    pos = found
    if pos < text.len:
      let next2chars: string = text[pos .. pos+1]
      if next2chars == "{{" or next2chars == "{|":
        skipNestedChars(text, pos, '{', '}')
      elif text[pos] == '[':
        # pos gets updated by filterLink
        result.add(filterLink(text, pos))
      else:
        # Skip over formatting
        let matched = text.matchLen(FORMATTING_RE, pos)
        if matched > 0:
          pos += matched
        else:
          # We didn't match any of the cases, so output one character
          # and proceed
          result.add($(text[pos]))
          pos += 1
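
# A self-check of filterWikitext (an addition, not part of the original
# gist; compile with -d:selfTest to enable):
when defined(selfTest):
  block:
    # Bold markers are stripped, links reduce to their displayed text,
    # and templates disappear entirely.
    assert filterWikitext("'''Bold''' text with a [[link|label]] and a {{template}}.") ==
      "Bold text with a label and a ."
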
# XML handling
# ------------
# ArticleData is an array that stores four string properties of a page: its
# title, its text content, its redirect value if it redirects to another page,
# and its namespace number. These are the XML values that we care about for
# each page.
type
  TagType = enum
    TITLE, TEXT, REDIRECT, NS
  ArticleData = array[TagType, string]

var RELEVANT_XML_TAGS = ["title", "text", "ns"]
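
# For example (an illustration, not part of the original gist), a page
# element like this one:
#
#   <page>
#     <title>Nim</title>
#     <ns>0</ns>
#     <text>Nim is a programming language.</text>
#   </page>
#
# fills article[TITLE] with "Nim", article[NS] with "0", and article[TEXT]
# with the text content; article[REDIRECT] stays empty, so handleArticle()
# will process the page.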

proc handleArticle(article: ArticleData, tokenize: bool) =
  if article[NS] == "0" and article[REDIRECT] == "":
    if not tokenize:
      echo("= $1 =" % [article[TITLE]])
    # Parse the article inside a try/except block, discarding the errors
    # that appear due to occasional HTML that's flagrantly bad XML.
    try:
      let text = filterWikitext(filterHTML(article[TEXT]))
      if tokenize:
        let words = text.split(WORD_SEPARATOR_RE)
        for word in words:
          if len(word) > 0:
            echo(unicodeLower(word))
      else:
        echo(text.replace(BLANK_LINE_RE, "\n"))
    except IndexError:
      discard
    except RangeError:
      discard

proc readMediaWikiXML(input: Stream, tokenize: bool, filename="<input>") =
  ## Read the XML content that one actually downloads from Wikimedia,
  ## extracting the article content and sending it to the handleArticle()
  ## procedure.
  var xml: XmlParser
  var textBuffer: string = ""
  var article: ArticleData
  for tag in TITLE..NS:
    article[tag] = ""
  # Keep track of what text content represents. Is it article text, or a
  # similar text property of the page? Is it an attribute value on a tag that
  # we should pay attention to?
  var gettingText: bool = false
  var gettingAttribute: bool = false
  xml.open(input, filename, options={reportWhitespace})
  while true:
    # Scan through the XML, handling each token as it arrives.
    xml.next()
    case xml.kind
    of xmlElementStart, xmlElementOpen:
      if RELEVANT_XML_TAGS.contains(xml.elementName):
        # If this is a "title", "text", or "ns" tag, prepare to get its
        # text content. Move our writing pointer to the beginning of
        # the text buffer, so we can overwrite what was there.
        textBuffer.setLen(0)
        gettingText = true
      elif xml.elementName == "page":
        # If this is a new instance of the <page> tag that contains all
        # these tags, then reset the value that won't necessarily be
        # overridden, which is the redirect value.
        article[REDIRECT].setLen(0)
      elif xml.elementName == "redirect":
        # If this is the start of a redirect tag, prepare to get its
        # attribute value.
        gettingAttribute = true
    of xmlAttribute:
      # If we're looking for an attribute value, and we found one, add it
      # to the buffer.
      if gettingAttribute:
        textBuffer.add(xml.attrValue)
    of xmlCharData, xmlWhitespace:
      # If we're looking for text, and we found it, add it to the buffer.
      if gettingText:
        textBuffer.add(xml.charData)
    of xmlElementEnd:
      # When we reach the end of an element we care about, take the text
      # we've found and store it in the 'article' data structure. We can
      # accomplish this quickly by simply swapping their references.
      case xml.elementName
      of "title":
        swap article[TITLE], textBuffer
      of "text":
        swap article[TEXT], textBuffer
      of "redirect":
        swap article[REDIRECT], textBuffer
      of "ns":
        swap article[NS], textBuffer
      of "page":
        # When we reach the end of the <page> tag, send the article
        # data to handleArticle().
        handleArticle(article, tokenize)
      else:
        discard
      # Now that we've reached the end of an element, stop extracting
      # text. (We'll never need to extract text from elements that can
      # have other XML elements nested inside them.)
      gettingText = false
      gettingAttribute = false
    of xmlEof:
      break
    else:
      discard
  xml.close
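
# A usage sketch (an illustration, not part of the original gist): the
# parser works on any Stream, so a small in-memory dump can be fed through
# newStringStream.
#
#   let sample = """<mediawiki><page>
#     <title>Nim</title><ns>0</ns>
#     <text>'''Nim''' is a [[programming language]].</text>
#   </page></mediawiki>"""
#   readMediaWikiXML(newStringStream(sample), tokenize = false)
#
# This should print "= Nim =" followed by "Nim is a programming language."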

const helptext: string = """
wiki2text - transform MediaWiki XML to text

Usage: wiki2text [-t] [-h]

Options:
  -h    Show this help text
  -t    Use a simple tokenizer, outputting one word per line
"""

proc writeHelp() =
  stderr.write(helptext)

when isMainModule:
  var
    tokenize: bool = false
    run: bool = true
  for kind, key, val in getopt():
    case kind
    of cmdLongOption, cmdShortOption:
      case key
      of "tokenize", "t":
        tokenize = true
      of "help", "h":
        writeHelp()
        run = false
      else:
        discard
    else:
      discard
  if run:
    # Note that the input filename is hardcoded to a specific English
    # Wikipedia dump, rather than read from the command line or stdin.
    readMediaWikiXML(newFileStream("enwiki-20150403-pages-articles.xml", FileMode.fmRead), tokenize)
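
# A possible way to build and run this (an assumption, not part of the
# original gist; requires a Nim compiler from the same era, with the
# parseopt2 module, and the dump file in the working directory):
#
#   nim c -d:release wiki2text.nim
#   ./wiki2text > articles.txt      # plain text, one heading per article
#   ./wiki2text -t > tokens.txt     # one lowercased word per line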