Last active
June 12, 2016 18:09
-
-
Save def-/c36ce30571bdfcd08d5902fb88cbcd5b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Search a Wikipedia dump for a string
#
# XML parse code credit: Rob Speer (https://github.com/rspeer/wiki2text)
#
import re, options, strutils, os, streams, parsexml | |
# Pattern to look for in each article's wikitext: either an archive.org
# snapshot URL (optional "web/" segment followed by a 1-14 digit timestamp
# and a slash) or the opening of a {{wayback ...}} / {{Wayback ...}} template.
# A compiled regex is a runtime value, so this has to be `let`, not `const`.
let mySearchRe = re"archive[.]org/w?e?b?/?[0-9]{1,14}/|[{][{][ ]?[Ww]ayback"

# Settings that are never reassigned while the program runs.
const
  wpDump = "wikipedia.xml"  # Path of the dump file to scan
  maxCount = 10             # Stop searching after X countArticle for speed
                            # testing. Set to 0 to find all.

# Running totals, updated as articles are processed.
var
  countAllArticle = 0       # All article count
  countArticle = 0          # Article titles containing a match (any number
                            # of matches)
  countHits = 0             # Number of matches of search pattern (running
                            # total)
type
  TagType = enum
    ## The per-page fields collected while scanning the dump.
    TITLE,
    TEXT,
    REDIRECT,
    NS

  ## One string slot per `TagType` member, holding the current page's data.
  ArticleData = array[TagType, string]
# | |
# Search text | |
# | |
proc searchText(article: ArticleData): bool {.discardable.} =
  ## Counts occurrences of `mySearchRe` in `article[TEXT]` and updates the
  ## module-level counters (`countAllArticle`, `countArticle`, `countHits`).
  ##
  ## Returns `true` when the article contains at least one match, `false`
  ## otherwise. When `maxCount` is positive and that many matching articles
  ## have been seen, prints the totals and terminates the program.
  inc countAllArticle
  echo "checking ", article[TITLE]

  var
    hits = 0
    at = article[TEXT].find(mySearchRe, 0)
  while at != -1:
    inc hits
    echo " found ", hits
    # Stop once the last match position is no longer inside the text;
    # otherwise resume the search one character past it.
    if at >= article[TEXT].len:
      break
    at = article[TEXT].find(mySearchRe, at + 1)

  if hits == 0:
    return false

  inc countArticle   # one more article title with a match
  countHits += hits  # grand total of pattern matches

  if maxCount > 0 and countArticle >= maxCount:
    # Article limit reached: report the totals and stop right here.
    echo ""
    echo "Articles all: ", countAllArticle
    echo "Articles with a match: ", countArticle
    echo "Number of pattern matches: ", countHits
    quit()

  return true
# Main driver: stream the dump through the XML parser, collect the title,
# namespace, redirect target and text of each <page>, and hand every completed
# page to searchText().

# Elements whose character content is captured into textBuffer.
const RELEVANT_XML_TAGS = ["title", "text", "ns"]

var
  textBuffer = ""           # Scratch buffer for the element being read
  gettingText = false       # True while inside a relevant element
  gettingAttribute = false  # True while inside a <redirect> tag
  article: ArticleData
  xml: XmlParser

let s = newFileStream(wpDump, fmRead)
if s == nil: quit("cannot open the file " & wpDump)

for tag in TITLE..NS: article[tag] = ""

xml.open(s, wpDump, options={reportWhitespace})

while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Empty the text buffer so we can overwrite what
      # was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # If this is a new instance of the <page> tag that contains all
      # these tags, then reset the value that won't necessarily be
      # overridden, which is the redirect value.
      article[REDIRECT].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value. The buffer still holds whatever the previous
      # swap left in it, so clear it first; otherwise that stale text
      # would be prefixed to the redirect value.
      textBuffer.setLen(0)
      gettingAttribute = true
  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)
  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)
  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure. We can
    # accomplish this quickly by simply swapping their references.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of the <page> tag, send the article
      # data to searchText().
      searchText(article)
    else:
      discard
    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false
  of xmlError:
    # Report malformed input instead of silently ignoring it; parsing
    # continues with the next token.
    stderr.writeLine(xml.errorMsg)
  of xmlEof:
    break
  else:
    discard

xml.close()

# The whole dump was scanned without searchText() hitting the maxCount limit
# (it prints the totals itself and quits in that case), so report the totals
# here; otherwise a complete run would end without any summary.
echo ""
echo "Articles all: ", countAllArticle
echo "Articles with a match: ", countArticle
echo "Number of pattern matches: ", countHits
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment