Skip to content

Instantly share code, notes, and snippets.

@gamingrobot
Created March 17, 2013 15:20
Show Gist options
  • Save gamingrobot/5181995 to your computer and use it in GitHub Desktop.
Save gamingrobot/5181995 to your computer and use it in GitHub Desktop.
import sys
import HTMLParser
import re
import MySQLdb
# Shared HTML entity decoder; stripLine() calls its unescape() method.
htmlparser = HTMLParser.HTMLParser()
# Minimum number of words a cleaned line must contain before processLine()
# passes it on to processWords().
numwords = 2
# Module-level MySQL connection to the "wiki1" database on localhost.
# NOTE(review): the credentials (root, empty password) are hard-coded --
# confirm this script is only run against a local development server.
db = MySQLdb.connect(host="localhost", user="root", passwd="", db="wiki1")
# Single cursor shared by every query that processWords() issues.
cur = db.cursor()
def main(filename):
    """Feed every line of the file named by *filename* to processLine().

    The argument is converted with str() before the file is opened, and the
    file is closed again once all lines have been handled.
    """
    with open(str(filename)) as source:
        for raw_line in source:
            processLine(raw_line)
def processLine(line):
    """Clean one raw input line and hand its words to processWords().

    Nothing happens when hasData() rejects the stripped line, when
    stripLine() leaves an empty string, or when fewer than ``numwords``
    words remain.
    """
    stripped = line.strip()
    if not hasData(stripped):
        return

    cleaned = stripLine(stripped)
    if cleaned == "":
        return

    # Splitting on a single space produces empty strings for runs of
    # spaces; keep only the non-empty pieces.
    tokens = [token for token in cleaned.split(" ") if token]
    if len(tokens) < numwords:
        return

    processWords(tokens)
def processWords(words):
    """Record every adjacent pair of *words* in the `words2` table.

    For each consecutive (word1, word2) pair the table is queried through
    the module-level cursor ``cur``. A pair that is not present yet is
    announced on stdout and inserted with a count of 0; a pair that already
    exists has its count incremented by one. The connection is committed
    after each pair.

    NOTE(review): because new pairs start at 0, ``count`` holds the number
    of sightings after the first one -- confirm that this is intended.
    """
    for first, second in zip(words, words[1:]):
        pair = (first, second)
        cur.execute("SELECT COUNT(*) FROM `words2` WHERE `word1`=%s AND `word2`=%s;", pair)
        already_stored = cur.fetchone()[0] != 0
        if already_stored:
            cur.execute("UPDATE `words2` SET `count`=`count`+1 WHERE `word1`=%s AND `word2`=%s LIMIT 1;", pair)
        else:
            print("New Word: %s" % first)
            cur.execute("INSERT INTO `words2` (`word1`, `word2`, `count`) VALUES (%s, %s, 0);", pair)
        # Both branches end with the same commit.
        cur.connection.commit()
def hasData(line):
    """Return True when *line* should be processed as text.

    A line is rejected (False) when it begins with one of the characters
    ``* ! | ; < { [ & .`` or when it both starts and ends with ``==``.
    Every other line, including the empty string, is accepted.
    """
    rejected_prefixes = ("*", "!", "|", ";", "<", "{", "[", "&", ".")
    starts_with_markup = line.startswith(rejected_prefixes)
    wrapped_in_equals = line.startswith("==") and line.endswith("==")
    return not (starts_with_markup or wrapped_in_equals)
def stripLine(line):
    """Reduce one line of wiki markup to lowercase alphanumeric words.

    Args:
        line: a UTF-8 encoded byte string (it is decoded with
            ``decode('utf8')``).

    Returns:
        The decoded, lowercased text with HTML entities unescaped, wiki
        links replaced by their visible text, ``{{...}}`` templates, tags
        and URLs removed, and every character other than ASCII letters,
        digits and spaces dropped. Leading and trailing whitespace is
        stripped; the result may be the empty string.
    """
    text = line.decode('utf8')
    text = htmlparser.unescape(text).lower()  # decode HTML entities, lowercase

    # [[Target|Label]] -> label and [[Link]] -> link. The replacement must
    # be a raw string: a plain '\1' is the control character \x01, which
    # silently threw the link text away.
    text = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', text)

    # Remove {{templates}} innermost-first until none are left. A single
    # greedy pattern would also delete the ordinary text between two
    # separate templates on the same line.
    template = re.compile(r'\{\{[^{}]*\}\}')
    removed = 1
    while removed:
        text, removed = template.subn('', text)

    text = re.sub(r'<.*?>', ' ', text)  # replace tags such as <ref> by a space

    # Drop URLs, including one that ends the line with no trailing space.
    text = re.sub(r'http\S* ?', '', text)

    text = re.sub(r'[^A-Za-z0-9 ]', '', text)  # remove anything extra
    return text.strip()
if __name__ == '__main__':
    # The input file path is required; exit with a usage message rather
    # than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <input-file>" % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment