Created
November 3, 2013 15:30
-
-
Save rHermes/7291462 to your computer and use it in GitHub Desktop.
This was an experiment, trying to parse the XML dump of stackoverflow into a mysql database. Unfortuently, the script failed once the XML dumps got large enough.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import etree | |
import pymysql | |
def fast_iter(context, func): | |
for event, elem in context: | |
func(elem) | |
elem.clear() | |
while elem.getprevious() is not None: | |
del elem.getparent()[0] | |
del context | |
# Since most of the functions are all alike, this is a nice way to generate the functions. | |
def process_sqldump(arr): | |
return lambda elem: arr.append(elem.items()) | |
db = pymysql.connect(host="localhost", user="dev", passwd="xyz", charset="utf8") | |
c = db.cursor() | |
# The main def. | |
def main(folder, dbname): | |
# The items. | |
badges = [] | |
comments = [] | |
posthistory = [] | |
posts = [] | |
users = [] | |
votes = [] | |
# Having the infiles paired with their arrays make for easy looping later. | |
infiles = (('{}/Badges.xml'.format(folder), badges), | |
('{}/Comments.xml'.format(folder), comments), | |
('{}/PostHistory.xml'.format(folder), posthistory), | |
('{}/Posts.xml'.format(folder), posts), | |
('{}/Users.xml'.format(folder), users), | |
('{}/Votes.xml'.format(folder), votes)) | |
tables = ('badges', | |
'comments', | |
'posthistory', | |
'posts', | |
'users', | |
'votes') | |
create_tables = ''' | |
CREATE TABLE IF NOT EXISTS `badges` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`UserId` INT NOT NULL, | |
`Name` VARCHAR(50) NOT NULL, | |
`Date` DATETIME NOT NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
CREATE TABLE IF NOT EXISTS `users` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`Reputation` INT NOT NULL DEFAULT 1, | |
`CreationDate` DATETIME NOT NULL, | |
`DisplayName` VARCHAR(45) NOT NULL, | |
`LastAccessDate` DATETIME NOT NULL, | |
`WebsiteUrl` VARCHAR(70) NULL, | |
`Location` VARCHAR(45) NULL, | |
`AboutMe` MEDIUMTEXT NULL, | |
`Views` INT NULL DEFAULT 0, | |
`UpVotes` INT NULL DEFAULT 0, | |
`DownVotes` INT NULL DEFAULT 0, | |
`ProfileImageUrl` VARCHAR(100) NULL, | |
`EmailHash` VARCHAR(45) NULL, | |
`Age` TINYINT UNSIGNED NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
CREATE TABLE IF NOT EXISTS `comments` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`PostId` INT NOT NULL, | |
`Score` INT NULL, | |
`Text` MEDIUMTEXT NOT NULL, | |
`CreationDate` DATETIME NOT NULL, | |
`UserId` INT NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
CREATE TABLE IF NOT EXISTS `posts` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`PostTypeId` TINYINT NOT NULL, | |
`AcceptedAnswerId` INT NULL, | |
`ParentId` INT NULL, | |
`CreationDate` DATETIME NOT NULL, | |
`Score` INT NOT NULL, | |
`ViewCount` INT NULL, | |
`Body` LONGTEXT NOT NULL, | |
`OwnerUserId` INT NULL, | |
`LastEditorUserId` INT NULL, | |
`LastEditDate` DATETIME NULL, | |
`LastActivityDate` DATETIME NULL, | |
`AnswerCount` INT NULL, | |
`CommentCount` INT NULL, | |
`FavoriteCount` INT NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
CREATE TABLE IF NOT EXISTS `votes` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`PostId` INT NOT NULL, | |
`VoteTypeId` INT NOT NULL, | |
`UserId` INT NULL, | |
`CreationDate` DATETIME NOT NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
CREATE TABLE IF NOT EXISTS `posthistory` ( | |
`Id` INT NOT NULL AUTO_INCREMENT, | |
`PostHistoryTypeId` INT NOT NULL, | |
`PostId` INT NOT NULL, | |
`RevisionGUID` VARCHAR(36) NULL, | |
`CreationDate` DATETIME NULL, | |
`UserDisplayName` VARCHAR(45) NULL, | |
`UserId` INT NULL, | |
`Comment` MEDIUMTEXT NULL, | |
`Text` MEDIUMTEXT NULL, | |
PRIMARY KEY (`Id`)) | |
ENGINE = InnoDB; | |
''' | |
for infile, arr in infiles: | |
context = etree.iterparse(infile, tag='row') | |
fast_iter(context, process_sqldump(arr)) | |
# Creat schema | |
c.execute('DROP SCHEMA IF EXISTS {}'.format(dbname)) | |
c.execute('CREATE SCHEMA {} CHARACTER SET utf8'.format(dbname)) | |
c.execute('USE {}'.format(dbname)) | |
# We don't need the existing tables. | |
for table in tables: | |
c.execute('DROP TABLE IF EXISTS {}'.format(table)) | |
# We need the tables | |
c.execute(create_tables) | |
# This section is going to be all about formatting the data for | |
# clean sql insertion. | |
badgesSQL = [] | |
for elem in badges: | |
elemdict = dict(elem) | |
t = (elemdict['Id'], elemdict['UserId'], elemdict['Name'], elemdict['Date']) | |
badgesSQL.append(t) | |
usersSQL = [] | |
for elem in users: | |
# Since not all rows contain all the attributes, using a dict is a very | |
# good solution, as we get the dict.get() method. | |
elemdict = dict(elem) | |
t = (elemdict['Id'], | |
elemdict['Reputation'], | |
elemdict['CreationDate'], | |
elemdict['DisplayName'], | |
elemdict['LastAccessDate'], | |
elemdict.get('WebsiteUrl'), | |
elemdict.get('Location'), | |
elemdict.get('AboutMe'), | |
elemdict['Views'], | |
elemdict['UpVotes'], | |
elemdict['DownVotes'], | |
elemdict.get('ProfileImageUrl'), | |
elemdict.get('EmailHash'), | |
elemdict.get('Age')) | |
usersSQL.append(t) | |
commentsSQL = [] | |
for elem in comments: | |
elemdict = dict(elem) | |
t = (elemdict['Id'], | |
elemdict['PostId'], | |
elemdict.get('Score', 0), | |
elemdict['Text'], | |
elemdict['CreationDate'], | |
elemdict.get('UserId')) | |
commentsSQL.append(t) | |
postsSQL = [] | |
for elem in posts: | |
elemdict = dict(elem) | |
t = (elemdict['Id'], | |
elemdict['PostTypeId'], | |
elemdict.get('AcceptedAnswerId'), | |
elemdict.get('ParentId'), | |
elemdict['CreationDate'], | |
elemdict.get('Score'), | |
elemdict.get('ViewCount'), | |
elemdict['Body'], | |
elemdict.get('OwnerUserId'), | |
elemdict.get('LastEditorUserId'), | |
elemdict.get('LastEditDate'), | |
elemdict.get('LastActivityDate'), | |
elemdict.get('AnswerCount'), | |
elemdict.get('CommentCount'), | |
elemdict.get('FavoriteCount')) | |
postsSQL.append(t) | |
votesSQL = [] | |
for elem in votes: | |
elemdict = dict(elem) | |
t = (elemdict['Id'], | |
elemdict['PostId'], | |
elemdict['VoteTypeId'], | |
elemdict.get('UserId'), | |
elemdict['CreationDate']) | |
votesSQL.append(t) | |
posthistorySQL = [] | |
for elem in posthistory: | |
elemdict = dict(elem) | |
t = (elemdict['Id'], | |
elemdict['PostHistoryTypeId'], | |
elemdict['PostId'], | |
elemdict['RevisionGUID'], | |
elemdict['CreationDate'], | |
elemdict.get('UserDisplayName'), | |
elemdict.get('UserId'), | |
elemdict.get('Comment'), | |
elemdict.get('Text')) | |
posthistorySQL.append(t) | |
# Here the actual insertion takes place. | |
# Users | |
c.executemany('''INSERT INTO `users` | |
(`Id`, `Reputation`, `CreationDate`, `DisplayName`, `LastAccessDate`, | |
`WebsiteUrl`, `Location`, `AboutMe`, `Views`, `UpVotes`, `DownVotes`, | |
`ProfileImageUrl`, `EmailHash`, `Age`) | |
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', usersSQL) | |
db.commit() | |
# Badges | |
c.executemany("""INSERT INTO badges (Id, Userid, Name, Date) | |
VALUES (%s, %s, %s, %s)""", badgesSQL) | |
db.commit() | |
# Comments | |
c.executemany('''INSERT INTO `comments` | |
(`Id`, `PostId`, `Score`, `Text`, `CreationDate`, `UserId`) | |
VALUES (%s, %s, %s, %s, %s, %s)''', commentsSQL) | |
db.commit() | |
# Posts | |
c.executemany('''INSERT INTO `posts` | |
(`Id`, | |
`PostTypeId`, | |
`AcceptedAnswerId`, | |
`ParentId`, | |
`CreationDate`, | |
`Score`, | |
`ViewCount`, | |
`Body`, | |
`OwnerUserId`, | |
`LastEditorUserId`, | |
`LastEditDate`, | |
`LastActivityDate`, | |
`AnswerCount`, | |
`CommentCount`, | |
`FavoriteCount`) | |
VALUES | |
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', postsSQL) | |
db.commit() | |
# Votes | |
c.executemany('''INSERT INTO `votes` | |
(`Id`, `PostId`, `VoteTypeId`, `UserId`, `CreationDate`) VALUES | |
(%s, %s, %s, %s, %s)''', votesSQL) | |
db.commit() | |
# Post History | |
c.executemany('''INSERT INTO `posthistory` | |
(`Id`, `PostHistoryTypeId`, `PostId`, `RevisionGUID`, `CreationDate`, | |
`UserDisplayName`, `UserId`, `Comment`, `Text`) | |
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''', posthistorySQL) | |
# Sincr we are using InnoDB we need to do this. | |
db.commit() | |
# Close up the DB | |
#main('meta.sharepoint.stackexchange.com', 'metasharepointstackexchangecom') | |
main('superuser.com', 'superuser') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment