Skip to content

Instantly share code, notes, and snippets.

@rHermes
Created November 3, 2013 15:30
Show Gist options
  • Save rHermes/7291462 to your computer and use it in GitHub Desktop.
Save rHermes/7291462 to your computer and use it in GitHub Desktop.
This was an experiment, trying to parse the XML dump of stackoverflow into a mysql database. Unfortuently, the script failed once the XML dumps got large enough.
from lxml import etree
import pymysql
def fast_iter(context, func):
for event, elem in context:
func(elem)
elem.clear()
while elem.getprevious() is not None:
del elem.getparent()[0]
del context
# Since most of the functions are all alike, this is a nice way to generate the functions.
def process_sqldump(arr):
return lambda elem: arr.append(elem.items())
db = pymysql.connect(host="localhost", user="dev", passwd="xyz", charset="utf8")
c = db.cursor()
# The main def.
def main(folder, dbname):
# The items.
badges = []
comments = []
posthistory = []
posts = []
users = []
votes = []
# Having the infiles paired with their arrays make for easy looping later.
infiles = (('{}/Badges.xml'.format(folder), badges),
('{}/Comments.xml'.format(folder), comments),
('{}/PostHistory.xml'.format(folder), posthistory),
('{}/Posts.xml'.format(folder), posts),
('{}/Users.xml'.format(folder), users),
('{}/Votes.xml'.format(folder), votes))
tables = ('badges',
'comments',
'posthistory',
'posts',
'users',
'votes')
create_tables = '''
CREATE TABLE IF NOT EXISTS `badges` (
`Id` INT NOT NULL AUTO_INCREMENT,
`UserId` INT NOT NULL,
`Name` VARCHAR(50) NOT NULL,
`Date` DATETIME NOT NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `users` (
`Id` INT NOT NULL AUTO_INCREMENT,
`Reputation` INT NOT NULL DEFAULT 1,
`CreationDate` DATETIME NOT NULL,
`DisplayName` VARCHAR(45) NOT NULL,
`LastAccessDate` DATETIME NOT NULL,
`WebsiteUrl` VARCHAR(70) NULL,
`Location` VARCHAR(45) NULL,
`AboutMe` MEDIUMTEXT NULL,
`Views` INT NULL DEFAULT 0,
`UpVotes` INT NULL DEFAULT 0,
`DownVotes` INT NULL DEFAULT 0,
`ProfileImageUrl` VARCHAR(100) NULL,
`EmailHash` VARCHAR(45) NULL,
`Age` TINYINT UNSIGNED NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `comments` (
`Id` INT NOT NULL AUTO_INCREMENT,
`PostId` INT NOT NULL,
`Score` INT NULL,
`Text` MEDIUMTEXT NOT NULL,
`CreationDate` DATETIME NOT NULL,
`UserId` INT NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `posts` (
`Id` INT NOT NULL AUTO_INCREMENT,
`PostTypeId` TINYINT NOT NULL,
`AcceptedAnswerId` INT NULL,
`ParentId` INT NULL,
`CreationDate` DATETIME NOT NULL,
`Score` INT NOT NULL,
`ViewCount` INT NULL,
`Body` LONGTEXT NOT NULL,
`OwnerUserId` INT NULL,
`LastEditorUserId` INT NULL,
`LastEditDate` DATETIME NULL,
`LastActivityDate` DATETIME NULL,
`AnswerCount` INT NULL,
`CommentCount` INT NULL,
`FavoriteCount` INT NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `votes` (
`Id` INT NOT NULL AUTO_INCREMENT,
`PostId` INT NOT NULL,
`VoteTypeId` INT NOT NULL,
`UserId` INT NULL,
`CreationDate` DATETIME NOT NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
CREATE TABLE IF NOT EXISTS `posthistory` (
`Id` INT NOT NULL AUTO_INCREMENT,
`PostHistoryTypeId` INT NOT NULL,
`PostId` INT NOT NULL,
`RevisionGUID` VARCHAR(36) NULL,
`CreationDate` DATETIME NULL,
`UserDisplayName` VARCHAR(45) NULL,
`UserId` INT NULL,
`Comment` MEDIUMTEXT NULL,
`Text` MEDIUMTEXT NULL,
PRIMARY KEY (`Id`))
ENGINE = InnoDB;
'''
for infile, arr in infiles:
context = etree.iterparse(infile, tag='row')
fast_iter(context, process_sqldump(arr))
# Creat schema
c.execute('DROP SCHEMA IF EXISTS {}'.format(dbname))
c.execute('CREATE SCHEMA {} CHARACTER SET utf8'.format(dbname))
c.execute('USE {}'.format(dbname))
# We don't need the existing tables.
for table in tables:
c.execute('DROP TABLE IF EXISTS {}'.format(table))
# We need the tables
c.execute(create_tables)
# This section is going to be all about formatting the data for
# clean sql insertion.
badgesSQL = []
for elem in badges:
elemdict = dict(elem)
t = (elemdict['Id'], elemdict['UserId'], elemdict['Name'], elemdict['Date'])
badgesSQL.append(t)
usersSQL = []
for elem in users:
# Since not all rows contain all the attributes, using a dict is a very
# good solution, as we get the dict.get() method.
elemdict = dict(elem)
t = (elemdict['Id'],
elemdict['Reputation'],
elemdict['CreationDate'],
elemdict['DisplayName'],
elemdict['LastAccessDate'],
elemdict.get('WebsiteUrl'),
elemdict.get('Location'),
elemdict.get('AboutMe'),
elemdict['Views'],
elemdict['UpVotes'],
elemdict['DownVotes'],
elemdict.get('ProfileImageUrl'),
elemdict.get('EmailHash'),
elemdict.get('Age'))
usersSQL.append(t)
commentsSQL = []
for elem in comments:
elemdict = dict(elem)
t = (elemdict['Id'],
elemdict['PostId'],
elemdict.get('Score', 0),
elemdict['Text'],
elemdict['CreationDate'],
elemdict.get('UserId'))
commentsSQL.append(t)
postsSQL = []
for elem in posts:
elemdict = dict(elem)
t = (elemdict['Id'],
elemdict['PostTypeId'],
elemdict.get('AcceptedAnswerId'),
elemdict.get('ParentId'),
elemdict['CreationDate'],
elemdict.get('Score'),
elemdict.get('ViewCount'),
elemdict['Body'],
elemdict.get('OwnerUserId'),
elemdict.get('LastEditorUserId'),
elemdict.get('LastEditDate'),
elemdict.get('LastActivityDate'),
elemdict.get('AnswerCount'),
elemdict.get('CommentCount'),
elemdict.get('FavoriteCount'))
postsSQL.append(t)
votesSQL = []
for elem in votes:
elemdict = dict(elem)
t = (elemdict['Id'],
elemdict['PostId'],
elemdict['VoteTypeId'],
elemdict.get('UserId'),
elemdict['CreationDate'])
votesSQL.append(t)
posthistorySQL = []
for elem in posthistory:
elemdict = dict(elem)
t = (elemdict['Id'],
elemdict['PostHistoryTypeId'],
elemdict['PostId'],
elemdict['RevisionGUID'],
elemdict['CreationDate'],
elemdict.get('UserDisplayName'),
elemdict.get('UserId'),
elemdict.get('Comment'),
elemdict.get('Text'))
posthistorySQL.append(t)
# Here the actual insertion takes place.
# Users
c.executemany('''INSERT INTO `users`
(`Id`, `Reputation`, `CreationDate`, `DisplayName`, `LastAccessDate`,
`WebsiteUrl`, `Location`, `AboutMe`, `Views`, `UpVotes`, `DownVotes`,
`ProfileImageUrl`, `EmailHash`, `Age`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', usersSQL)
db.commit()
# Badges
c.executemany("""INSERT INTO badges (Id, Userid, Name, Date)
VALUES (%s, %s, %s, %s)""", badgesSQL)
db.commit()
# Comments
c.executemany('''INSERT INTO `comments`
(`Id`, `PostId`, `Score`, `Text`, `CreationDate`, `UserId`)
VALUES (%s, %s, %s, %s, %s, %s)''', commentsSQL)
db.commit()
# Posts
c.executemany('''INSERT INTO `posts`
(`Id`,
`PostTypeId`,
`AcceptedAnswerId`,
`ParentId`,
`CreationDate`,
`Score`,
`ViewCount`,
`Body`,
`OwnerUserId`,
`LastEditorUserId`,
`LastEditDate`,
`LastActivityDate`,
`AnswerCount`,
`CommentCount`,
`FavoriteCount`)
VALUES
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', postsSQL)
db.commit()
# Votes
c.executemany('''INSERT INTO `votes`
(`Id`, `PostId`, `VoteTypeId`, `UserId`, `CreationDate`) VALUES
(%s, %s, %s, %s, %s)''', votesSQL)
db.commit()
# Post History
c.executemany('''INSERT INTO `posthistory`
(`Id`, `PostHistoryTypeId`, `PostId`, `RevisionGUID`, `CreationDate`,
`UserDisplayName`, `UserId`, `Comment`, `Text`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''', posthistorySQL)
# Sincr we are using InnoDB we need to do this.
db.commit()
# Close up the DB
#main('meta.sharepoint.stackexchange.com', 'metasharepointstackexchangecom')
main('superuser.com', 'superuser')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment