rHermes/PyOverflow.py

## PyOverflow.py
from lxml import etree
import pymysql

def fast_iter(context, func):
    for event, elem in context:
        func(elem)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context


# Since most of the functions are all alike, this is a nice way to generate the functions.
def process_sqldump(arr):
    return lambda elem: arr.append(elem.items())

db = pymysql.connect(host="localhost", user="dev", passwd="xyz", charset="utf8")
c = db.cursor()

# The main def.
def main(folder, dbname):
    # The items.
    badges = []
    comments = []
    posthistory = []
    posts = []
    users = []
    votes = []


    # Having the infiles paired with their arrays make for easy looping later.
    infiles = (('{}/Badges.xml'.format(folder), badges),
               ('{}/Comments.xml'.format(folder), comments),
               ('{}/PostHistory.xml'.format(folder), posthistory),
               ('{}/Posts.xml'.format(folder), posts),
               ('{}/Users.xml'.format(folder), users),
               ('{}/Votes.xml'.format(folder), votes))

    tables = ('badges',
              'comments',
              'posthistory',
              'posts',
              'users',
              'votes')

    create_tables = '''
    CREATE TABLE IF NOT EXISTS `badges` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `UserId` INT NOT NULL,
      `Name` VARCHAR(50) NOT NULL,
      `Date` DATETIME NOT NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;

    CREATE TABLE IF NOT EXISTS `users` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `Reputation` INT NOT NULL DEFAULT 1,
      `CreationDate` DATETIME NOT NULL,
      `DisplayName` VARCHAR(45) NOT NULL,
      `LastAccessDate` DATETIME NOT NULL,
      `WebsiteUrl` VARCHAR(70) NULL,
      `Location` VARCHAR(45) NULL,
      `AboutMe` MEDIUMTEXT NULL,
      `Views` INT NULL DEFAULT 0,
      `UpVotes` INT NULL DEFAULT 0,
      `DownVotes` INT NULL DEFAULT 0,
      `ProfileImageUrl` VARCHAR(100) NULL,
      `EmailHash` VARCHAR(45) NULL,
      `Age` TINYINT UNSIGNED NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;

    CREATE TABLE IF NOT EXISTS `comments` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `PostId` INT NOT NULL,
      `Score` INT NULL,
      `Text` MEDIUMTEXT NOT NULL,
      `CreationDate` DATETIME NOT NULL,
      `UserId` INT NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;

    CREATE TABLE IF NOT EXISTS `posts` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `PostTypeId` TINYINT NOT NULL,
      `AcceptedAnswerId` INT NULL,
      `ParentId` INT NULL,
      `CreationDate` DATETIME NOT NULL,
      `Score` INT NOT NULL,
      `ViewCount` INT NULL,
      `Body` LONGTEXT NOT NULL,
      `OwnerUserId` INT NULL,
      `LastEditorUserId` INT NULL,
      `LastEditDate` DATETIME NULL,
      `LastActivityDate` DATETIME NULL,
      `AnswerCount` INT NULL,
      `CommentCount` INT NULL,
      `FavoriteCount` INT NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;

    CREATE TABLE IF NOT EXISTS `votes` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `PostId` INT NOT NULL,
      `VoteTypeId` INT NOT NULL,
      `UserId` INT NULL,
      `CreationDate` DATETIME NOT NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;

    CREATE TABLE IF NOT EXISTS `posthistory` (
      `Id` INT NOT NULL AUTO_INCREMENT,
      `PostHistoryTypeId` INT NOT NULL,
      `PostId` INT NOT NULL,
      `RevisionGUID` VARCHAR(36) NULL,
      `CreationDate` DATETIME NULL,
      `UserDisplayName` VARCHAR(45) NULL,
      `UserId` INT NULL,
      `Comment` MEDIUMTEXT NULL,
      `Text` MEDIUMTEXT NULL,
      PRIMARY KEY (`Id`))
    ENGINE = InnoDB;
    '''

    for infile, arr in infiles:
        context = etree.iterparse(infile, tag='row')
        fast_iter(context, process_sqldump(arr))

    # Creat schema
    c.execute('DROP SCHEMA IF EXISTS {}'.format(dbname))
    c.execute('CREATE SCHEMA {} CHARACTER SET utf8'.format(dbname))
    c.execute('USE {}'.format(dbname))

    # We don't need the existing tables.
    for table in tables:
        c.execute('DROP TABLE IF EXISTS {}'.format(table))

    # We need the tables
    c.execute(create_tables)


    # This section is going to be all about formatting the data for
    # clean sql insertion.
    badgesSQL = []
    for elem in badges:
        elemdict = dict(elem)
        t = (elemdict['Id'], elemdict['UserId'], elemdict['Name'], elemdict['Date'])
        badgesSQL.append(t)

    usersSQL = []
    for elem in users:
        # Since not all rows contain all the attributes, using a dict is a very
        # good solution, as we get the dict.get() method.
        elemdict = dict(elem)
        t = (elemdict['Id'],
            elemdict['Reputation'],
            elemdict['CreationDate'],
            elemdict['DisplayName'],
            elemdict['LastAccessDate'],
            elemdict.get('WebsiteUrl'),
            elemdict.get('Location'),
            elemdict.get('AboutMe'),
            elemdict['Views'],
            elemdict['UpVotes'],
            elemdict['DownVotes'],
            elemdict.get('ProfileImageUrl'),
            elemdict.get('EmailHash'),
            elemdict.get('Age'))
        usersSQL.append(t)

    commentsSQL = []
    for elem in comments:
        elemdict = dict(elem)
        t = (elemdict['Id'],
            elemdict['PostId'],
            elemdict.get('Score', 0),
            elemdict['Text'],
            elemdict['CreationDate'],
            elemdict.get('UserId'))
        commentsSQL.append(t)

    postsSQL = []
    for elem in posts:
        elemdict = dict(elem)
        t = (elemdict['Id'],
            elemdict['PostTypeId'],
            elemdict.get('AcceptedAnswerId'),
            elemdict.get('ParentId'),
            elemdict['CreationDate'],
            elemdict.get('Score'),
            elemdict.get('ViewCount'),
            elemdict['Body'],
            elemdict.get('OwnerUserId'),
            elemdict.get('LastEditorUserId'),
            elemdict.get('LastEditDate'),
            elemdict.get('LastActivityDate'),
            elemdict.get('AnswerCount'),
            elemdict.get('CommentCount'),
            elemdict.get('FavoriteCount'))
        postsSQL.append(t)

    votesSQL = []
    for elem in votes:
        elemdict = dict(elem)
        t = (elemdict['Id'],
            elemdict['PostId'],
            elemdict['VoteTypeId'],
            elemdict.get('UserId'),
            elemdict['CreationDate'])
        votesSQL.append(t)

    posthistorySQL = []
    for elem in posthistory:
        elemdict = dict(elem)
        t = (elemdict['Id'],
            elemdict['PostHistoryTypeId'],
            elemdict['PostId'],
            elemdict['RevisionGUID'],
            elemdict['CreationDate'],
            elemdict.get('UserDisplayName'),
            elemdict.get('UserId'),
            elemdict.get('Comment'),
            elemdict.get('Text'))
        posthistorySQL.append(t)


    # Here the actual insertion takes place.

    # Users
    c.executemany('''INSERT INTO `users`
    (`Id`, `Reputation`, `CreationDate`, `DisplayName`, `LastAccessDate`,
     `WebsiteUrl`, `Location`, `AboutMe`, `Views`, `UpVotes`, `DownVotes`,
     `ProfileImageUrl`, `EmailHash`, `Age`)
      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', usersSQL)

    db.commit()

    # Badges
    c.executemany("""INSERT INTO badges (Id, Userid, Name, Date)
        VALUES (%s, %s, %s, %s)""", badgesSQL)

    db.commit()

    # Comments
    c.executemany('''INSERT INTO `comments`
    (`Id`, `PostId`, `Score`, `Text`, `CreationDate`, `UserId`)
    VALUES (%s, %s, %s, %s, %s, %s)''', commentsSQL)

    db.commit()

    # Posts
    c.executemany('''INSERT INTO `posts`
    (`Id`,
    `PostTypeId`,
    `AcceptedAnswerId`,
    `ParentId`,
    `CreationDate`,
    `Score`,
    `ViewCount`,
    `Body`,
    `OwnerUserId`,
    `LastEditorUserId`,
    `LastEditDate`,
    `LastActivityDate`,
    `AnswerCount`,
    `CommentCount`,
    `FavoriteCount`)
    VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', postsSQL)

    db.commit()

    # Votes
    c.executemany('''INSERT INTO `votes`
    (`Id`, `PostId`, `VoteTypeId`, `UserId`, `CreationDate`) VALUES
    (%s, %s, %s, %s, %s)''', votesSQL)

    db.commit()

    # Post History
    c.executemany('''INSERT INTO `posthistory`
    (`Id`, `PostHistoryTypeId`, `PostId`, `RevisionGUID`, `CreationDate`,
    `UserDisplayName`, `UserId`, `Comment`, `Text`)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''', posthistorySQL)

    # Sincr we are using InnoDB we need to do this.
    db.commit()

    # Close up the DB

#main('meta.sharepoint.stackexchange.com', 'metasharepointstackexchangecom')
main('superuser.com', 'superuser')
	from lxml import etree
	import pymysql

	def fast_iter(context, func):
	for event, elem in context:
	func(elem)
	elem.clear()
	while elem.getprevious() is not None:
	del elem.getparent()[0]
	del context


	# Since most of the functions are all alike, this is a nice way to generate the functions.
	def process_sqldump(arr):
	return lambda elem: arr.append(elem.items())

	db = pymysql.connect(host="localhost", user="dev", passwd="xyz", charset="utf8")
	c = db.cursor()

	# The main def.
	def main(folder, dbname):
	# The items.
	badges = []
	comments = []
	posthistory = []
	posts = []
	users = []
	votes = []


	# Having the infiles paired with their arrays make for easy looping later.
	infiles = (('{}/Badges.xml'.format(folder), badges),
	('{}/Comments.xml'.format(folder), comments),
	('{}/PostHistory.xml'.format(folder), posthistory),
	('{}/Posts.xml'.format(folder), posts),
	('{}/Users.xml'.format(folder), users),
	('{}/Votes.xml'.format(folder), votes))

	tables = ('badges',
	'comments',
	'posthistory',
	'posts',
	'users',
	'votes')

	create_tables = '''
	CREATE TABLE IF NOT EXISTS `badges` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`UserId` INT NOT NULL,
	`Name` VARCHAR(50) NOT NULL,
	`Date` DATETIME NOT NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;

	CREATE TABLE IF NOT EXISTS `users` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`Reputation` INT NOT NULL DEFAULT 1,
	`CreationDate` DATETIME NOT NULL,
	`DisplayName` VARCHAR(45) NOT NULL,
	`LastAccessDate` DATETIME NOT NULL,
	`WebsiteUrl` VARCHAR(70) NULL,
	`Location` VARCHAR(45) NULL,
	`AboutMe` MEDIUMTEXT NULL,
	`Views` INT NULL DEFAULT 0,
	`UpVotes` INT NULL DEFAULT 0,
	`DownVotes` INT NULL DEFAULT 0,
	`ProfileImageUrl` VARCHAR(100) NULL,
	`EmailHash` VARCHAR(45) NULL,
	`Age` TINYINT UNSIGNED NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;

	CREATE TABLE IF NOT EXISTS `comments` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`PostId` INT NOT NULL,
	`Score` INT NULL,
	`Text` MEDIUMTEXT NOT NULL,
	`CreationDate` DATETIME NOT NULL,
	`UserId` INT NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;

	CREATE TABLE IF NOT EXISTS `posts` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`PostTypeId` TINYINT NOT NULL,
	`AcceptedAnswerId` INT NULL,
	`ParentId` INT NULL,
	`CreationDate` DATETIME NOT NULL,
	`Score` INT NOT NULL,
	`ViewCount` INT NULL,
	`Body` LONGTEXT NOT NULL,
	`OwnerUserId` INT NULL,
	`LastEditorUserId` INT NULL,
	`LastEditDate` DATETIME NULL,
	`LastActivityDate` DATETIME NULL,
	`AnswerCount` INT NULL,
	`CommentCount` INT NULL,
	`FavoriteCount` INT NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;

	CREATE TABLE IF NOT EXISTS `votes` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`PostId` INT NOT NULL,
	`VoteTypeId` INT NOT NULL,
	`UserId` INT NULL,
	`CreationDate` DATETIME NOT NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;

	CREATE TABLE IF NOT EXISTS `posthistory` (
	`Id` INT NOT NULL AUTO_INCREMENT,
	`PostHistoryTypeId` INT NOT NULL,
	`PostId` INT NOT NULL,
	`RevisionGUID` VARCHAR(36) NULL,
	`CreationDate` DATETIME NULL,
	`UserDisplayName` VARCHAR(45) NULL,
	`UserId` INT NULL,
	`Comment` MEDIUMTEXT NULL,
	`Text` MEDIUMTEXT NULL,
	PRIMARY KEY (`Id`))
	ENGINE = InnoDB;
	'''

	for infile, arr in infiles:
	context = etree.iterparse(infile, tag='row')
	fast_iter(context, process_sqldump(arr))

	# Creat schema
	c.execute('DROP SCHEMA IF EXISTS {}'.format(dbname))
	c.execute('CREATE SCHEMA {} CHARACTER SET utf8'.format(dbname))
	c.execute('USE {}'.format(dbname))

	# We don't need the existing tables.
	for table in tables:
	c.execute('DROP TABLE IF EXISTS {}'.format(table))

	# We need the tables
	c.execute(create_tables)


	# This section is going to be all about formatting the data for
	# clean sql insertion.
	badgesSQL = []
	for elem in badges:
	elemdict = dict(elem)
	t = (elemdict['Id'], elemdict['UserId'], elemdict['Name'], elemdict['Date'])
	badgesSQL.append(t)

	usersSQL = []
	for elem in users:
	# Since not all rows contain all the attributes, using a dict is a very
	# good solution, as we get the dict.get() method.
	elemdict = dict(elem)
	t = (elemdict['Id'],
	elemdict['Reputation'],
	elemdict['CreationDate'],
	elemdict['DisplayName'],
	elemdict['LastAccessDate'],
	elemdict.get('WebsiteUrl'),
	elemdict.get('Location'),
	elemdict.get('AboutMe'),
	elemdict['Views'],
	elemdict['UpVotes'],
	elemdict['DownVotes'],
	elemdict.get('ProfileImageUrl'),
	elemdict.get('EmailHash'),
	elemdict.get('Age'))
	usersSQL.append(t)

	commentsSQL = []
	for elem in comments:
	elemdict = dict(elem)
	t = (elemdict['Id'],
	elemdict['PostId'],
	elemdict.get('Score', 0),
	elemdict['Text'],
	elemdict['CreationDate'],
	elemdict.get('UserId'))
	commentsSQL.append(t)

	postsSQL = []
	for elem in posts:
	elemdict = dict(elem)
	t = (elemdict['Id'],
	elemdict['PostTypeId'],
	elemdict.get('AcceptedAnswerId'),
	elemdict.get('ParentId'),
	elemdict['CreationDate'],
	elemdict.get('Score'),
	elemdict.get('ViewCount'),
	elemdict['Body'],
	elemdict.get('OwnerUserId'),
	elemdict.get('LastEditorUserId'),
	elemdict.get('LastEditDate'),
	elemdict.get('LastActivityDate'),
	elemdict.get('AnswerCount'),
	elemdict.get('CommentCount'),
	elemdict.get('FavoriteCount'))
	postsSQL.append(t)

	votesSQL = []
	for elem in votes:
	elemdict = dict(elem)
	t = (elemdict['Id'],
	elemdict['PostId'],
	elemdict['VoteTypeId'],
	elemdict.get('UserId'),
	elemdict['CreationDate'])
	votesSQL.append(t)

	posthistorySQL = []
	for elem in posthistory:
	elemdict = dict(elem)
	t = (elemdict['Id'],
	elemdict['PostHistoryTypeId'],
	elemdict['PostId'],
	elemdict['RevisionGUID'],
	elemdict['CreationDate'],
	elemdict.get('UserDisplayName'),
	elemdict.get('UserId'),
	elemdict.get('Comment'),
	elemdict.get('Text'))
	posthistorySQL.append(t)



	# Here the actual insertion takes place.

	# Users
	c.executemany('''INSERT INTO `users`
	(`Id`, `Reputation`, `CreationDate`, `DisplayName`, `LastAccessDate`,
	`WebsiteUrl`, `Location`, `AboutMe`, `Views`, `UpVotes`, `DownVotes`,
	`ProfileImageUrl`, `EmailHash`, `Age`)
	VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', usersSQL)

	db.commit()

	# Badges
	c.executemany("""INSERT INTO badges (Id, Userid, Name, Date)
	VALUES (%s, %s, %s, %s)""", badgesSQL)

	db.commit()

	# Comments
	c.executemany('''INSERT INTO `comments`
	(`Id`, `PostId`, `Score`, `Text`, `CreationDate`, `UserId`)
	VALUES (%s, %s, %s, %s, %s, %s)''', commentsSQL)

	db.commit()

	# Posts
	c.executemany('''INSERT INTO `posts`
	(`Id`,
	`PostTypeId`,
	`AcceptedAnswerId`,
	`ParentId`,
	`CreationDate`,
	`Score`,
	`ViewCount`,
	`Body`,
	`OwnerUserId`,
	`LastEditorUserId`,
	`LastEditDate`,
	`LastActivityDate`,
	`AnswerCount`,
	`CommentCount`,
	`FavoriteCount`)
	VALUES
	(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', postsSQL)

	db.commit()

	# Votes
	c.executemany('''INSERT INTO `votes`
	(`Id`, `PostId`, `VoteTypeId`, `UserId`, `CreationDate`) VALUES
	(%s, %s, %s, %s, %s)''', votesSQL)

	db.commit()

	# Post History
	c.executemany('''INSERT INTO `posthistory`
	(`Id`, `PostHistoryTypeId`, `PostId`, `RevisionGUID`, `CreationDate`,
	`UserDisplayName`, `UserId`, `Comment`, `Text`)
	VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)''', posthistorySQL)

	# Sincr we are using InnoDB we need to do this.
	db.commit()

	# Close up the DB

	#main('meta.sharepoint.stackexchange.com', 'metasharepointstackexchangecom')
	main('superuser.com', 'superuser')