Last active
March 29, 2022 23:26
-
-
Save allisontharp/38466fc801d7a9dca96de3b3e5f3213f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon May 09 19:58:56 2016 | |
@author: Allison | |
In SQLite, to select for a certain month, do this: | |
select * from collection where strftime('%m', date) = strftime('%m', julianday('2016-03-01')); | |
select games.name, collection.owned from games join collection on games.bggid = collection.bggid order by owned desc limit 10; | |
0_3: added support for: | |
- average weight | |
- number wanting | |
- number trading | |
- number wishing | |
- average rating | |
0_4: added support for: | |
- ability to pickup where it left off it it errors out | |
""" | |
# TODO print line number that error occurred on | |
from datetime import datetime, date | |
from xml.dom import minidom | |
import sqlite3, unicodedata, urllib2, time, sys | |
db_loc = 'boardgamecollection_xml.sqlite' # name of database | |
startTime = datetime.now() # Used to calculate total time later | |
def try_toplevel(tag, dom): | |
try: | |
#dom = minidom.parse(urllib2.urlopen(url)) | |
out = dom.getElementsByTagName(tag) | |
except urllib2.HTTPError as e: | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
time.sleep(10) | |
dom = minidom.parse(urllib2.urlopen(url)) | |
out = dom.getElementsByTagName(tag) | |
return out | |
def strip_accents(s): | |
return ''.join(c for c in unicodedata.normalize('NFD', s) | |
if unicodedata.category(c) != 'Mn') | |
def owned_ranked(bggid): | |
url = 'https://www.boardgamegeek.com/xmlapi2/thing?stats=1&type=boardgame,boardgameexpansion&id=' | |
url += str(bggid) | |
dom = minidom.parse(urllib2.urlopen(url)) | |
# number of users owning | |
out = try_toplevel('owned', dom) | |
try: | |
owned = int(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
owned = -1 | |
#err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
#f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# board game rank | |
rank = dom.getElementsByTagName('rank') | |
try: | |
rank = int(rank[0].attributes['value'].value) # change this to look at friendlyname="Board Game Rank" | |
except (ValueError, IndexError) as e: # TODO: I think IndexError happens when the BGGID is removed. Check this and do something about it | |
rank = 'NULL' | |
# board game weight | |
out = try_toplevel('averageweight', dom) | |
try: | |
weight = float(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
weight = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# Number of users trading | |
out = try_toplevel('trading', dom) | |
try: | |
trading = int(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
trading = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# Number of users wishing | |
out = try_toplevel('wishing', dom) | |
try: | |
wishing = int(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
wishing = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# Number of users wanting | |
out = try_toplevel('wanting', dom) | |
try: | |
wanting = int(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
wanting = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# Average Rating | |
out = try_toplevel('average', dom) | |
try: | |
rating = float(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
rating = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# BGG rating | |
out = try_toplevel('bayesaverage', dom) | |
try: | |
bggrating = float(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
bggrating = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
# Number of Ratings | |
out = try_toplevel('usersrated', dom) | |
try: | |
numrating = float(out[0].attributes['value'].value) | |
except (ValueError, IndexError) as e: | |
numrating = -1 | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
return [owned,rank,weight,trading,wishing,wanting,rating,bggrating,numrating, dom] | |
def add_groups(lst, table, tableid, junctiontable): | |
for name in lst: | |
name = translate(name) | |
#print name | |
query = "SELECT " + str(tableid) + " FROM " + str(table) + " WHERE name = '{nm}'".format(nm = name) | |
c.execute(query) | |
try: | |
groupid = int(c.fetchone()[0]) | |
except TypeError as e: | |
query = "INSERT INTO " + str(table) + " VALUES (NULL, '{nm}')".format(nm = name) | |
c.execute(query) | |
#print " Added " + str(name) + " to " + str(table) + " table." | |
query = "SELECT " + str(tableid) + " FROM " + str(table) + " WHERE name = '{nm}'".format(nm = name) | |
c.execute(query) | |
groupid = int(c.fetchone()[0]) | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
try: | |
query = "INSERT INTO " + str(junctiontable) + " VALUES ({bg}, {gi})".format(bg = bggid, gi = groupid) | |
c.execute(query) | |
#print " Added to " + str(junctiontable) + " table" | |
except sqlite3.IntegrityError as e: | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
pass | |
def translate(name): | |
try: | |
name = name.translate(None, "'&-:") | |
except TypeError as e: | |
name = unicode(name) | |
name = strip_accents(name) | |
name = name.encode("ascii", "ignore") | |
name = name.replace("'", "") | |
return name | |
url = 'https://www.boardgamegeek.com/xmlapi2/thing?stats=1&type=boardgame,boardgameexpansion&id=' | |
# Connect to SQLite Database | |
conn = sqlite3.connect(db_loc) | |
c = conn.cursor() | |
f = open('pythonout.txt', 'w') | |
f.write(str(startTime) + '\n') | |
# Check to see if games table exists | |
c.execute("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'games'") | |
if not c.fetchall(): # if the games table doesn't exist, create it | |
print "Creating Games Table" | |
c.execute("CREATE TABLE games(bggid INTEGER PRIMARY KEY, name VARCHAR(50) NOT NULL, year INTEGER, minplayers INTEGER, maxplayers INTEGER)") | |
# Check to see if collection table exists | |
c.execute("SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'collection'") | |
if not c.fetchall(): | |
print "Creating Collection Table" | |
c.execute("CREATE TABLE collection(bggid INTEGER, date datetime default current_date, owned INTEGER NOT NULL, rank INTEGER, \ | |
weight REAL, trading INT, wishing INT, wanting INT, rating REAL, bggrating REAL, numrating INT, PRIMARY KEY(bggid, date))") | |
# TODO add mechanic table check | |
# TODO add gamemech table check | |
# TODO add publisher table check | |
# TODO add gamepub table check | |
# # Get list of bggids | |
# c.execute("SELECT bggid FROM games") | |
# idlist = c.fetchall() | |
# idlist = [int(i[0]) for i in idlist] # convert list of tuples to list of INTs | |
print 'before' | |
if __name__ == "__main__": | |
if len(sys.argv) == 1: | |
c.execute("SELECT bggid FROM games") | |
idlist = c.fetchall() | |
idlist = [int(i[0]) for i in idlist] # convert list of tuples to list of INTs | |
elif str(sys.argv[1]) == 'restart': | |
c.execute("SELECT bggid FROM games") | |
idlist = c.fetchall() | |
idlist = [int(i[0]) for i in idlist] # convert list of tuples to list of INTs | |
query = "SELECT max(bggid) FROM collection where date like '{dt}%'".format(dt = date.today().strftime('%Y-%m')) | |
c.execute(query) | |
maxid = c.fetchone()[0] | |
idlist = idlist[idlist.index(maxid)+1:] | |
# print ("max") | |
# print maxid | |
# TODO add total number of plays | |
# for bggid in idlist[idlist.index(24200)+1:]: | |
for bggid in idlist: | |
time.sleep(0.25) | |
#time.sleep(0.5) | |
[owned,rank,weight,trading,wishing,wanting,rating,bggrating,numrating, dom] = owned_ranked(bggid) | |
query = "INSERT INTO collection(bggid, owned, rank, weight, trading, wishing, wanting, rating, bggrating, numrating) VALUES ({bi}, {ow}, {rk}, {wt}, {td}, {wish}, {want}, {rt}, {bggr}, {nr})"\ | |
.format(bi = bggid, ow = owned, rk = rank, wt = weight, td = trading, wish = wishing, want = wanting, rt = rating, bggr = bggrating, nr = numrating) | |
c.execute(query) | |
# # for june only: | |
# link = dom.getElementsByTagName('link') | |
# families = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamefamily"] | |
# add_groups(families, "families", "famid", "famgame") | |
today = date.today() | |
today = today.strftime('%Y-%m') | |
# query = "SELECT count(bggid)*1.0/84000 FROM collection WHERE bggid <= {id} and date > '2016-06-30'".format(id = bggid) | |
query = "SELECT count(bggid)*1.0/84000 FROM collection WHERE date like '{dt}%'".format(dt = date.today().strftime('%Y-%m')) | |
c.execute(query) | |
tot = c.fetchone()[0] | |
print str(bggid) + ": " + "%.2f" %(tot*100) + "%" | |
if bggid%100 == 0: | |
conn.commit() | |
check = 0 | |
while check < 100: | |
bggid += 1 | |
print bggid | |
url ='https://www.boardgamegeek.com/xmlapi2/thing?stats=1&type=boardgame,boardgameexpansion&id=' | |
url += str(bggid) | |
time.sleep(0.5) | |
dom = minidom.parse(urllib2.urlopen(url)) | |
item = dom.getElementsByTagName('item') | |
bgcheck = [items.attributes['id'].value for items in item if items.attributes['type'].value == "boardgame"] | |
name = dom.getElementsByTagName('name') | |
if name and bgcheck: # it is a boardgame or expansion | |
check = 0 | |
name = name[0].attributes['value'].value | |
name = translate(name) | |
#print name | |
[owned,rank,weight,trading,wishing,wanting,rating,bggrating, numrating, dom] = owned_ranked(bggid) | |
year = dom.getElementsByTagName('yearpublished') | |
try: | |
year = int(year[0].attributes['value'].value) | |
except ValueError as e: | |
# err = 'LINE: {} BGGID: '.format(sys.exec_info()[-1].tb_lineno) | |
# f.write(err + (str(bggid) + ' Error: ' + str(e) + '\n')) | |
f.write(str(bggid) + ' Error: ' + str(e) + '\n') | |
year = 'NULL' | |
link = dom.getElementsByTagName('link') | |
publishers = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamepublisher"] | |
mechanics = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamemechanic"] | |
artists = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgameartist"] | |
designers = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamedesigner"] | |
families = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamefamily"] | |
categories = [items.attributes['value'].value for items in link if items.attributes['type'].value == "boardgamecategory"] | |
minplayers = dom.getElementsByTagName('minplayers') | |
minplayers = minplayers[0].attributes['value'].value | |
if not minplayers: | |
minplayers = 'NULL' | |
maxplayers = dom.getElementsByTagName('maxplayers') | |
maxplayers = maxplayers[0].attributes['value'].value | |
if not maxplayers: | |
maxplayers = 'NULL' | |
add_groups(publishers, "publishers", "pubid", "gamepub") | |
add_groups(mechanics, "mechanics", "mechid", "gamemech") | |
add_groups(artists, "artists", "artid", "artgame") | |
add_groups(designers, "designers", "designid", "designgame") | |
add_groups(categories, "categories", "catid", "catgame") | |
add_groups(families, "families", "famid", "famgame") | |
query = "INSERT INTO games(bggid, name, year, minplayers, maxplayers) VALUES \ | |
({bi}, '{nm}', {yr}, {mn}, {mx})".format(bi = bggid, nm = name, yr = year, mn = minplayers, mx = maxplayers) | |
c.execute(query) | |
query = "INSERT INTO collection(bggid, owned, rank, weight, trading, wishing, wanting, rating, bggrating, numrating) VALUES ({bi}, {ow}, {rk}, {wt}, {td}, {wish}, {want}, {rt}, {bggr}, {nr})"\ | |
.format(bi = bggid, ow = owned, rk = rank, wt = weight, td = trading, wish = wishing, want = wanting, rt = rating, bggr = bggrating, nr = numrating) | |
c.execute(query) | |
today = date.today() | |
today = today.strftime('%Y-%m') | |
query = "SELECT count(bggid)*1.0/84000 FROM collection WHERE date(date) like '{td}%'".format(td = today) | |
c.execute(query) | |
tot = c.fetchone()[0] | |
print check | |
if bggid%100 == 0: | |
conn.commit() | |
else: | |
check += 1 | |
conn.commit() | |
conn.close() | |
f.close() | |
StopTime = datetime.now() # Final Time | |
print "Completed!" | |
print str(StopTime - StartTime) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment