@matt-bernhardt · Last active January 4, 2016
This is a quick and crude summarizing script that assembles a summary collection on top of a raw requests collection - a loose part of the OA Stats project.
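For context, here is a hedged sketch of the document shapes involved, inferred from the queries in the script below (field values are illustrative only, borrowed from the sample documents in the code comments):

# Hypothetical shapes, inferred from the queries in this script.
# Each raw request document records one download event:
#
#   { "handle"  : "http://example.com/handle/53",
#     "author"  : "http://example.com/author/1098",
#     "dlc"     : "Aerospace Biomedical and Life Support Engineering",
#     "country" : "020",
#     "time"    : "2014-01-27T12:34:56" }   # first 10 characters are used as the date
#
# The script rolls these up into one summary document per entity
# (overall, DLC, author, or paper), for example:
#
#   { "_id"       : "http://example.com/author/1098",
#     "type"      : "author",
#     "size"      : 3,     # number of distinct papers
#     "downloads" : 42,
#     "countries" : [ { "country" : "020", "downloads" : 2 } ],
#     "dates"     : [ { "date" : "2014-01-27", "downloads" : 5 } ],
#     "parents"   : [ { "parent" : "Aerospace Biomedical and Life Support Engineering" } ] }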
# update summary data based on requests collection
# NOTE: written against pymongo 2.x, where collection.aggregate() returns a
# dict with a "result" key and collection.update() takes a positional upsert
# flag; pymongo 3.x changed both of these APIs.

# Imports
from pymongo import MongoClient

# Globals (assigned in main)
log = ''
reqs = ''
sums = ''

def overallData():
    # This builds the data dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Data\n\n')
    print('Overall Data')
    # Group by handle to count downloads per paper, then collapse everything
    # into a single bucket (_id: None) to get the total paper count ("size")
    # and total download count.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$handle" , "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : None , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } }
    ] )
    for item in rs["result"]:
        sums.update(
            {"_id" : "Overall"},
            {"$set" : {"type" : "overall", "size" : item["size"], "downloads" : item["downloads"] } },
            True
        )

def overallMap():
    # This builds the map dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Map\n\n')
    print('Overall Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$country", "downloads" : { "$sum" : 1 } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    tempData = []
    for item in rs["result"]:
        tempItem = {}
        tempItem["country"] = item["_id"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
    sums.update(
        {"_id" : "Overall"},
        {"$set" : {"countries" : tempData}},
        True
    )

def overallTime():
    # This builds the timeline dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Timeline\n\n')
    print('Overall Timeline')
    # The first 10 characters of the time string are used as the date
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    tempData = []
    for item in rs["result"]:
        tempItem = {}
        tempItem["date"] = item["_id"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
    sums.update(
        {"_id" : "Overall"},
        {"$set" : {"dates" : tempData}},
        True
    )

###############################################################################
###############################################################################
###############################################################################
def dlcData():
    # This builds the data dataset for each DLC in the collection
    global log
    global reqs
    global sums
    log.write('\n### Data\n')
    print('Summarizing Data')
    # Group by (dlc, handle) to count downloads per paper within each DLC,
    # then regroup by dlc to get each DLC's paper count ("size") and total
    # downloads.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "dlc" : "$dlc" , "handle" : "$handle" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id.dlc" , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        dlc = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "dlc"
        tempQuery['size'] = item["size"]
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : dlc},
            {"$set" : tempQuery},
            True
        )

def dlcTime():
    # This builds a list of download totals, for each day, for each DLC.
    # That list, grouped by DLC, is then stored in the summaries collection.
    global log
    global reqs
    global sums
    log.write('\n### Timeline\n\n')
    print('Summarizing Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per DLC.
    # Get list of DLCs
    dlcs = reqs.distinct('dlc')
    for thisDLC in dlcs:
        print(str(thisDLC))
        log.write(' '+str(thisDLC)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "dlc" : thisDLC } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisDLC},
            {"$set" : {"dates" : tempData}},
            True
        )

def dlcMap():
    # This builds a list of download totals, for each country, for each DLC.
    # That list, grouped by DLC, is then stored in the summaries collection.
    global log
    global reqs
    global sums
    log.write('\n### Map\n')
    print('Summarizing Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "dlc" : "$dlc" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'dlc': u'Aerospace Biomedical and Life Support Engineering', u'country': u'020'}}
    # Sorting on the compound _id keeps each DLC's rows contiguous, which the
    # lastDLC change-detection below relies on.
    tempData = []
    lastDLC = ''
    for item in rs["result"]:
        dlc = item["_id"]["dlc"]
        # Before we do anything else, check to see if the last row was the
        # last of its DLC. If so, store that DLC's completed list.
        if dlc != lastDLC and lastDLC != '':
            log.write('\nSummary for _'+str(lastDLC)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastDLC},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this dlc value, for comparison to the next record, to figure
        # out when one DLC's list is complete
        lastDLC = dlc
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastDLC)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastDLC},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

###############################################################################
###############################################################################
###############################################################################
###############################################################################
def authorData():
    # This builds the data dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Data\n\n')
    print('Author Data')
    # Group by (author, handle) to count downloads per paper for each author,
    # then regroup by author to get paper count ("size") and total downloads.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "author" : "$author" , "handle" : "$handle" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id.author" , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        author = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "author"
        tempQuery['size'] = item["size"]
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : author},
            {"$set" : tempQuery},
            True
        )

def authorTime():
    # This builds the timeline dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Timeline\n\n')
    print('Author Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per author.
    # Get list of Authors
    authors = reqs.distinct('author')
    for thisAuthor in authors:
        print(str(thisAuthor))
        log.write(' '+str(thisAuthor)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "author" : thisAuthor } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisAuthor},
            {"$set" : {"dates" : tempData}},
            True
        )

def authorMap():
    # This builds the map dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Map\n\n')
    print('Author Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "author" : "$author" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'author': u'http://example.com/author/1098', u'country': u'020'}}
    # Sorting on the compound _id keeps each author's rows contiguous, which
    # the lastAuthor change-detection below relies on.
    tempData = []
    lastAuthor = ''
    for item in rs["result"]:
        author = item["_id"]["author"]
        # Before we do anything else, check to see if the last row was the
        # last of its author. If so, store that author's completed list.
        if author != lastAuthor and lastAuthor != '':
            log.write('\nSummary for _'+str(lastAuthor)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastAuthor},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this author value, for comparison to the next record, to
        # figure out when one author's list is complete
        lastAuthor = author
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastAuthor)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastAuthor},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

def authorParents():
    # This builds the parents dataset (the DLCs an author appears in) for
    # each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Parents\n\n')
    print('Author Parents')
    # Get list of Authors
    authors = reqs.distinct('author')
    for thisAuthor in authors:
        print(str(thisAuthor))
        log.write(' '+str(thisAuthor)+'\n')
        # Equivalent shell query:
        # db.requests.distinct('dlc',{'author':'http://example.com/author/1171'})
        # per https://jira.mongodb.org/browse/PYTHON-331
        rs = reqs.find( { "author" : thisAuthor } ).distinct("dlc")
        # ... and for each distinct DLC, build tempData ...
        tempData = []
        for item in rs:
            tempItem = {}
            tempItem["parent"] = item
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisAuthor},
            {"$set" : {"parents" : tempData}},
            True
        )

###############################################################################
###############################################################################
###############################################################################
def paperData():
    # This builds the data dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Data\n\n')
    print('Paper Data')
    # Count downloads per paper handle
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$handle" , "downloads" : { "$sum" : 1 } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        paper = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "paper"
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : paper},
            {"$set" : tempQuery},
            True
        )

def paperTime():
    # This builds the timeline dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Timeline\n\n')
    print('Paper Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per paper.
    # Get list of Papers
    papers = reqs.distinct('handle')
    for thisPaper in papers:
        print(str(thisPaper))
        log.write(' '+str(thisPaper)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "handle" : thisPaper } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisPaper},
            {"$set" : {"dates" : tempData}},
            True
        )

def paperMap():
    # This builds the map dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Map\n\n')
    print('Paper Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "handle" : "$handle" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'handle': u'http://example.com/handle/53', u'country': u'020'}}
    # Sorting on the compound _id keeps each paper's rows contiguous, which
    # the lastPaper change-detection below relies on.
    tempData = []
    lastPaper = ''
    for item in rs["result"]:
        paper = item["_id"]["handle"]
        # Before we do anything else, check to see if the last row was the
        # last of its paper. If so, store that paper's completed list.
        if paper != lastPaper and lastPaper != '':
            log.write('\nSummary for _'+str(lastPaper)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastPaper},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this paper value, for comparison to the next record, to
        # figure out when one paper's list is complete
        lastPaper = paper
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastPaper)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastPaper},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

def paperParents():
    # This builds the parents dataset (the authors of a paper) for each
    # paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Parents\n\n')
    print('Paper Parents')
    # Get list of Papers
    papers = reqs.distinct('handle')
    for thisPaper in papers:
        print(str(thisPaper))
        log.write(' '+str(thisPaper)+'\n')
        # Equivalent shell query:
        # db.requests.distinct('author',{'handle':'http://example.com/handle/1171'})
        # per https://jira.mongodb.org/browse/PYTHON-331
        rs = reqs.find( { "handle" : thisPaper } ).distinct("author")
        # ... and for each distinct author, build tempData ...
        tempData = []
        for item in rs:
            tempItem = {}
            tempItem["parent"] = item
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisPaper},
            {"$set" : {"parents" : tempData}},
            True
        )

###############################################################################
###############################################################################
###############################################################################
def main():
    global log
    global reqs
    global sums
    log = open('summarize.log','w')
    log.write('Summarizing oastats.requests into oastats.summaries\n')
    # Connect
    local = MongoClient('mongodb://localhost')
    log.write('Connections made\n')
    reqs = local.oastats.requests
    sums = local.oastats.summaries
    log.write('reqs and sums defined\n')
    overallData()
    overallMap()
    overallTime()
    dlcData()
    dlcMap()
    dlcTime()
    authorData()
    authorMap()
    authorTime()
    authorParents()
    paperData()
    paperMap()
    paperTime()
    # Build the parents lists for papers, matching authorParents above
    paperParents()
    log.close()

if __name__ == "__main__":
    main()
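
Once the script has run, the summaries collection can be read back directly. A minimal usage sketch, assuming the same localhost database that main() connects to (the query values are illustrative, not part of the original script):

# Minimal usage sketch, assuming the oastats database populated above
from pymongo import MongoClient

client = MongoClient('mongodb://localhost')
sums = client.oastats.summaries

# Overall totals
print(sums.find_one({"_id": "Overall"}))

# Five most-downloaded papers
for doc in sums.find({"type": "paper"}).sort("downloads", -1).limit(5):
    print(doc["_id"], doc["downloads"])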