Last active
January 4, 2016 17:39
-
-
Save matt-bernhardt/8655646 to your computer and use it in GitHub Desktop.
This is a quick and crude summarizing script that assembles a summary collection on top of a raw requests collection - a loose part of the OA Stats project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Update the summary data based on the requests collection.
# Imports | |
from itertools import groupby

from pymongo import MongoClient
# Globals | |
# Shared handles used by every summarizing function. main() rebinds them to
# the open log file and to the "requests" / "summaries" collections; the empty
# strings are only placeholders until then.
log = reqs = sums = ''
def overallData():
    """Upsert the collection-wide totals into the "Overall" summary document.

    Requests are first counted per ``handle``; those per-handle rows are then
    folded into a single row whose ``size`` is the number of distinct handles
    and whose ``downloads`` is the total number of requests.  The totals are
    written with ``$set`` (as an upsert) together with ``type: "overall"``.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Overall Data\n\n')
    print('Overall Data')

    rs = reqs.aggregate([
        {"$group": {"_id": "$handle", "downloads": {"$sum": 1}}},
        # A constant (null) key folds every per-handle row into one total.
        {"$group": {"_id": None, "size": {"$sum": 1}, "downloads": {"$sum": "$downloads"}}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    for row in rows:
        sums.update(
            {"_id": "Overall"},
            {"$set": {"type": "overall", "size": row["size"], "downloads": row["downloads"]}},
            True  # upsert
        )
def overallMap():
    """Upsert the per-country download counts into the "Overall" summary.

    Requests are counted per ``country`` and sorted by country value.  The
    resulting list of ``{"country": ..., "downloads": ...}`` items is stored
    under ``countries`` on the summaries document whose ``_id`` is
    ``"Overall"``.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Overall Map\n\n')
    print('Overall Map')

    rs = reqs.aggregate([
        {"$group": {"_id": "$country", "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    countries = [
        {"country": row["_id"], "downloads": row["downloads"]}
        for row in rows
    ]

    sums.update(
        {"_id": "Overall"},
        {"$set": {"countries": countries}},
        True  # upsert
    )
def overallTime():
    """Upsert the download timeline into the "Overall" summary.

    Requests are counted per the first ten characters of their ``time`` field
    (presumably the calendar date - confirm against the stored format) and
    sorted by that key.  The resulting list of
    ``{"date": ..., "downloads": ...}`` items is stored under ``dates`` on the
    summaries document whose ``_id`` is ``"Overall"``.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Overall Timeline\n\n')
    print('Overall Timeline')

    rs = reqs.aggregate([
        {"$group": {"_id": {"$substr": ["$time", 0, 10]}, "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    dates = [
        {"date": row["_id"], "downloads": row["downloads"]}
        for row in rows
    ]

    sums.update(
        {"_id": "Overall"},
        {"$set": {"dates": dates}},
        True  # upsert
    )
############################################################################### | |
############################################################################### | |
############################################################################### | |
def dlcData():
    """Upsert paper and download totals for every DLC.

    Requests are counted per (``dlc``, ``handle``) pair and then rolled up per
    DLC, so ``size`` is the number of distinct handles seen for that DLC and
    ``downloads`` is its total request count.  Each DLC's totals are written
    to the summaries document whose ``_id`` is the DLC value, together with
    ``type: "dlc"``.  Every aggregated row is also written to the log.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Data\n')
    print('Summarizing Data')

    rs = reqs.aggregate([
        {"$group": {"_id": {"dlc": "$dlc", "handle": "$handle"}, "downloads": {"$sum": 1}}},
        {"$group": {"_id": "$_id.dlc", "size": {"$sum": 1}, "downloads": {"$sum": "$downloads"}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    for row in rows:
        log.write(' ' + str(row) + '\n')
        totals = {
            "type": "dlc",
            "size": row["size"],
            "downloads": row["downloads"],
        }
        sums.update(
            {"_id": row["_id"]},
            {"$set": totals},
            True  # upsert
        )
def dlcTime():
    """Upsert a daily download timeline for every DLC.

    The list of dates is too large to fetch for all DLCs in one recordset, so
    one aggregate query is issued per distinct ``dlc`` value.  Each query
    counts that DLC's requests per the first ten characters of ``time``
    (presumably the calendar date - confirm against the stored format), and
    the sorted list of ``{"date": ..., "downloads": ...}`` items is stored
    under ``dates`` on the summaries document whose ``_id`` is the DLC value.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Timeline\n\n')
    print('Summarizing Timeline')

    for dlc in reqs.distinct('dlc'):
        print(str(dlc))
        log.write(' ' + str(dlc) + '\n')

        rs = reqs.aggregate([
            {"$match": {"dlc": dlc}},
            {"$group": {"_id": {"$substr": ["$time", 0, 10]}, "downloads": {"$sum": 1}}},
            {"$sort": {"_id": 1}},
        ])
        # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an
        # iterable cursor that cannot be subscripted.
        rows = rs["result"] if isinstance(rs, dict) else rs

        dates = [
            {"date": row["_id"], "downloads": row["downloads"]}
            for row in rows
        ]

        sums.update(
            {"_id": dlc},
            {"$set": {"dates": dates}},
            True  # upsert
        )
def dlcMap():
    """Upsert per-country download counts for every DLC.

    Requests are counted per (``dlc``, ``country``) pair and sorted by that
    pair, so all rows of one DLC arrive next to each other.  A returned row
    looks like::

        {'downloads': 2, '_id': {'dlc': 'Aerospace ...', 'country': '020'}}

    The rows of each DLC are collected into a list of
    ``{"country": ..., "downloads": ...}`` items, which is logged and stored
    under ``countries`` on the summaries document whose ``_id`` is the DLC
    value.  Nothing is written when the aggregate returns no rows.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Map\n')
    print('Summarizing Map')

    rs = reqs.aggregate([
        {"$group": {"_id": {"dlc": "$dlc", "country": "$country"}, "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    # groupby relies on the $sort above keeping each DLC's rows contiguous.
    # It needs no "previous key" sentinel, so an empty-string DLC is handled
    # like any other and an empty result writes nothing.
    for dlc, group in groupby(rows, key=lambda row: row["_id"]["dlc"]):
        countries = [
            {"country": row["_id"]["country"], "downloads": row["downloads"]}
            for row in group
        ]

        log.write('\nSummary for _' + str(dlc) + '_ \n')
        log.write(' ' + str(countries) + '\n')
        sums.update(
            {"_id": dlc},
            {"$set": {"countries": countries}},
            True  # upsert
        )
        log.write('\n')
############################################################################### | |
############################################################################### | |
############################################################################### | |
############################################################################### | |
def authorData():
    """Upsert paper and download totals for every author.

    Requests are counted per (``author``, ``handle``) pair and then rolled up
    per author, so ``size`` is the number of distinct handles seen for that
    author and ``downloads`` is the author's total request count.  Each
    author's totals are written to the summaries document whose ``_id`` is the
    author value, together with ``type: "author"``.  Every aggregated row is
    also written to the log.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Author Data\n\n')
    print('Author Data')

    rs = reqs.aggregate([
        {"$group": {"_id": {"author": "$author", "handle": "$handle"}, "downloads": {"$sum": 1}}},
        {"$group": {"_id": "$_id.author", "size": {"$sum": 1}, "downloads": {"$sum": "$downloads"}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    for row in rows:
        log.write(' ' + str(row) + '\n')
        totals = {
            "type": "author",
            "size": row["size"],
            "downloads": row["downloads"],
        }
        sums.update(
            {"_id": row["_id"]},
            {"$set": totals},
            True  # upsert
        )
def authorTime():
    """Upsert a daily download timeline for every author.

    The list of dates is too large to fetch for all authors in one recordset,
    so one aggregate query is issued per distinct ``author`` value.  Each
    query counts that author's requests per the first ten characters of
    ``time`` (presumably the calendar date - confirm against the stored
    format), and the sorted list of ``{"date": ..., "downloads": ...}`` items
    is stored under ``dates`` on the summaries document whose ``_id`` is the
    author value.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Author Timeline\n\n')
    print('Author Timeline')

    for author in reqs.distinct('author'):
        print(str(author))
        log.write(' ' + str(author) + '\n')

        rs = reqs.aggregate([
            {"$match": {"author": author}},
            {"$group": {"_id": {"$substr": ["$time", 0, 10]}, "downloads": {"$sum": 1}}},
            {"$sort": {"_id": 1}},
        ])
        # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an
        # iterable cursor that cannot be subscripted.
        rows = rs["result"] if isinstance(rs, dict) else rs

        dates = [
            {"date": row["_id"], "downloads": row["downloads"]}
            for row in rows
        ]

        sums.update(
            {"_id": author},
            {"$set": {"dates": dates}},
            True  # upsert
        )
def authorMap():
    """Upsert per-country download counts for every author.

    Requests are counted per (``author``, ``country``) pair and sorted by that
    pair, so all rows of one author arrive next to each other.  A returned row
    looks like::

        {'downloads': 2, '_id': {'author': 'http://example.com/author/1098', 'country': '020'}}

    The rows of each author are collected into a list of
    ``{"country": ..., "downloads": ...}`` items, which is logged and stored
    under ``countries`` on the summaries document whose ``_id`` is the author
    value.  Nothing is written when the aggregate returns no rows.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Author Map\n\n')
    print('Author Map')

    rs = reqs.aggregate([
        {"$group": {"_id": {"author": "$author", "country": "$country"}, "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    # groupby relies on the $sort above keeping each author's rows contiguous.
    # It needs no "previous key" sentinel, so an empty-string author is
    # handled like any other and an empty result writes nothing.
    for author, group in groupby(rows, key=lambda row: row["_id"]["author"]):
        countries = [
            {"country": row["_id"]["country"], "downloads": row["downloads"]}
            for row in group
        ]

        log.write('\nSummary for _' + str(author) + '_ \n')
        log.write(' ' + str(countries) + '\n')
        sums.update(
            {"_id": author},
            {"$set": {"countries": countries}},
            True  # upsert
        )
        log.write('\n')
def authorParents():
    """Record, for every author, the DLCs that author's requests belong to.

    For each distinct ``author`` value the distinct ``dlc`` values of that
    author's requests are looked up, wrapped as ``{"parent": <dlc>}`` items
    and stored under ``parents`` on the summaries document whose ``_id`` is
    the author value.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Author Parents\n\n')
    print('Author Parents')

    for author in reqs.distinct('author'):
        print(str(author))
        log.write(' ' + str(author) + '\n')

        # Shell equivalent:
        #   db.requests.distinct('dlc', {'author': 'http://example.com/author/1171'})
        # The filter goes through find() per
        # https://jira.mongodb.org/browse/PYTHON-331
        dlcs = reqs.find({"author": author}).distinct("dlc")

        parents = [{"parent": dlc} for dlc in dlcs]

        sums.update(
            {"_id": author},
            {"$set": {"parents": parents}},
            True
        )
############################################################################### | |
############################################################################### | |
############################################################################### | |
def paperData():
    """Upsert the download total for every paper.

    Requests are counted per ``handle`` and sorted by handle.  Each count is
    written as ``downloads`` to the summaries document whose ``_id`` is the
    handle value, together with ``type: "paper"``.  Every aggregated row is
    also written to the log.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Paper Data\n\n')
    print('Paper Data')

    rs = reqs.aggregate([
        {"$group": {"_id": "$handle", "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    for row in rows:
        log.write(' ' + str(row) + '\n')
        totals = {
            "type": "paper",
            "downloads": row["downloads"],
        }
        sums.update(
            {"_id": row["_id"]},
            {"$set": totals},
            True  # upsert
        )
def paperTime():
    """Upsert a daily download timeline for every paper.

    The list of dates is too large to fetch for all papers in one recordset,
    so one aggregate query is issued per distinct ``handle`` value.  Each
    query counts that paper's requests per the first ten characters of
    ``time`` (presumably the calendar date - confirm against the stored
    format), and the sorted list of ``{"date": ..., "downloads": ...}`` items
    is stored under ``dates`` on the summaries document whose ``_id`` is the
    handle value.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Paper Timeline\n\n')
    print('Paper Timeline')

    for paper in reqs.distinct('handle'):
        print(str(paper))
        log.write(' ' + str(paper) + '\n')

        rs = reqs.aggregate([
            {"$match": {"handle": paper}},
            {"$group": {"_id": {"$substr": ["$time", 0, 10]}, "downloads": {"$sum": 1}}},
            {"$sort": {"_id": 1}},
        ])
        # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an
        # iterable cursor that cannot be subscripted.
        rows = rs["result"] if isinstance(rs, dict) else rs

        dates = [
            {"date": row["_id"], "downloads": row["downloads"]}
            for row in rows
        ]

        sums.update(
            {"_id": paper},
            {"$set": {"dates": dates}},
            True  # upsert
        )
def paperMap():
    """Upsert per-country download counts for every paper.

    Requests are counted per (``handle``, ``country``) pair and sorted by that
    pair, so all rows of one paper arrive next to each other.  A returned row
    looks like::

        {'downloads': 2, '_id': {'handle': 'http://example.com/handle/53', 'country': '020'}}

    The rows of each paper are collected into a list of
    ``{"country": ..., "downloads": ...}`` items, which is logged and stored
    under ``countries`` on the summaries document whose ``_id`` is the handle
    value.  Nothing is written when the aggregate returns no rows.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Paper Map\n\n')
    print('Paper Map')

    rs = reqs.aggregate([
        {"$group": {"_id": {"handle": "$handle", "country": "$country"}, "downloads": {"$sum": 1}}},
        {"$sort": {"_id": 1}},
    ])
    # PyMongo 2.x returns {"result": [...]}; PyMongo 3+ returns an iterable
    # cursor that cannot be subscripted.
    rows = rs["result"] if isinstance(rs, dict) else rs

    # groupby relies on the $sort above keeping each paper's rows contiguous.
    # It needs no "previous key" sentinel, so an empty-string handle is
    # handled like any other and an empty result writes nothing.
    for paper, group in groupby(rows, key=lambda row: row["_id"]["handle"]):
        countries = [
            {"country": row["_id"]["country"], "downloads": row["downloads"]}
            for row in group
        ]

        log.write('\nSummary for _' + str(paper) + '_ \n')
        log.write(' ' + str(countries) + '\n')
        sums.update(
            {"_id": paper},
            {"$set": {"countries": countries}},
            True  # upsert
        )
        log.write('\n')
def paperParents():
    """Record, for every paper, the authors that paper's requests belong to.

    For each distinct ``handle`` value the distinct ``author`` values of that
    paper's requests are looked up, wrapped as ``{"parent": <author>}`` items
    and stored under ``parents`` on the summaries document whose ``_id`` is
    the handle value.

    Uses the module-level ``log``, ``reqs`` and ``sums`` objects.
    """
    log.write('\n### Paper Parents\n\n')
    print('Paper Parents')

    for paper in reqs.distinct('handle'):
        print(str(paper))
        log.write(' ' + str(paper) + '\n')

        # Shell equivalent:
        #   db.requests.distinct('author', {'handle': 'http://example.com/handle/1171'})
        # The filter goes through find() per
        # https://jira.mongodb.org/browse/PYTHON-331
        authors = reqs.find({"handle": paper}).distinct("author")

        parents = [{"parent": author} for author in authors]

        sums.update(
            {"_id": paper},
            {"$set": {"parents": parents}},
            True
        )
############################################################################### | |
############################################################################### | |
############################################################################### | |
def main():
    """Rebuild the oastats.summaries collection from oastats.requests.

    Opens ``summarize.log`` for writing, connects to the MongoDB server at
    ``mongodb://localhost``, binds the module globals ``log``, ``reqs`` and
    ``sums``, and then runs the overall, DLC, author and paper summarizers in
    turn.  The log file and the client connection are closed when the run
    ends, including when a summarizer raises.
    """
    global log
    global reqs
    global sums

    # "log" is declared global, so the with-target rebinds the module global
    # that every summarizer writes to.
    with open('summarize.log', 'w') as log:
        log.write('Summarizing oastats.requests into oastats.summaries\n')

        # Connect
        local = MongoClient('mongodb://localhost')
        try:
            log.write('Connections made\n')

            reqs = local.oastats.requests
            sums = local.oastats.summaries
            log.write('reqs and sums defined\n')

            overallData()
            overallMap()
            overallTime()

            dlcData()
            dlcMap()
            dlcTime()

            authorData()
            authorMap()
            authorTime()
            authorParents()

            # NOTE(review): paperParents() is defined in this file but is not
            # invoked here - confirm whether that omission is intentional.
            paperData()
            paperMap()
            paperTime()
        finally:
            local.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment