@matt-bernhardt · Last active January 4, 2016
This is a quick and crude summarizing script that assembles a summary collection on top of a raw requests collection - a loose part of the OA Stats project.
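For context, here is a hedged sketch of the document shapes involved, inferred from the queries in the script below (field values are illustrative only, borrowed from the sample documents in the code comments):

# Hypothetical shapes, inferred from the queries in this script.
# Each raw request document records one download event:
#
#   { "handle"  : "http://example.com/handle/53",
#     "author"  : "http://example.com/author/1098",
#     "dlc"     : "Aerospace Biomedical and Life Support Engineering",
#     "country" : "020",
#     "time"    : "2014-01-27T12:34:56" }   # first 10 characters are used as the date
#
# The script rolls these up into one summary document per entity
# (overall, DLC, author, or paper), for example:
#
#   { "_id"       : "http://example.com/author/1098",
#     "type"      : "author",
#     "size"      : 3,     # number of distinct papers
#     "downloads" : 42,
#     "countries" : [ { "country" : "020", "downloads" : 2 } ],
#     "dates"     : [ { "date" : "2014-01-27", "downloads" : 5 } ],
#     "parents"   : [ { "parent" : "Aerospace Biomedical and Life Support Engineering" } ] }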
# update summary data based on requests collection
# NOTE: written against pymongo 2.x, where collection.aggregate() returns a
# dict with a "result" key and collection.update() takes a positional upsert
# flag; pymongo 3.x changed both of these APIs.

# Imports
from pymongo import MongoClient

# Globals (assigned in main)
log = ''
reqs = ''
sums = ''

def overallData():
    # This builds the data dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Data\n\n')
    print('Overall Data')
    # Group by handle to count downloads per paper, then collapse everything
    # into a single bucket (_id: None) to get the total paper count ("size")
    # and total download count.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$handle" , "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : None , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } }
    ] )
    for item in rs["result"]:
        sums.update(
            {"_id" : "Overall"},
            {"$set" : {"type" : "overall", "size" : item["size"], "downloads" : item["downloads"] } },
            True
        )

def overallMap():
    # This builds the map dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Map\n\n')
    print('Overall Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$country", "downloads" : { "$sum" : 1 } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    tempData = []
    for item in rs["result"]:
        tempItem = {}
        tempItem["country"] = item["_id"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
    sums.update(
        {"_id" : "Overall"},
        {"$set" : {"countries" : tempData}},
        True
    )

def overallTime():
    # This builds the timeline dataset for the overall collection
    global log
    global reqs
    global sums
    log.write('\n### Overall Timeline\n\n')
    print('Overall Timeline')
    # The first 10 characters of the time string are used as the date
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    tempData = []
    for item in rs["result"]:
        tempItem = {}
        tempItem["date"] = item["_id"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
    sums.update(
        {"_id" : "Overall"},
        {"$set" : {"dates" : tempData}},
        True
    )

###############################################################################
###############################################################################
###############################################################################
def dlcData():
    # This builds the data dataset for each DLC in the collection
    global log
    global reqs
    global sums
    log.write('\n### Data\n')
    print('Summarizing Data')
    # Group by (dlc, handle) to count downloads per paper within each DLC,
    # then regroup by dlc to get each DLC's paper count ("size") and total
    # downloads.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "dlc" : "$dlc" , "handle" : "$handle" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id.dlc" , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        dlc = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "dlc"
        tempQuery['size'] = item["size"]
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : dlc},
            {"$set" : tempQuery},
            True
        )

def dlcTime():
    # This builds a list of download totals, for each day, for each DLC.
    # That list, grouped by DLC, is then stored in the summaries collection.
    global log
    global reqs
    global sums
    log.write('\n### Timeline\n\n')
    print('Summarizing Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per DLC.
    # Get list of DLCs
    dlcs = reqs.distinct('dlc')
    for thisDLC in dlcs:
        print(str(thisDLC))
        log.write(' '+str(thisDLC)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "dlc" : thisDLC } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisDLC},
            {"$set" : {"dates" : tempData}},
            True
        )

def dlcMap():
    # This builds a list of download totals, for each country, for each DLC.
    # That list, grouped by DLC, is then stored in the summaries collection.
    global log
    global reqs
    global sums
    log.write('\n### Map\n')
    print('Summarizing Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "dlc" : "$dlc" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'dlc': u'Aerospace Biomedical and Life Support Engineering', u'country': u'020'}}
    # Sorting on the compound _id keeps each DLC's rows contiguous, which the
    # lastDLC change-detection below relies on.
    tempData = []
    lastDLC = ''
    for item in rs["result"]:
        dlc = item["_id"]["dlc"]
        # Before we do anything else, check to see if the last row was the
        # last of its DLC. If so, store that DLC's completed list.
        if dlc != lastDLC and lastDLC != '':
            log.write('\nSummary for _'+str(lastDLC)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastDLC},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this dlc value, for comparison to the next record, to figure
        # out when one DLC's list is complete
        lastDLC = dlc
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastDLC)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastDLC},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

###############################################################################
###############################################################################
###############################################################################
###############################################################################
def authorData():
    # This builds the data dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Data\n\n')
    print('Author Data')
    # Group by (author, handle) to count downloads per paper for each author,
    # then regroup by author to get paper count ("size") and total downloads.
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "author" : "$author" , "handle" : "$handle" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id.author" , "size" : { "$sum" : 1 } , "downloads" : { "$sum" : "$downloads" } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        author = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "author"
        tempQuery['size'] = item["size"]
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : author},
            {"$set" : tempQuery},
            True
        )

def authorTime():
    # This builds the timeline dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Timeline\n\n')
    print('Author Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per author.
    # Get list of Authors
    authors = reqs.distinct('author')
    for thisAuthor in authors:
        print(str(thisAuthor))
        log.write(' '+str(thisAuthor)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "author" : thisAuthor } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisAuthor},
            {"$set" : {"dates" : tempData}},
            True
        )

def authorMap():
    # This builds the map dataset for each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Map\n\n')
    print('Author Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "author" : "$author" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'author': u'http://example.com/author/1098', u'country': u'020'}}
    # Sorting on the compound _id keeps each author's rows contiguous, which
    # the lastAuthor change-detection below relies on.
    tempData = []
    lastAuthor = ''
    for item in rs["result"]:
        author = item["_id"]["author"]
        # Before we do anything else, check to see if the last row was the
        # last of its author. If so, store that author's completed list.
        if author != lastAuthor and lastAuthor != '':
            log.write('\nSummary for _'+str(lastAuthor)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastAuthor},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this author value, for comparison to the next record, to
        # figure out when one author's list is complete
        lastAuthor = author
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastAuthor)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastAuthor},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

def authorParents():
    # This builds the parents dataset (the DLCs an author appears in) for
    # each author in the collection
    global log
    global reqs
    global sums
    log.write('\n### Author Parents\n\n')
    print('Author Parents')
    # Get list of Authors
    authors = reqs.distinct('author')
    for thisAuthor in authors:
        print(str(thisAuthor))
        log.write(' '+str(thisAuthor)+'\n')
        # Equivalent shell query:
        # db.requests.distinct('dlc',{'author':'http://example.com/author/1171'})
        # per https://jira.mongodb.org/browse/PYTHON-331
        rs = reqs.find( { "author" : thisAuthor } ).distinct("dlc")
        # ... and for each distinct DLC, build tempData ...
        tempData = []
        for item in rs:
            tempItem = {}
            tempItem["parent"] = item
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisAuthor},
            {"$set" : {"parents" : tempData}},
            True
        )

###############################################################################
###############################################################################
###############################################################################
def paperData():
    # This builds the data dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Data\n\n')
    print('Paper Data')
    # Count downloads per paper handle
    rs = reqs.aggregate( [
        { "$group" : { "_id" : "$handle" , "downloads" : { "$sum" : 1 } } } ,
        { "$sort" : { "_id" : 1 } }
    ] )
    for item in rs["result"]:
        log.write(' '+str(item)+'\n')
        paper = item["_id"]
        tempQuery = {}
        tempQuery['type'] = "paper"
        tempQuery['downloads'] = item["downloads"]
        sums.update(
            {"_id" : paper},
            {"$set" : tempQuery},
            True
        )

def paperTime():
    # This builds the timeline dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Timeline\n\n')
    print('Paper Timeline')
    # Because the list of dates is prohibitively large, we can't get a master
    # recordset like we do for the map - instead we run one query per paper.
    # Get list of Papers
    papers = reqs.distinct('handle')
    for thisPaper in papers:
        print(str(thisPaper))
        log.write(' '+str(thisPaper)+'\n')
        # ... get the list of daily download totals ...
        rs = reqs.aggregate( [
            { "$match" : { "handle" : thisPaper } },
            { "$group" : { "_id" : { "$substr" : [ "$time", 0, 10 ] } , "downloads" : { "$sum" : 1 } } },
            { "$sort" : { "_id" : 1 } }
        ] )
        # ... and for each daily total, build tempData ...
        tempData = []
        for item in rs["result"]:
            tempItem = {}
            tempItem["date"] = item["_id"]
            tempItem["downloads"] = item["downloads"]
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisPaper},
            {"$set" : {"dates" : tempData}},
            True
        )

def paperMap():
    # This builds the map dataset for each paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Map\n\n')
    print('Paper Map')
    rs = reqs.aggregate( [
        { "$group" : { "_id" : { "handle" : "$handle" , "country" : "$country" }, "downloads" : { "$sum" : 1 } } } ,
        { "$group" : { "_id" : "$_id" , "downloads" : { "$sum" : "$downloads" } } },
        { "$sort" : { "_id" : 1 } }
    ] )
    # Sample returned document:
    # {u'downloads': 2, u'_id': {u'handle': u'http://example.com/handle/53', u'country': u'020'}}
    # Sorting on the compound _id keeps each paper's rows contiguous, which
    # the lastPaper change-detection below relies on.
    tempData = []
    lastPaper = ''
    for item in rs["result"]:
        paper = item["_id"]["handle"]
        # Before we do anything else, check to see if the last row was the
        # last of its paper. If so, store that paper's completed list.
        if paper != lastPaper and lastPaper != '':
            log.write('\nSummary for _'+str(lastPaper)+'_ \n')
            log.write(' '+str(tempData)+'\n')
            # Store tempData inside the countries field of the summary document
            sums.update(
                {"_id" : lastPaper},
                {"$set" : {"countries" : tempData}},
                True
            )
            log.write('\n')
            tempData = []
        # Append this record to the tempData list; items in the list have the
        # shape: { "country" : "020", "downloads" : 2 }
        tempItem = {}
        tempItem["country"] = item["_id"]["country"]
        tempItem["downloads"] = item["downloads"]
        tempData.append(tempItem)
        # Store this paper value, for comparison to the next record, to
        # figure out when one paper's list is complete
        lastPaper = paper
    # Don't forget the last group
    log.write('\nSummary for _'+str(lastPaper)+'_ \n')
    log.write(' '+str(tempData)+'\n')
    sums.update(
        {"_id" : lastPaper},
        {"$set" : {"countries" : tempData}},
        True
    )
    log.write('\n')

def paperParents():
    # This builds the parents dataset (the authors of a paper) for each
    # paper in the collection
    global log
    global reqs
    global sums
    log.write('\n### Paper Parents\n\n')
    print('Paper Parents')
    # Get list of Papers
    papers = reqs.distinct('handle')
    for thisPaper in papers:
        print(str(thisPaper))
        log.write(' '+str(thisPaper)+'\n')
        # Equivalent shell query:
        # db.requests.distinct('author',{'handle':'http://example.com/handle/1171'})
        # per https://jira.mongodb.org/browse/PYTHON-331
        rs = reqs.find( { "handle" : thisPaper } ).distinct("author")
        # ... and for each distinct author, build tempData ...
        tempData = []
        for item in rs:
            tempItem = {}
            tempItem["parent"] = item
            tempData.append(tempItem)
        # ... and store that tempData in the master summary
        sums.update(
            {"_id" : thisPaper},
            {"$set" : {"parents" : tempData}},
            True
        )

###############################################################################
###############################################################################
###############################################################################
def main():
    global log
    global reqs
    global sums
    log = open('summarize.log','w')
    log.write('Summarizing oastats.requests into oastats.summaries\n')
    # Connect
    local = MongoClient('mongodb://localhost')
    log.write('Connections made\n')
    reqs = local.oastats.requests
    sums = local.oastats.summaries
    log.write('reqs and sums defined\n')
    overallData()
    overallMap()
    overallTime()
    dlcData()
    dlcMap()
    dlcTime()
    authorData()
    authorMap()
    authorTime()
    authorParents()
    paperData()
    paperMap()
    paperTime()
    # Build the parents lists for papers, matching authorParents above
    paperParents()
    log.close()

if __name__ == "__main__":
    main()
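
Once the script has run, the summaries collection can be read back directly. A minimal usage sketch, assuming the same localhost database that main() connects to (the query values are illustrative, not part of the original script):

# Minimal usage sketch, assuming the oastats database populated above
from pymongo import MongoClient

client = MongoClient('mongodb://localhost')
sums = client.oastats.summaries

# Overall totals
print(sums.find_one({"_id": "Overall"}))

# Five most-downloaded papers
for doc in sums.find({"type": "paper"}).sort("downloads", -1).limit(5):
    print(doc["_id"], doc["downloads"])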