Skip to content

Instantly share code, notes, and snippets.

@danizen
Last active February 12, 2018 16:39
Show Gist options
  • Save danizen/e9243c699b206a1c41107f7c8392b223 to your computer and use it in GitHub Desktop.
Save danizen/e9243c699b206a1c41107f7c8392b223 to your computer and use it in GitHub Desktop.
aggregation of crawled references using mongodb
{
"referrerLinkText": null,
"isRootParentReference": false,
"sitemapLastMod": null,
"parentRootReference": null,
"referrerLinkTag": null,
"sitemapChangeFreq": null,
"crawlState": "REJECTED",
"isValid": false,
"contentType": "text/html",
"stage": "PROCESSED",
"sitemapPriority": null,
"referrerReference": null,
"referrerLinkTitle": null,
"crawlDate": "2018-02-06T22:21:19.869000",
"reference": "https://stemcells.nih.gov",
"depth": 0,
"contentChecksum": null,
"originalReference": null,
"_id": "5a7a289639ec2e4736d0854b",
"metaChecksum": null
}
/* Pipe the data back and build a stacked-area chart like this */
var chart = c3.generate({
  size: {
    width: 960,
  },
  data: {
    // first column named 'x' supplies the timeseries axis values
    x: 'x',
    columns: [
      ['x', '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08', '2018-02-09', '2018-02-10'],
      ['New', 300, 350, 300, 0, 0, 120],
      ['Redirect', 130, 100, 140, 200, 150, 50],
      ['Other', 12, 16, 20, 12, 10, 7],
    ],
    types: {
      New: 'area',
      Redirect: 'area',
      Other: 'area'
      // 'line', 'spline', 'step', 'area', 'area-step' are also available to stack
    },
    // grouping the three series stacks them on top of each other
    groups: [['New', 'Redirect', 'Other']],
    colors: {
      New: '#18993c',
      Redirect: '#f4b642',
      Other: '#f45342'  // FIX: was 'f45342' — missing '#' made the color invalid
    }
  },
  axis: {
    x: {
      type: 'timeseries',
      tick: {
        format: '%Y-%m-%d'
      }
    },
    y: {
      label: 'Pages'
    }
  }
});
# assume refs is a reference to a Mongo collection
cursor = refs.aggregate([
    # The "$match" pipeline stage makes sure we only count references
    # that have been processed.  They should all have a crawlState, but
    # just in case, we make sure of it.
    # FIX: the original had an extra closing brace here, which made the
    # stage a syntax error.
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True}
    }},
    # "$group" counts up the references in each crawl state
    {'$group': {
        '_id': {'outcome': '$crawlState'},
        'count': {'$sum': 1}
    }}
])
# get the results below
list(cursor)
# In my case, I get the following:
[{'_id': {'outcome': 'REJECTED'}, 'count': 264},
{'_id': {'outcome': 'BAD_STATUS'}, 'count': 89},
{'_id': {'outcome': 'REDIRECT'}, 'count': 3511},
{'_id': {'outcome': 'NOT_FOUND'}, 'count': 19},
{'_id': {'outcome': 'ERROR'}, 'count': 50},
{'_id': {'outcome': 'NEW'}, 'count': 11634}]
# again, we assume refs is the collection
from datetime import datetime, timedelta, time

now = datetime.now()
# midnight at the start of today
startofday = datetime.combine(now, time.min)

# These will be the buckets for our aggregation: everything before five
# days ago falls into a catch-all first bucket (lower bound datetime.min),
# then one bucket per day, with "now" as the final upper bound.
# Mongo wants the array of boundaries sorted ascending, which is why we
# use range(5, 0, -1).
boundaries = (
    [datetime.min]
    + [startofday - timedelta(days=i) for i in range(5, 0, -1)]
    + [now]
)
# anyway, we get something like this:
# [datetime.datetime(1, 1, 1, 0, 0),
#  datetime.datetime(2018, 2, 4, 0, 0),
#  datetime.datetime(2018, 2, 5, 0, 0),
#  datetime.datetime(2018, 2, 6, 0, 0),
#  datetime.datetime(2018, 2, 7, 0, 0),
#  datetime.datetime(2018, 2, 8, 0, 0),
#  datetime.datetime(2018, 2, 9, 15, 15, 3, 165743)]
# (FIX: the sample output above is now commented out — as a bare
# expression it would raise AttributeError, since `datetime` here is the
# class, not the module.)

# Now, we construct a $bucket stage based on the boundaries.
# FIX: this was misspelled "bucekt"; the $facet pipeline below uses "bucket".
bucket = {'$bucket': {
    'groupBy': '$crawlDate',
    'boundaries': boundaries,
    'output': {
        'count': {'$sum': 1}
    }
}}
# The two most popular crawlStates are 'NEW' and 'REDIRECT', so we lump
# the rest together under 'other'.
cursor = refs.aggregate([
    # We again select references that have been processed, but this time
    # we also check that the crawl dates are recent enough to fall into
    # our day buckets.
    # FIX: the original had extra/misplaced closing braces here that made
    # the $match stage a syntax error.
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True},
        'crawlDate': {'$gte': startofday - timedelta(days=5)}
    }},
    # now we facet them: each facet filters down to one outcome and then
    # buckets the matches by day using the shared $bucket stage above.
    {'$facet': {
        # FIX: 'new' originally opened with '{' but closed with ']';
        # each facet value must be a pipeline (a list), like the others.
        'new': [
            {'$match': {'crawlState': 'NEW'}}, bucket
        ],
        'redirect': [
            {'$match': {'crawlState': 'REDIRECT'}}, bucket
        ],
        'other': [
            {'$match': {'crawlState': {'$nin': ['NEW', 'REDIRECT']}}}, bucket
        ]
    }}
])
# For this calculation, I get reasonable results as I restarted the crawl from scratch recently
# and the shutdown was real enough that it is not running today
[{'new': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 837},
{'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 7098},
{'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 3699}],
'other': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 33},
{'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 261},
{'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 99}],
'redirect': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 69},
{'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 1829},
{'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 1613}]}]
# How can I do this without knowing the crawlStates in advance?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment