Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Last active December 17, 2015 03:19
Show Gist options
  • Save abelsonlive/5542464 to your computer and use it in GitHub Desktop.
Save abelsonlive/5542464 to your computer and use it in GitHub Desktop.
# helper functions for filtering data #
def query_jsonpath(line, path):
    """Return the first value in ``line`` matched by the JSONPath ``path``.

    Args:
        line: The decoded JSON record to search (passed straight to
            ``jsonpath``).
        path: The JSONPath expression to evaluate against ``line``.

    Returns:
        The first match, or ``None`` when ``jsonpath`` returns a falsy
        result (no match), so that a missing field is emitted as ``null``.
    """
    # Kept as a function-local import, as in the original, so the module
    # can be loaded even where the jsonpath package is not installed.
    from jsonpath import jsonpath

    # jsonpath hands back its matches in a list; only the first is used.
    matches = jsonpath(line, path)
    return matches[0] if matches else None
def extract_fields(line, config):
    """Query one JSON record for every JSONPath listed in the config.

    Args:
        line: The decoded JSON record to query.
        config: Mapping whose ``'fields'`` entry is an iterable of JSONPath
            expressions. A ``None`` or empty entry means "no fields".

    Returns:
        A dict mapping each JSONPath expression to the value returned by
        ``query_jsonpath`` for it (``None`` where nothing matched).

    Raises:
        KeyError: If ``config`` has no ``'fields'`` key.
    """
    # A `fields` entry left blank in the config file presumably loads as
    # None; iterating that would raise TypeError, so treat it as empty.
    json_paths = config['fields'] or ()
    return {path: query_jsonpath(line, path) for path in json_paths}
class MRSnacker(MRJob):
    """Map-only job that filters JSON log lines by user and URL.

    Each input line is kept only if its user matches the configured uids
    and its URL matches the configured urls/patterns. For kept lines, the
    configured JSONPath fields are extracted and emitted together with the
    user and URL.
    """

    # Both input and output use the same value-only JSON protocol.
    INPUT_PROTOCOL = JSONValueProtocol
    OUTPUT_PROTOCOL = JSONValueProtocol

    def configure_options(self):
        """Add command-line options specific to this script."""
        super(MRSnacker, self).configure_options()
        self.add_file_option(
            '--config', dest='config_file', default='mr.config', type='str',
            help=('the yaml-formatted configuration file, defaults to mr.config'))

    def settings(self):
        """Load the config file and the lookup data derived from it.

        Registered as ``mapper_init`` in ``steps``, so the attributes set
        here are available to ``filter_logs``.
        """
        self.config = load_config(self.options.config_file)
        self.urls = load_urls(self.config)
        self.uids = load_uids(self.config)
        self.patterns = load_patterns(self.config)

    def filter_logs(self, _, line):
        """Mapper: yield ``(user, data)`` for lines passing both filters.

        Args:
            _: The input key (unused).
            line: The decoded JSON log record.

        Yields:
            ``(user, data)`` where ``data`` holds the extracted config
            fields plus ``'user'`` and ``'story'`` (the URL). Nothing is
            yielded when either the user or the URL fails to match.
        """
        # Both values are extracted up front, exactly as before, so any
        # error raised by the helpers surfaces for every line.
        user = get_user(line)
        url = get_url(line)

        # Guard clauses: the URL is only checked once the user matched.
        if not match_user(user, self.uids):
            return
        if not match_url(url, self.urls, self.patterns):
            return

        data = extract_fields(line, self.config)
        data['user'] = user
        data['story'] = url
        yield user, data

    def steps(self):
        """Define the single map-only step, initialised by ``settings``."""
        return [
            self.mr(mapper_init=self.settings, mapper=self.filter_logs),
        ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment