Last active
December 17, 2015 03:19
-
-
Save abelsonlive/5542464 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# helper functions for filtering data
def query_jsonpath(line, path):
    """Return the first jsonpath match for ``path`` in ``line``, or None.

    Args:
        line: The object handed to ``jsonpath`` as the data to search.
        path: The jsonpath expression evaluated against ``line``.

    Returns:
        The first element of the ``jsonpath`` result when that result is
        truthy; ``None`` otherwise, so a field without a match is filled
        in as null.
    """
    # Kept as a function-local import, matching the original block.
    from jsonpath import jsonpath

    matches = jsonpath(line, path)
    # A falsy result means there was no match for this field.
    if not matches:
        return None
    # jsonpath returns its matches in a list; only the first one is used.
    return matches[0]
def extract_fields(line, config):
    """Query one line of JSON with jsonpath for each configured field.

    Args:
        line: The record passed on to ``query_jsonpath``.
        config: Mapping whose ``'fields'`` entry is an iterable of jsonpath
            expressions.

    Returns:
        A dict keyed by each path in ``config['fields']``, holding whatever
        ``query_jsonpath`` returned for that path.

    Raises:
        KeyError: If ``config`` has no ``'fields'`` entry.
    """
    return {path: query_jsonpath(line, path) for path in config['fields']}
class MRSnacker(MRJob):
    """Map-only mrjob job that filters log lines by user and URL.

    Input and output both use ``JSONValueProtocol``. For every line whose
    user passes ``match_user`` and whose URL passes ``match_url``, the
    mapper yields the user together with the fields extracted from the line.

    NOTE(review): ``mr()``, ``configure_options()`` and ``add_file_option()``
    are the legacy mrjob API; newer releases appear to use ``MRStep``,
    ``configure_args()`` and ``add_file_arg()`` instead -- confirm against
    the mrjob version this job is pinned to before upgrading.
    """

    INPUT_PROTOCOL = JSONValueProtocol
    OUTPUT_PROTOCOL = JSONValueProtocol

    def configure_options(self):
        """Add command-line options specific to this script."""
        super(MRSnacker, self).configure_options()
        self.add_file_option(
            '--config', dest='config_file', default='mr.config', type='str',
            help=('the yaml-formatted configuration file, defaults to mr.config'))

    def settings(self):
        """Load the config file and the lookup data derived from it.

        Registered as ``mapper_init`` in ``steps``, and sets the
        ``self.config``, ``self.urls``, ``self.uids`` and ``self.patterns``
        attributes that ``filter_logs`` reads.
        """
        self.config = load_config(self.options.config_file)
        self.urls = load_urls(self.config)
        self.uids = load_uids(self.config)
        self.patterns = load_patterns(self.config)

    def filter_logs(self, _, line):
        """Yield ``(user, data)`` for lines matching both user and URL filters.

        Args:
            _: The mapper key; unused.
            line: The input record handed to ``get_user``, ``get_url`` and
                ``extract_fields``.

        Yields:
            A ``(user, data)`` pair where ``data`` holds the extracted
            fields plus ``'user'`` and ``'story'`` entries.
        """
        user = get_user(line)
        url = get_url(line)
        # ``and`` short-circuits, so match_url is only evaluated for lines
        # whose user already matched -- same order as the nested checks.
        if match_user(user, self.uids) and match_url(url, self.urls, self.patterns):
            data = extract_fields(line, self.config)
            data['user'] = user
            data['story'] = url
            yield user, data

    def steps(self):
        """Return the single mapper-only step that makes up this job."""
        return [self.mr(mapper_init=self.settings,
                        mapper=self.filter_logs)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment