Skip to content

Instantly share code, notes, and snippets.

@meyarivan
meyarivan / weblog_job.py
Created September 23, 2014 16:38
Simple streaming job to process raw weblogs
#!/usr/bin/env python
import sys, os
import codecs
import datetime
import mrjob.job
import mrjob.protocol
import simplejson as json
import math
@meyarivan
meyarivan / get_rawdata_sizes.py
Last active August 29, 2015 14:04
Compute sizes of raw_data:* from a silly sample of live data
#!/usr/bin/env python
import os, sys
import math
import happybase
import time
import simplejson as json
import struct
from datetime import datetime, timedelta
#!/usr/bin/python
import os
import sys
from datetime import date, timedelta, datetime
import simplejson as json
def parse(filex):
    """Read *filex* (a file-like object) and decode its full contents as JSON.

    NOTE(review): the gist preview appears truncated here — the decoded
    result is bound to ``data`` but no return or further use is visible.
    Confirm against the original gist before relying on this function.
    """
    # Second positional argument is simplejson's ``encoding`` parameter
    # (this script imports ``simplejson as json``); the stdlib ``json.loads``
    # on Python 3 would reject a positional 'utf8' here.
    data = json.loads(filex.read(), 'utf8')
-- Pig job: pull the subset of FHR report records whose keys appear in a
-- fetch-id list, by joining the full sequence-file dump against that list.
-- The line below is the example shell invocation kept from the gist.
-- NOTE(review): '#' is not a Pig comment delimiter — this looks like a shell
-- command pasted above the script; confirm it is stripped before running.
# pig -param orig=/user/bcolloran/data/fhrFullDump_2014-01-31/ -param fetchids=/tmp/sample_list.txt -param jointype=merge -param output=DEST_PATH fetch_reports.pig
-- Piggybank supplies SequenceFileLoader for reading Hadoop sequence files.
register '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar';
-- Full dump: (key, value) pairs stored as a sequence file.
fulldump = LOAD '$orig' USING org.apache.pig.piggybank.storage.SequenceFileLoader AS (key:chararray, value:chararray);
-- Fetch list: first column is the report key; second column is ignored.
ids_to_fetch_raw = LOAD '$fetchids' USING PigStorage() AS (key:chararray, ign:chararray);
-- Sort the ids so a 'merge' join (the $jointype shown in the example
-- invocation) has the sorted right-hand input it requires.
ids_to_fetch = ORDER ids_to_fetch_raw BY key;
-- Keep only the dump records whose key appears in the fetch list.
common = JOIN fulldump by key, ids_to_fetch by key USING '$jointype';
-- NOTE(review): the visible chunk ends here; a STORE of `common` into
-- '$output' presumably follows but is not shown.