Skip to content

Instantly share code, notes, and snippets.

# pig -param orig=/user/bcolloran/data/fhrFullDump_2014-01-31/ -param fetchids=/tmp/sample_list.txt -param jointype=merge -param output=DEST_PATH fetch_reports.pig
-- Fetch a subset of FHR (Firefox Health Report) records from a full dump:
-- joins the key/value dump against a list of record ids to extract.
-- NOTE(review): no STORE statement is visible in this capture — the script
-- appears truncated; the '$output' parameter above is never used here.
register '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar';
-- Full dump is a SequenceFile of (record id, JSON payload) pairs.
fulldump = LOAD '$orig' USING org.apache.pig.piggybank.storage.SequenceFileLoader AS (key:chararray, value:chararray);
-- Ids to extract, one per line; second column is ignored.
ids_to_fetch_raw = LOAD '$fetchids' USING PigStorage() AS (key:chararray, ign:chararray);
-- Sorting by key — presumably required when '$jointype' is 'merge', which
-- expects both join inputs sorted on the join key. TODO confirm the dump
-- side is already key-sorted.
ids_to_fetch = ORDER ids_to_fetch_raw BY key;
common = JOIN fulldump by key, ids_to_fetch by key USING '$jointype';
#!/usr/bin/python
import os
import sys
from datetime import date, timedelta, datetime
import simplejson as json
# Parse a JSON document from an open file-like object.
# NOTE(review): indentation was lost in this capture and the function body is
# cut off by the page chrome below — restore both before running.
def parse(filex):
# NOTE(review): passing 'utf8' as a second positional argument is the legacy
# Python 2 simplejson `encoding` parameter; Python 3's json.loads rejects it.
# Verify against the interpreter/simplejson version this gist targeted.
data = json.loads(filex.read(), 'utf8')
@meyarivan
meyarivan / get_rawdata_sizes.py
Last active August 29, 2015 14:04
Compute sizes of raw_data:* from a silly sample of live data
#!/usr/bin/env python
import os, sys
import math
import happybase
import time
import simplejson as json
import struct
from datetime import datetime, timedelta
@meyarivan
meyarivan / weblog_job.py
Created September 23, 2014 16:38
Simple streaming job to process raw weblogs
#!/usr/bin/env python
import sys, os
import codecs
import datetime
import mrjob.job
import mrjob.protocol
import simplejson as json
import math
#!/usr/bin/env python
import mechanize
import logging
import sys, os
# Confluence (Mozilla "mana" wiki) connection settings.
# SECURITY NOTE(review): credentials are hardcoded in source. These look like
# placeholder values, but real ones should come from the environment or a
# config file outside version control, not from this script.
USER = 'someuser@mozilla.com'
PASSWORD = 'somepassword'
CONFLUENCE_BASE_URL = "https://mana.mozilla.org"
# Admin page listing the search-index queue — presumably scraped via
# mechanize below; the rest of the script is not visible in this capture.
CONFLUENCE_MANAGE_INDEX_URL = "https://mana.mozilla.org/wiki/admin/viewindexqueue.action"
-- Preamble for the Socorro module-list job: register the jars that provide
-- the UDFs and JSON (Jackson) parsing, then set job-level knobs.
REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar'
REGISTER 'akela-0.6-SNAPSHOT.jar'
register 'jackson-core-2.0.6.jar'
register 'jackson-databind-2.0.6.jar'
register 'jackson-annotations-2.0.6.jar'
SET pig.logfile socorro-modulelist.log;
-- 30 reducers by default for this job.
SET default_parallel 30;
-- Map-output compression disabled; the Snappy alternative was tried and
-- deliberately left commented out below.
SET mapred.compress.map.output false;
/* SET mapred.map.output.compression.codec org.apache.hadoop.io.compress.SnappyCodec; */
# Jython Pig UDF header: imports Pig's counter/status helpers so the UDF can
# report metrics back to the Pig job.
import os, sys
import org.apache.pig.tools.pigstats.PigStatusReporter as PigStatusReporter
import org.apache.pig.tools.counters.PigCounterHelper as PigCounterHelper
import org.apache.pig.impl.util.UDFContext as UDFContext
# Module-level counter helper shared by the UDF(s) below.
reporter = PigCounterHelper()
# Declares the Pig schema of the decorated UDF's return value: a bag of
# module tuples. NOTE(review): the decorated `def` itself is not visible in
# this capture — the function body follows in the original gist.
@outputSchema('modules:bag{t:tuple(filename:chararray,version:chararray,debug_file:chararray,debug_id:chararray,base_addr:chararray,max_addr:chararray)}')
#!/usr/bin/env python2
import sys
# Accumulator for grouped results — presumably filled in by code that is not
# visible in this capture.
grouped = {}
# Split one input line into at most 10 space-separated fields.
# NOTE(review): indentation was lost in this capture and the function body is
# truncated after the split — restore both before running.
def parse_line(linex):
parts = linex.split(' ', 9)
# TODO
#
# [1] restrict to valid firefox versions
-- ADI (active daily installs) report columns: bucket ping dates into
-- 168-hour (weekly) slices and label the dimensions.
-- NOTE(review): this query is truncated in the capture — no FROM/JOIN/WHERE
-- clauses are visible; `adi` and `l` (presumably a country lookup table) are
-- defined in the missing portion.
SELECT DATE(TIME_SLICE(adi.bl_date, 168, 'hour', 'start')) AS "Ping Date" ,
       adi.v_prod_major AS "Product Version" ,
       l.country_name AS "Country" ,
       adi.locale AS "Locale" ,
       adi.channel AS "Release Channel" ,
name total_usable_slots total_usable_disk net_price num_hosts per_node_price
----------- -------------------- ------------------- ----------- ----------- ----------------
m2.4xlarge 42 11550 8.26 7 1.18
m1.xlarge 40 33000 9.2 20 0.46
d2.4xlarge 42 71910 10.44 3 3.48
m2.2xlarge 40 16400 11.8 20 0.59
d2.2xlarge 42 83790 12.18 7 1.74
i2.4xlarge 42 9510 12.21 3 4.07
d2.8xlarge 68 95940 13.92 2 6.96
i2.2xlarge 42 10990 14.21 7 2.03