Skip to content

Instantly share code, notes, and snippets.

@matthayes
Created January 3, 2012 19:12
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthayes/1556399 to your computer and use it in GitHub Desktop.
Save matthayes/1556399 to your computer and use it in GitHub Desktop.
Compute quantiles for temperature data using DataFu
-- Generates random temperature readings by streaming per-sensor distribution
-- parameters through rand_gaussian.py and storing the result in temperature2.txt.

-- Clear the output location left over from any previous run.
rmf temperature2.txt;

-- Streaming command backed by the python script, which is shipped with the job.
DEFINE rand_gaussian `rand_gaussian.py` SHIP('rand_gaussian.py');

-- Load the distributions and counts of random data to generate.
-- temperature_mean_stdev.txt is set up to produce 1 billion measurements for
-- each of three sensors.
distribution_rows = LOAD 'temperature_mean_stdev.txt'
    AS (id:chararray, mean:double, stdev:double, count:long, dummy:int);

-- NOTE(review): DISTINCT ... PARALLEL 80 presumably spreads the rows over 80
-- reducers, with the dummy column keeping otherwise identical rows apart so
-- that the generation work is parallelised -- confirm against the input file.
distinct_rows = DISTINCT distribution_rows PARALLEL 80;

-- Keep only the four fields that rand_gaussian.py reads; dummy is dropped here.
generator_input = FOREACH distinct_rows GENERATE id, mean, stdev, count;

-- Stream the distributions through the python script to generate random data.
temperature = STREAM generator_input THROUGH rand_gaussian AS (id:chararray, temp:double);
STORE temperature INTO 'temperature2.txt';
require 'rubystats'
# Writes simulated readings for three imaginary temperature sensors to
# 'temperature.txt', one "<id>\t<value>" line per reading.
#
# Every sensor has its own normal distribution (mean and standard deviation);
# the distribution is asked for 10000 random values per sensor.
sensors = [
  { id: 1, mean: 60.0, stdev: 5.0 },
  { id: 2, mean: 50.0, stdev: 10.0 },
  { id: 3, mean: 40.0, stdev: 3.0 }
]

File.open('temperature.txt', 'w') do |out|
  sensors.each do |sensor|
    sensor_id = sensor[:id]
    distribution = Rubystats::NormalDistribution.new(sensor[:mean], sensor[:stdev])

    # Tab-separated so the file can be loaded as (id, temp) by the Pig scripts.
    distribution.rng(10000).each do |reading|
      out.write "#{sensor_id}\t#{reading}\n"
    end
  end
end
(1,(30.524038,56.62764,60.000134,63.372384,90.561695))
(2,(-9.845137,43.25512,49.999536,56.74441,109.714687))
(3,(21.564769,37.976644,40.000025,42.023622,58.057268))
#!/usr/bin/env python
# encoding: utf-8
# Used by generate_temperature_data.pig to generate massive amounts of random
# gaussian data through Pig.
#
# Reads tab-separated records (id, mean, stdev, count) from stdin and, for each
# record, writes `count` lines of "<id><TAB><value>" to stdout, where each value
# is drawn from a gaussian distribution with the given mean and stdev.
import itertools
import random
import sys


def emit_samples(line, out):
    """Write the random samples requested by one input record to ``out``.

    Args:
        line: One input record: four tab-separated fields holding the sensor
            id, the mean, the standard deviation and the number of samples.
            Surrounding whitespace (including the newline) is stripped first.
        out: Any object with a ``write(str)`` method, e.g. ``sys.stdout``.

    A blank line produces no output, as does a count of zero or less.

    Raises:
        ValueError: if the record does not have exactly four fields, or the
            mean, stdev or count cannot be parsed as a number.
    """
    record = line.strip()
    if not record:
        # Tolerate blank lines instead of failing on the tuple unpack below.
        return

    (i, mean, stdev, count) = record.split('\t')
    count = int(count)
    mean = float(mean)
    stdev = float(stdev)

    # itertools.repeat is lazy, so very large counts do not build a list
    # (unlike range() on Python 2); a negative count simply yields nothing.
    for _ in itertools.repeat(None, count):
        val = random.gauss(mean, stdev)
        out.write('%s\t%f\n' % (i, val))


def main(stdin=None, stdout=None):
    """Generate samples for every record read from ``stdin``.

    Both streams default to the process's ``sys.stdin`` / ``sys.stdout``; they
    are parameters only so the script can be driven with in-memory streams.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for line in source:
        emit_samples(line, sink)


# Only consume stdin when executed as a script (as Pig's STREAM does), not
# when the module is imported.
if __name__ == '__main__':
    main()
(1,(30.524038,55.993967,59.488968,62.775554,90.561695))
(2,(-9.845137,41.95725,48.977708,55.554239,109.714687))
(3,(21.564769,37.569332,39.692373,41.666762,58.057268))
-- Computes, per sensor id, the minimum, the three quartiles and the maximum of
-- the temperature readings in temperature.txt using DataFu's Quantile UDF.
-- NOTE(review): no REGISTER statement appears in this script; the DataFu jar is
-- assumed to be made available to Pig some other way -- confirm.
DEFINE Quartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');

temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);

-- Grouping yields one row per id holding a bag named after the grouped
-- relation (temperature) with all of that sensor's readings.
temperature_by_id = GROUP temperature BY id;

temperature_quartiles = FOREACH temperature_by_id {
  sorted = ORDER temperature BY temp; -- must be sorted
  GENERATE group AS id, Quartile(sorted.temp) AS quartiles;
}

DUMP temperature_quartiles;
-- Computes, per sensor id, the minimum, the three quartiles and the maximum of
-- the temperature readings in temperature.txt using DataFu's StreamingQuantile
-- UDF, which does not need its input sorted.
-- NOTE(review): no REGISTER statement appears in this script; the DataFu jar is
-- assumed to be made available to Pig some other way -- confirm.
DEFINE Quartile datafu.pig.stats.StreamingQuantile('0.0','0.25','0.5','0.75','1.0');

temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);

-- Grouping yields one row per id holding a bag named after the grouped
-- relation (temperature) with all of that sensor's readings.
temperature_by_id = GROUP temperature BY id;

temperature_quartiles = FOREACH temperature_by_id {
  -- sort not necessary
  GENERATE group AS id, Quartile(temperature.temp) AS quartiles;
}

DUMP temperature_quartiles;
(1,(41.58171454288797,56.24183579452584,59.61727093346221,62.919576028265375,79.2841731889925))
(2,(14.393515179526304,42.55929349057328,49.50432161293486,56.020101184758644,91.03574746442487))
(3,(29.865710766927595,37.64744333815733,39.84941055349095,41.77693877565934,51.31349575866486))
(1,(41.58171454288797,56.559375253601715,59.91093458980706,63.335574106080365,79.2841731889925))
(2,(14.393515179526304,43.39558395897533,50.081758806889766,56.54245916209963,91.03574746442487))
(3,(29.865710766927595,37.86257868882021,39.97075970657039,41.989584898364704,51.31349575866486))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment