Created
January 3, 2012 19:12
-
-
Save matthayes/1556399 to your computer and use it in GitHub Desktop.
Compute quantiles for temperature data using DataFu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Generates random Gaussian temperature readings by streaming per-sensor
-- distribution parameters through the rand_gaussian.py script.

-- Remove any existing output path before storing.
rmf temperature2.txt;

-- External generator script, shipped along with the job.
DEFINE rand_gaussian `rand_gaussian.py` SHIP('rand_gaussian.py');

-- Load distributions and counts of random data to generate.
-- temperature_mean_stdev.txt will generate 1 billion measurements for each of
-- three sensors.
sensor_params_raw = LOAD 'temperature_mean_stdev.txt'
    AS (id:chararray, mean:double, stdev:double, count:long, dummy:int);

-- NOTE(review): DISTINCT ... PARALLEL 80 presumably exists to force a reduce
-- phase so the generation work is spread over 80 reducers - confirm.
sensor_params_unique = DISTINCT sensor_params_raw PARALLEL 80;

-- Keep only the four fields passed to the script; the dummy column is dropped.
sensor_params = FOREACH sensor_params_unique GENERATE id, mean, stdev, count;

-- Stream distributions through the python script to generate random data.
temperature = STREAM sensor_params THROUGH rand_gaussian AS (id:chararray, temp:double);

STORE temperature INTO 'temperature2.txt';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubystats'

# Writes synthetic temperature readings to temperature.txt, one
# "<sensor id>\t<reading>" line per sample. Each of the three imaginary
# sensors is modelled as a normal distribution (mean, stdev) and is asked
# for 10,000 samples.
sensors = [
  { :id => 1, :mean => 60.0, :stdev => 5.0 },
  { :id => 2, :mean => 50.0, :stdev => 10.0 },
  { :id => 3, :mean => 40.0, :stdev => 3.0 }
]

# The block form of File.open closes the file once all sensors are written.
File.open('temperature.txt', 'w') do |out|
  sensors.each do |params|
    sensor_id = params[:id]
    distribution = Rubystats::NormalDistribution.new(params[:mean], params[:stdev])
    readings = distribution.rng(10000)
    readings.each { |reading| out.write("#{sensor_id}\t#{reading}\n") }
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(1,(30.524038,56.62764,60.000134,63.372384,90.561695))
(2,(-9.845137,43.25512,49.999536,56.74441,109.714687))
(3,(21.564769,37.976644,40.000025,42.023622,58.057268))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# encoding: utf-8
# Streaming helper for generate_temperature_data.pig: turns each input record
# into a large number of random gaussian samples, so Pig can generate massive
# amounts of data.
#
# Input  (stdin):  one tab-separated record per line: id, mean, stdev, count
# Output (stdout): `count` lines per record, each "id<TAB>sample"
import random
import sys


def emit_samples(sensor_id, mean, stdev, remaining):
    # Write `remaining` draws from gauss(mean, stdev) to stdout, each tagged
    # with the sensor id. A plain countdown is used so no sequence of
    # `remaining` elements is ever materialised, whatever the Python version.
    emit = sys.stdout.write
    while remaining > 0:
        sample = random.gauss(mean, stdev)
        emit('%s\t%f\n' % (sensor_id, sample))
        remaining -= 1


for record in sys.stdin:
    # Exactly four fields are expected; anything else raises ValueError here.
    sensor_id, mean_text, stdev_text, count_text = record.strip().split('\t')
    sample_count = int(count_text)
    mu = float(mean_text)
    sigma = float(stdev_text)
    emit_samples(sensor_id, mu, sigma, sample_count)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(1,(30.524038,55.993967,59.488968,62.775554,90.561695))
(2,(-9.845137,41.95725,48.977708,55.554239,109.714687))
(3,(21.564769,37.569332,39.692373,41.666762,58.057268))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Computes the 0%, 25%, 50%, 75% and 100% quantiles of temperature for each
-- sensor id using DataFu's Quantile UDF and prints one tuple per sensor.
-- NOTE(review): assumes the DataFu jar is registered outside this script
-- (no REGISTER statement is present here) - confirm.
DEFINE Quartile datafu.pig.stats.Quantile('0.0','0.25','0.5','0.75','1.0');

temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
temperature = GROUP temperature BY id;

temperature_quartiles = FOREACH temperature {
  sorted = ORDER temperature BY temp; -- must be sorted
  GENERATE group AS id, Quartile(sorted.temp) AS quartiles;
}

-- Terminated with a semicolon like every other statement in the script.
DUMP temperature_quartiles;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Computes the 0%, 25%, 50%, 75% and 100% quantiles of temperature for each
-- sensor id using DataFu's StreamingQuantile UDF and prints one tuple per
-- sensor. Unlike Quantile, the input bag is passed in unsorted.
-- NOTE(review): StreamingQuantile is documented by DataFu as an approximate
-- estimator, so results may differ slightly from Quantile - confirm.
-- NOTE(review): assumes the DataFu jar is registered outside this script
-- (no REGISTER statement is present here) - confirm.
DEFINE Quartile datafu.pig.stats.StreamingQuantile('0.0','0.25','0.5','0.75','1.0');

temperature = LOAD 'temperature.txt' AS (id:chararray, temp:double);
temperature = GROUP temperature BY id;

temperature_quartiles = FOREACH temperature {
  -- sort not necessary
  GENERATE group AS id, Quartile(temperature.temp) AS quartiles;
}

-- Terminated with a semicolon like every other statement in the script.
DUMP temperature_quartiles;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(1,(41.58171454288797,56.24183579452584,59.61727093346221,62.919576028265375,79.2841731889925))
(2,(14.393515179526304,42.55929349057328,49.50432161293486,56.020101184758644,91.03574746442487))
(3,(29.865710766927595,37.64744333815733,39.84941055349095,41.77693877565934,51.31349575866486))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(1,(41.58171454288797,56.559375253601715,59.91093458980706,63.335574106080365,79.2841731889925))
(2,(14.393515179526304,43.39558395897533,50.081758806889766,56.54245916209963,91.03574746442487))
(3,(29.865710766927595,37.86257868882021,39.97075970657039,41.989584898364704,51.31349575866486))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment