Skip to content

Instantly share code, notes, and snippets.

@ashrithr
Last active December 13, 2015 17:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashrithr/4950394 to your computer and use it in GitHub Desktop.
Save ashrithr/4950394 to your computer and use it in GitHub Desktop.
ruby random data generator based on probability used for analytics
#!/usr/bin/env ruby
# ---
# => Script to generate random data required for data analytics
# Author: Ashrith
# ---
require 'rubygems'
require 'fileutils'
require 'benchmark'
require 'parallel' #gem install parallel
require 'ruby-progressbar' #gem install ruby-progressbar
require 'optparse'
# => Command line arguments
# Parsed values are stored in @options (:lines, :path); the boolean switches
# set the @hive_data and @extras instance variables directly.
@options = {}
option_parser = OptionParser.new do |opts|
  executable_name = File.basename($PROGRAM_NAME)
  opts.banner = "Usage: #{executable_name} [options]"

  # DecimalInteger makes OptionParser reject non-numeric input (previously
  # "abc".to_i silently became 0 and produced a header-only file).
  opts.on("-l LINES", "--lines LINES", OptionParser::DecimalInteger,
          "number of lines to generate to analytics file") do |lines|
    unless lines > 0
      raise OptionParser::InvalidArgument, "LINES must be a positive integer"
    end
    @options[:lines] = lines
  end

  opts.on("-m", "--multiple-tables", "Generate data for hive") do
    @hive_data = true
  end

  opts.on("-p PATH", "--output-path PATH",
          "Path where output should be generated to") do |path|
    @options[:path] = path
  end

  opts.on("-e", "--extra-data", "generate extra data") do
    @extras = true
  end

  opts.on("-h", "--help", "Help") do
    puts opts
    exit
  end
end # => end option_parser

# Any parse failure (unknown switch, missing or invalid argument) prints the
# error followed by the usage text and exits with a non-zero status.
begin
  option_parser.parse!
rescue OptionParser::ParseError => e
  STDERR.puts e.message
  STDERR.puts option_parser
  exit 1
end

# --lines is the only mandatory option
unless @options.key?(:lines)
  STDERR.puts 'Missing required argument --lines or -l'
  STDERR.puts option_parser
  exit 1
end # => end option_parser logic
# => Globals
# Total number of lines requested on the command line
@lines = @options[:lines]
# Number of lines each worker process generates in parallel mode
@lines_per_process = 500000
# Worker count: the integer quotient lines / lines_per_process once the request
# is larger than one process' share, otherwise a single process
@num_of_processes =
  if @lines > @lines_per_process
    @lines / @lines_per_process
  else
    1
  end
# Output path falls back to /tmp when --output-path was not supplied
@options[:path] = "/tmp" unless @options.key?(:path)
@cd = ","                             # column delimiter
@ld = "\n"                            # line delimiter
# @extras = false                     # generate extra data: name, phone no,
#                                     # email, address
@cid_start = 1000                     # first customer id
# Gender => weight, consumed by choose_weighted
@gender_with_probability = {
  male: 30,
  female: 70
}
@lifetime_days = 90                   # exclusive upper bound for days played
@friendcount_maxrange = 100           # largest friend count generated
@friendcount_zero_probability = 0.3   # chance that a user has no friends
# Compared against rand for eligible users in main_loop, so 0.5 means one in
# two eligible users becomes a paid subscriber (an older note here said 5%)
@paid_subscriber_percent = 0.5
@paid_subscriber_frndcount = 5        # friend-count pivot for paying users
# Age => weight, consumed by choose_weighted
@age_with_probability = {
  18 => 15, 19 => 12, 20 => 12,
  21 => 11, 22 => 11, 23 => 9,
  24 => 7,  25 => 6,  26 => 5,
  27 => 4,  28 => 3,  29 => 2,
  30 => 2
}
# Country => weight; weights are relative and need not add up to 100
@countries_with_probability = {
  "USA" => 60,
  "UK" => 25,
  "CANADA" => 5,
  "MEXICO" => 5,
  "GERMANY" => 10,
  "FRANCE" => 10,
  "EGYPT" => 5
}
# Game => weight used when the generated user is female
@games_female = {
  citygame: 50,
  pictionary: 30,
  scramble: 15,
  sniper: 5
}
# Game => weight used when the generated user is male
@games_male = {
  sniper: 70,
  scramble: 20,
  pictionary: 10,
  citygame: 10
}
# => Definitions
def choose_weighted(weighted)
  # => Returns an item picked at random from the passed hash, where each
  #    item's chance of being picked is proportional to its weight.
  #    All the weights in the hash must be non-negative integers.
  #    Returns nil when the hash is empty or the weights add up to zero.
  # Ex: marbles = { :black => 51, :white => 17 }
  #     3.times { puts choose_weighted(marbles) }
  total_weight = weighted.inject(0) { |acc, (_item, weight)| acc + weight }
  # rand(0) would return a Float, so there is nothing sensible to pick from
  return nil unless total_weight > 0

  # target is uniform over 0...total_weight
  target = rand(total_weight)
  # Walk the items, consuming each one's slice of the 0...total_weight range.
  # The comparison must be strict: an item of weight w owns exactly w values
  # (with <= it owned w + 1, starving later items and letting zero-weight
  # items be selected).
  weighted.each do |item, weight|
    return item if target < weight
    target -= weight
  end
  # Not reachable with non-negative integer weights; kept as a safe fallback
  nil
end
def gen_phone_num
  # => Builds a random phone number string shaped NNN-NNN-NNNN:
  #    two groups in 100..999 followed by a group in 1000..1999
  area_code = rand(900) + 100
  exchange = rand(900) + 100
  subscriber = rand(1000) + 1000
  [area_code, exchange, subscriber].join("-")
end
def gen_int_phone_num
  # => Builds a random international phone number string: the fixed prefix
  #    011, then a group in 1..100, a group in 10..109 and a group in
  #    1000..1999
  country_code = rand(100) + 1
  area_code = rand(100) + 10
  subscriber = rand(1000) + 1000
  ["011", country_code, area_code, subscriber].join("-")
end
def gen_email(name)
  # => Builds a random email address from a user's name: the first and last
  #    whitespace-separated words of the name, lower-cased and concatenated,
  #    followed by a number in 0..99 and a randomly chosen domain
  words = name.split
  local_part = (words.first + words.last).downcase
  # privacy.net is listed twice, which makes it twice as likely to be picked
  domains = %w(yahoo.com gmail.com privacy.net webmail.com msn.com
               hotmail.com example.com privacy.net)
  suffix = rand(100)
  domain = domains[rand(domains.size)]
  "#{local_part}#{suffix}@#{domain}"
end
def gen_date(from=0.0, to=Time.now)
  # => Picks a random Time between from and to.
  #    from defaults to 0.0 (the Unix epoch), to defaults to the current time.
  # Ex: gen_date
  #     gen_date(Time.local(2010, 1, 1))
  #     gen_date(Time.local(2010, 1, 1), Time.local(2010, 7, 1))
  span_in_seconds = to.to_f - from.to_f
  random_offset = rand * span_in_seconds
  Time.at(from + random_offset)
end
class Names
  # => Supplies random person names; first names depend on gender, last
  #    names are shared

  # Pool of last names used for both genders
  @@lastnames = %w(ABEL ANDERSON ANDREWS ANTHONY BAKER BROWN BURROWS CLARK
                   CLARKE CLARKSON DAVIDSON DAVIES DAVIS DENT EDWARDS GARCIA
                   GRANT HALL HARRIS HARRISON JACKSON JEFFRIES JEFFERSON JOHNSON
                   JONES KIRBY KIRK LAKE LEE LEWIS MARTIN MARTINEZ MAJOR MILLER
                   MOORE OATES PETERS PETERSON ROBERTSON ROBINSON RODRIGUEZ
                   SMITH SMYTHE STEVENS TAYLOR THATCHER THOMAS THOMPSON WALKER
                   WASHINGTON WHITE WILLIAMS WILSON YORKE)

  # Pool of male first names
  @@male_first_names =
    %w(ADAM ANTHONY ARTHUR BRIAN CHARLES CHRISTOPHER DANIEL DAVID DONALD EDGAR
       EDWARD EDWIN GEORGE HAROLD HERBERT HUGH JAMES JASON JOHN JOSEPH KENNETH
       KEVIN MARCUS MARK MATTHEW MICHAEL PAUL PHILIP RICHARD ROBERT ROGER RONALD
       SIMON STEVEN TERRY THOMAS WILLIAM)

  # Pool of female first names
  @@female_first_names =
    %w(ALISON ANN ANNA ANNE BARBARA BETTY BERYL CAROL CHARLOTTE CHERYL DEBORAH
       DIANA DONNA DOROTHY ELIZABETH EVE FELICITY FIONA HELEN HELENA JENNIFER
       JESSICA JUDITH KAREN KIMBERLY LAURA LINDA LISA LUCY MARGARET MARIA MARY
       MICHELLE NANCY PATRICIA POLLY ROBYN RUTH SANDRA SARAH SHARON SUSAN
       TABITHA URSULA VICTORIA WENDY)

  def self.initial
    # => Returns a random upper-case letter
    alphabet = ('A'..'Z').to_a
    position = rand(alphabet.length)
    alphabet[position]
  end

  def self.lastname
    # => Returns a random last name
    position = rand(@@lastnames.length)
    @@lastnames[position]
  end

  def self.female_name
    # => Returns "FIRST LAST" with a female first name
    first = @@female_first_names[rand(@@female_first_names.length)]
    [first, lastname].join(" ")
  end

  def self.male_name
    # => Returns "FIRST LAST" with a male first name
    first = @@male_first_names[rand(@@male_first_names.length)]
    [first, lastname].join(" ")
  end
end
class Address
  # => Supplies random address fragments; only US-style and UK-style pieces
  #    are available

  # Street name pool for address line 1
  @@street_names = %w( Acacia Beech Birch Cedar Cherry Chestnut Elm Larch Laurel
    Linden Maple Oak Pine Rose Walnut Willow Adams Franklin Jackson Jefferson
    Lincoln Madison Washington Wilson Churchill Tyndale Latimer Cranmer Highland
    Hill Park Woodland Sunset Virginia 1st 2nd 4th 5th 34th 42nd
  )

  # Street type suffixes for address line 1
  @@street_types = %w( St Ave Rd Blvd Trl Rdg Pl Pkwy Ct Circle )

  # Unit designators for address line 2
  @@line2types = ["Apt", "Bsmt", "Bldg", "Dept", "Fl", "Frnt", "Hngr", "Lbby",
    "Lot", "Lowr", "Ofc", "Ph", "Pier", "Rear", "Rm", "Side", "Slip", "Spc",
    "Stop", "Ste", "Trlr", "Unit", "Uppr"]

  # Two-letter US state codes (plus DC)
  @@us_states = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
                 "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA",
                 "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE",
                 "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI",
                 "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV",
                 "WY"]

  def self.address_line_1
    # => Returns "<number> <street name> <street type>", number in 0..3999
    house_number = rand(4000)
    street = @@street_names[rand(@@street_names.length)]
    kind = @@street_types[rand(@@street_types.length)]
    "#{house_number} #{street} #{kind}"
  end

  def self.address_line_2
    # => Returns "<unit designator> <number>", number in 0..998
    designator = @@line2types[rand(@@line2types.length)]
    unit_number = rand(999)
    "#{designator} #{unit_number}"
  end

  def self.zipcode
    # => Returns a five character, zero padded zip code (00000..99998)
    format("%05d", rand(99999))
  end

  def self.uk_post_code
    # => Returns a UK style post code: "<town><0..99> <0..99><two letters>"
    post_towns = %w(BM CB CV LE LI LS KT MK NE OX PL YO)
    outward_digits = rand(100).to_s
    inward_digits = rand(100).to_s
    letter_pairs = ("AA".."ZZ").to_a
    suffix = letter_pairs[rand(letter_pairs.length)]
    town = post_towns[rand(post_towns.length)]
    "#{town}#{outward_digits} #{inward_digits}#{suffix}"
  end

  def self.state
    # => Returns a random US state code
    @@us_states[rand(@@us_states.length)]
  end
end # end Address
# => MAIN LOOP
def main_loop(num_of_lines, cid_start, proc, progress=true)
  # => Generates num_of_lines random customer records and writes them, after a
  #    header line, to data file(s) under @options[:path].
  # num_of_lines: number of records to generate
  # cid_start:    customer id of the first record; ids increase by one
  # proc:         tag appended to the output file names, so that parallel
  #               workers write to distinct files
  # progress:     when true, a per-record progress bar is displayed
  # Files written:
  #   @hive_data set : analytics_customer<proc>.data, analytics_revenue<proc>.data
  #                    and analytics_facts<proc>.data
  #   otherwise      : analytics_<proc>.data (with name, email, phone and
  #                    address columns when @extras is set)
  # Returns nil.
  open_handles = [] # every file opened here, closed again in the ensure clause
  last_cid = cid_start + (num_of_lines - 1)
  #initialize the progressbar if progress = true
  progressbar = ProgressBar.create(:total => num_of_lines,
                                   :format => '%a |%b>>%i| %p%% %t') if progress
  #Create output dir if it does not exist. mkdir_p is a no-op for an existing
  #directory, so no File.exists? guard is needed (that method was removed in
  #Ruby 3.2)
  FileUtils.mkdir_p(@options[:path])

  #File management
  if @hive_data
    cust_file_handle =
      File.open("#{@options[:path]}/analytics_customer#{proc}.data", "w")
    open_handles << cust_file_handle
    revn_file_handle =
      File.open("#{@options[:path]}/analytics_revenue#{proc}.data", "w")
    open_handles << revn_file_handle
    fact_file_handle =
      File.open("#{@options[:path]}/analytics_facts#{proc}.data", "w")
    open_handles << fact_file_handle
  else
    output_file_handle =
      File.open("#{@options[:path]}/analytics_#{proc}.data", "w")
    open_handles << output_file_handle
  end

  #Header line(s)
  game_columns = %w(citygame_played pictionarygame_played scramblegame_played
                    snipergame_played)
  if @hive_data
    cust_file_handle.puts(
      %w(cid name gender register_data country).join(@cd))
    revn_file_handle.puts(%w(cid date revenue).join(@cd))
    fact_file_handle.puts((%w(cid date) + game_columns).join(@cd))
  else
    header = %w(cid gender age country registerdate)
    header += %w(name email phone address) if @extras
    header += %w(friend_count lifetime) + game_columns + %w(revenue paid)
    output_file_handle.puts(header.join(@cd))
  end # => End header generation

  (cid_start..last_cid).each do |cid|
    # => gender
    gender = choose_weighted(@gender_with_probability)
    # => Registration date (random date from 2010-1-1 to now)
    register_date = gen_date(Time.local(2010, 1, 1))
    # => age
    age = choose_weighted(@age_with_probability)
    # => country
    country = choose_weighted(@countries_with_probability)
    name = gender == :male ? Names.male_name : Names.female_name
    email = gen_email(name)
    phone = country == "USA" ? gen_phone_num : gen_int_phone_num
    # Only USA and UK get a generated address; every other country is "N/A"
    address =
      case country
      when "USA"
        "#{Address.address_line_1} #{Address.address_line_2}"\
        " #{Address.state} #{Address.zipcode}"
      when "UK"
        "#{Address.address_line_1} #{Address.address_line_2}"\
        " #{Address.uk_post_code}"
      else
        "N/A"
      end

    # => total_days user played
    total_days = rand(@lifetime_days)

    # => friends_count
    friend_count =
      if rand < @friendcount_zero_probability
        #these users do not have friends at all
        0
      elsif rand < 0.2
        #20% of the remaining users land in the upper friend-count range
        rand(@paid_subscriber_frndcount..@friendcount_maxrange)
      else
        #everybody else stays at or below the paid-subscriber pivot
        rand(0..@paid_subscriber_frndcount)
      end

    # => paid customer: only users with enough friends who played more than 10
    #    days are eligible, and of those a @paid_subscriber_percent share pays
    eligible = friend_count > @paid_subscriber_frndcount && total_days > 10
    paid_subscriber =
      (eligible && rand < @paid_subscriber_percent) ? "yes" : "no"

    # => revenue: 80% of paying users fall in 5..30, the rest in 30..99
    revenue =
      if paid_subscriber == "yes"
        rand < 0.8 ? rand(5..30) : rand(30..99)
      else
        0
      end

    # => games_played by user: one weighted game pick per day played
    city_counter = 0
    pictionary_counter = 0
    sniper_counter = 0
    scramble_counter = 0
    gender_game_hash = gender == :male ? @games_male : @games_female
    total_days.times do
      case choose_weighted(gender_game_hash)
      when :citygame
        city_counter += 1
      when :pictionary
        pictionary_counter += 1
      when :scramble
        scramble_counter += 1
      else
        sniper_counter += 1
      end
    end
    game_counts = [city_counter, pictionary_counter, scramble_counter,
                   sniper_counter]

    # => build the record(s) and write them out
    if @hive_data
      cust_file_handle.puts(
        [cid, name, gender, register_date, country].join(@cd))
      revn_file_handle.puts([cid, register_date, revenue].join(@cd))
      fact_file_handle.puts(([cid, register_date] + game_counts).join(@cd))
    else
      columns = [cid, gender, age, country, register_date]
      columns += [name, email, phone, address] if @extras
      columns += [friend_count, total_days] + game_counts +
                 [revenue, paid_subscriber]
      output_file_handle.puts(columns.join(@cd))
    end
    progressbar.increment if progress
  end
  nil
ensure
  # => close the file(s), also when record generation raised
  open_handles.each { |handle| handle.close unless handle.closed? }
end #end main_loop
#Benchmark the time taken to complete the program
time_took = Benchmark.measure do
  #parallel runs are used once the request spans more than one process' share
  if @num_of_processes > 1
    puts "Parallel mode, generating data to #{@options[:path]}"
    # Split the request into full chunks plus one smaller chunk for whatever
    # is left over, so the number of generated lines matches --lines exactly
    # (the leftover used to be dropped silently).
    full_chunks, leftover = @lines.divmod(@lines_per_process)
    chunk_sizes = Array.new(full_chunks, @lines_per_process)
    chunk_sizes << leftover if leftover > 0
    progress = ProgressBar.create(:total => chunk_sizes.size,
                                  :format => '%a |%b>>%i| %p%% %t')
    # The finish callback takes any number of arguments because the parallel
    # gem passes a different number of them depending on its version.
    Parallel.map(1..chunk_sizes.size,
                 :finish => lambda { |*_ignored| progress.increment }) do |process|
      # process doubles as the file-name tag and as the customer id offset,
      # which keeps the id ranges of the workers disjoint
      main_loop(chunk_sizes[process - 1],
                @cid_start + (@lines_per_process * process),
                process, false)
    end
  else
    puts "Single process mode, generating data to #{@options[:path]}"
    main_loop(@lines, @cid_start, 0)
  end
end #benchmark end
puts "Time took to generate #{@lines} : #{time_took}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment