Create a gist now

Instantly share code, notes, and snippets.

Clean Uruguay names & gender dataset
# encoding: UTF-8
require 'csv'
# Path of the input CSV. An optional first command-line argument overrides
# the default, so the script can be pointed at another copy of the dataset
# without editing the source.
filename = ARGV.fetch(0, 'nombre_nacim_por_anio_y_sexo.csv')
# One (name, gender) entry together with per-gender frequency counts.
#
# Instances are built from raw row values (name, gender label, frequency),
# can be merged with other entries, and can be serialised with #to_row.
class Name
  # Year that #year_appearing subtracts the entry's year from.
  # NOTE(review): presumably the last year covered by the dataset - confirm.
  REFERENCE_YEAR = 2014

  attr_reader :name, :gender, :male_count, :female_count, :year

  # Whether a raw name value is usable.
  #
  # nil and blank (empty / whitespace-only) values are rejected; a blank
  # name carries no information and has no first letter to capitalise.
  #
  # @param name [String, nil] raw name value
  # @return [Boolean]
  def self.valid?(name)
    !!name && !name.to_s.strip.empty?
  end

  # @param name [String] raw name; normalised with #format
  # @param gender [String, nil] raw gender label; normalised with #translate
  # @param freq [#to_i] frequency, added to the count matching the gender
  #   (ignored when the gender translates to 'unknown')
  # @param year [Integer] year associated with the entry
  def initialize(name, gender, freq, year=1960)
    @year = year
    @name = format(name)
    @gender = translate(gender)
    @male_count = 0
    @female_count = 0
    if female?
      @female_count += freq.to_i
    elsif male?
      @male_count += freq.to_i
    end
  end

  # Normalises a name: surrounding whitespace removed, everything lowercased,
  # first character uppercased. An empty result is returned unchanged rather
  # than raising on the missing first character.
  #
  # Note: this intentionally keeps the original method name even though it
  # shadows Kernel#format inside this class.
  #
  # @param cad [String]
  # @return [String]
  def format(cad)
    cad = cad.strip.downcase
    return cad if cad.empty?

    cad[0] = cad[0].upcase
    cad
  end

  # Maps the gender labels 'femenino' / 'masculino' (case-insensitive,
  # surrounding whitespace ignored) to 'female' / 'male'. Anything else,
  # including nil, maps to 'unknown'.
  #
  # @param str [String, nil]
  # @return [String] 'female', 'male' or 'unknown'
  def translate(str)
    case str.to_s.strip.downcase
    when 'femenino' then 'female'
    when 'masculino' then 'male'
    else 'unknown'
    end
  end

  # @return [Boolean] true when the translated gender is 'female'
  def female?
    gender == 'female'
  end

  # @return [Boolean] true when the translated gender is 'male'
  def male?
    gender == 'male'
  end

  # Two entries are equal when both name and gender match; counts and year
  # are not considered. Comparing with a non-Name returns false instead of
  # raising.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Name) && @name == other.name && @gender == other.gender
  end

  # Adds the other entry's male and female counts to this entry's counts.
  #
  # @param other [Name]
  def merge(other)
    @male_count += other.male_count
    @female_count += other.female_count
  end

  # Orders entries by name only (gender is not a tie-breaker).
  #
  # @param other [Name]
  # @return [Integer, nil]
  def <=>(other)
    @name <=> other.name
  end

  # @return [String] human-readable summary: name, gender and total count
  def to_s
    # The total of both counters stands in for the frequency; no @freq
    # instance variable is ever assigned, so interpolating it printed nothing.
    "<Name: #{@name} | #{@gender} | #{male_count + female_count}>"
  end

  # Row for the output CSV. The last four columns are emitted as nil.
  #
  # @return [Array]
  def to_row
    #Name,years.appearing,count.male,count.female,prob.gender,obs.male,est.male,upper,lower
    [@name, year_appearing, male_count, female_count, prob_gender, nil, nil, nil, nil]
  end

  # @return [Integer] REFERENCE_YEAR minus the entry's year
  def year_appearing
    REFERENCE_YEAR - year
  end

  # Gender with the strictly larger count; 'unknown' on a tie.
  #
  # @return [String] 'male', 'female' or 'unknown'
  def prob_gender
    if male_count > female_count
      'male'
    elsif female_count > male_count
      'female'
    else
      'unknown'
    end
  end
end
# Aggregate the input rows into one Name per (name, gender) pair.
#
# Entries are indexed by the same fields Name#== compares, so each row is
# matched with a hash lookup instead of a linear scan over everything seen so
# far. Hash insertion order is preserved, so the resulting array has the same
# order as appending on first occurrence.
names_by_key = {}
CSV.foreach(filename, "r:windows-1252") do |row|
  # Column 2 holds the name, column 1 the gender label, column 3 the frequency.
  next unless Name.valid?(row[2])

  new_name = Name.new(row[2], row[1], row[3])
  key = [new_name.name, new_name.gender]
  old_name = names_by_key[key]
  if old_name
    old_name.merge(new_name)
  else
    names_by_key[key] = new_name
  end
end
names = names_by_key.values

# Write one row per aggregated entry, ordered by name.
CSV.open("output_monte.csv", "w:iso-8859-1") do |csv|
  names.sort.each { |name| csv << name.to_row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment