Skip to content

Instantly share code, notes, and snippets.

@demimismo
Created October 28, 2008 14:59
Show Gist options
  • Save demimismo/20399 to your computer and use it in GitHub Desktop.
Save demimismo/20399 to your computer and use it in GitHub Desktop.
# I use this method to parse large csv files, clean the data and insert
# it in a database after some cleaning, it's pretty fast
# you can use it like this:
load_csv_data(PRELOAD_DIR+'socios.csv', 'users') do |csv, thing_id, row|
csv << [thing_id,
row['NOMBRE'].to_s.to_permalink+'.'+row['APELLIDOS'].to_s.to_permalink, # login
row['E_MAIL'], # email
row['SEXO'], # gender
(row['NO_DATOS'] == 'S' || row['NO_DATOS'] == 'VERDADERO' ? 1 : 0 ), # no_datos
]
end
def load_csv_data(file_name, table_name)
STDERR.print("\nParsing '#{file_name}' to insert data in '#{table_name}'\n")
n_lines = (%x[wc -l #{file_name}]).split(' ')[0].to_f
time_start = Time.now
begin
require 'fastercsv'
count = 0.0
csv_string = FasterCSV.open("data.csv", "w", { :col_sep => ";", :force_quotes => true }) do |csv|
FasterCSV.open("#{file_name}", { :col_sep => "\t", :headers => :first_row }).each do |row|
thing_id = row[0].to_i
if thing_id && thing_id > 0
yield csv, thing_id, row
percentage = ((count += 1) * 100.0)/n_lines
STDERR.print "%.0f%%..." % percentage if (percentage.modulo(2.0) < 0.001)
end
end
end
rescue EOFError
f.close
end
total_lines = (%x[wc -l data.csv]).split(' ')[0].to_i
STDERR.print("\nInserting #{total_lines} lines in table '#{table_name}'\n")
ActiveRecord::Base.connection.execute("LOAD DATA INFILE '#{File.join(File.dirname(__FILE__), '..', '..', 'data.csv')}' IGNORE INTO TABLE #{table_name} FIELDS TERMINATED BY ';' ENCLOSED BY '\"';")
warn("Finished importing '#{file_name}'.\nLines: #{n_lines.to_i.to_s} (#{(n_lines.to_i - total_lines.to_i).to_s} lost)\nTime: %.2f minutes\n" % ((Time.now - time_start)/60))
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment