Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
processing large csv files in ruby
class ZendeskTicketsJob
extend Resque::Plugins::ExponentialBackoff
@queue = :low
FIELDS = ['zendesk_id', 'requester_id', 'assignee_id', 'group', 'subject', 'tags', 'status', 'priority', 'via', 'ticket_type', 'created_at', 'assigned_at', 'solved_at', 'resolution_time', 'satisfaction', 'group_stations', 'assignee_stations', 'reopens', 'replies', 'first_reply_time_in_minutes', 'first_reply_time_in_minutes_within_business_hours', 'first_resolution_time_in_minutes', 'first_resolution_time_in_minutes_within_business_hours', 'full_resolution_time_in_minutes', 'full_resolution_time_in_minutes_within_business_hours', 'agent_wait_time_in_minutes', 'agent_wait_time_in_minutes_within_business_hours', 'requester_wait_time_in_minutes', 'requester_wait_time_in_minutes_within_business_hours', 'reservation_code', 'requires_manual_closing']
def self.perform(url)
`rm /tmp/zendesk_tickets*`
`wget #{url} -O /tmp/zendesk_tickets.csv.zip`
`unzip -p /tmp/zendesk_tickets.csv.zip > /tmp/zendesk_tickets.csv`
# IO.foreach doesn't read the entire file into memory at once, which is good since a standard FasterCSV.parse on this file can take an hour or more
lines = []
IO.foreach('/tmp/zendesk_tickets.csv') do |line|
lines << line
if lines.size >= 1000
lines = FasterCSV.parse(lines.join) rescue next
store lines
lines = []
end
end
store lines
end
def self.store lines
return if lines.blank?
puts "Storing #{lines.size} rows..."
connection.execute("
INSERT INTO zendesk_tickets (#{fields_to_sql FIELDS})
VALUES #{data_to_sql lines}
ON DUPLICATE KEY
UPDATE #{fields_to_update_sql FIELDS[1..-1]}
")
end
def self.fields_to_sql fields
fields.collect{|f| connection.quote_column_name(f) }.join(',')
end
def self.data_to_sql lines
data = lines.collect do |v|
# ... some manipulation of the data into our own formats
v
end.compact
data.collect do |row|
"(#{row.collect{|f| connection.quote(f)}.join(',')})"
end.join(', ')
end
def self.connection
@connection ||= ZendeskTicket.new.connection
end
def self.fields_to_update_sql fields
fields.collect do |f|
qcn = connection.quote_column_name(f)
"#{qcn} = VALUES(#{qcn})"
end.join(', ')
end
end
@sgringwe

This comment has been minimized.

Copy link

@sgringwe sgringwe commented Aug 12, 2014

What kind of performance did this yield for you?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment