Skip to content

Instantly share code, notes, and snippets.

@henrikj242
Last active August 21, 2018 11:51
Show Gist options
  • Save henrikj242/d3c423afc964a08a05526d60bb4facd5 to your computer and use it in GitHub Desktop.
Save henrikj242/d3c423afc964a08a05526d60bb4facd5 to your computer and use it in GitHub Desktop.
## -------------------------------------------------------------------------------------
# This script parses a csv file, groups customers (whom may occur more than once), based on their email.
# It attempts to include the most meaningful stem data (first name, last name etc) for each customer
# - This means taking the last value after sorting so to not use empty values, but it the same customer
# exists more than once with actual values in his stem data, it may become invalid.
# It generates a row for each customer, with their purchases listed in the same way as the input file:
# - All skus in one column, separated by a comma,
# - All order dates in one column, separated by a comma
# ... etc
# Certain columns (7-13) are skipped
# FirstName,LastName,Email,CountryCode,ShippingAddress.postal_code,ShippingAddress.city,BoughtItems,
# skip elements at index 7 - 13
# RegisteredItems, RegisteredItems.Sku, RegisteredItems.item_name, RegisteredItems.PurchaseDate, RegisteredItems.SerialNumber,
# RegisteredItems.RegisteredDate, RegisteredItems.BoughtFrom,
# BoughtItems.sku,BoughtItems.OrderId,BoughtItems.tax_amount,BoughtItems.name,BoughtItems.CurrencyCode,BoughtItems.OrderDate,BoughtItems.price,
# BoughtItems.ShippingDate,BoughtItems.quantity,BoughtItems.UsedVoucher
# group purchases by lower case email
# output conflicting rows in a separate file
require 'csv'
class Parser
def initialize
@debug = true
@source = 'in.csv'
# @customers = {}
@customers = {
'test@example.com' => {
first_name: 'John',
last_name: 'Doe',
email: 'test@example.com',
country_code: 'US',
postal_code: '1234',
city: 'Los Angeles',
bought_items: '-',
items: [
{
sku: 'abcd-1234',
order_id: '12345678',
tax_amount: '100',
product_name: 'Beoplay H2',
currency_code: 'US',
order_date: 'Jan 1 1907',
price: '200',
shipping_date: 'Jan 3 1907',
quantity: '42',
used_voucher: 'x'
}
]
}
}
end
def customer_values(first_name, last_name, email, country_code, postal_code, city, bought_items)
existing = @customers[email&.downcase]
if existing.nil?
{
first_name: first_name,
last_name: last_name,
email: email,
country_code: country_code,
postal_code: postal_code,
city: city,
bought_items: bought_items,
items: []
}
else
{
first_name: [first_name, existing[:first_name]].sort.last,
last_name: [last_name, existing[:last_name]].sort.last,
email: [email, existing[:email]].sort.last,
country_code: [country_code, existing[:country_code]].sort.last,
postal_code: [postal_code, existing[:postal_code]].sort.last,
city: [city, existing[:city]].sort.last,
bought_items: [bought_items, existing[:bought_items]].sort.last,
items: existing[:items]
}
end
end
def parse
i = 0
CSV.foreach(@source, quote_char: '"', col_sep: ',') do |row|
i += 1
# next if i < 185500
row.slice!(7,7)
first_name, last_name, email, country_code, postal_code, city, bought_items,
skus, order_ids, tax_amounts, product_names, currency_codes,
order_dates, prices, shipping_dates, quantities, vouchers = row.map(&:to_s)
customer = {email => customer_values(first_name, last_name, email, country_code, postal_code, city, bought_items)}
if skus
order_ids = order_ids.split(',')
tax_amounts = tax_amounts.split(',')
product_names = product_names.split(',')
currency_codes = currency_codes.split(',')
order_date = order_dates.split(',')
prices = prices.split(',')
shipping_dates = shipping_dates.split(',')
quantity = quantities.split(',')
voucher = vouchers.split(',')
skus.split(',').each_with_index do |sku, index|
customer[email][:items] << {
sku: sku,
order_id: (order_ids[index] if index < order_ids.size).to_s,
tax_amount: (tax_amounts[index] if index < tax_amounts.size).to_s,
product_name: (product_names[index] if index < product_names.size).to_s,
currency_code: (currency_codes[index] if index < currency_codes.size).to_s,
order_date: (order_dates[index] if index < order_dates.size).to_s,
price: (prices[index] if index < prices.size).to_s,
shipping_date: (shipping_dates[index] if index < shipping_dates.size).to_s,
quantity: (quantities[index] if index < quantities.size).to_s,
voucher: (vouchers[index] if index < vouchers.size).to_s
}
end
end
puts "DEBUG line #{i}; customer: #{customer}" if @debug
@customers[email.downcase] = customer[email]
break if i > 1000000 # first 150000 in less than 5 minutes, 10:25
end
end
def customers
@customers
end
end
def items_to_csv(items)
skus = []
order_id = []
tax_amount = []
product_name = []
currency_code = []
order_date = []
price = []
shipping_date = []
quantity = []
voucher = []
if (items.is_a?(Array) && items.size > 0)
items.each do |item|
skus << item[:sku]
order_id << item[:order_id]
tax_amount << item[:tax_amount]
product_name << item[:product_name]
currency_code << item[:currency_code]
order_date << item[:order_date]
price << item[:price]
shipping_date << item[:shipping_date]
quantity << item[:quantity]
voucher << item[:voucher]
end
end
'"' + skus.join(',') + '",' +
'"' + order_id.join(',') + '",' +
'"' + tax_amount.join(',') + '",' +
'"' + product_name.join(',') + '",' +
'"' + currency_code.join(',') + '",' +
'"' + order_date.join(',') + '",' +
'"' + price.join(',') + '",' +
'"' + shipping_date.join(',') + '",' +
'"' + quantity.join(',') + '",' +
'"' + voucher.join(',') + '"'
end
p = Parser.new
p.parse
File.open('out.csv', 'w') do |f|
p.customers.each do |k, v|
f.puts [
v[:first_name], v[:last_name], v[:email], v[:country_code],
v[:postal_code], v[:city], v[:bought_items]
].map{ |e| e.include?(',') ? "\"#{e}\"" : e }.join(',') + ',' + items_to_csv(v[:items])
end
end
# This script opens a csv file where some field contains several values (sku's).
# It generates a new file where each of these values determine a new row,
# in which the remaining fields are just copied.
# In this specific case, some other fields contain the same number of values,
# corresponding with the values in field[4] (purchase dates and order numbers)
# The output is formatted with a tab separator and all fields are quoted.
require 'csv'
def get_item(items, index)
items = items.split(',')
items[index]
end
File.open('destination.csv', 'w') do |file|
CSV.foreach('source.csv') do |row|
lines = []
skus = row[4].split(',').each_with_index do |sku, index|
line = [
row[0], row[1], row[2], row[3], sku, get_item(row[5], index), get_item(row[6], index), get_item(row[7], index)
]
lines << line
end
lines.each do |line|
file.puts line.map{|item| "\"#{item}\""}.join("\t")
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment