smoll/add_up_single_column_in_tsv.rb

## add_up_single_column_in_tsv.rb
require 'benchmark'
require 'smarter_csv'
require 'bigdecimal' # using BigDecimal instead of Float
require 'bigdecimal/util'

# modify these variables
filename = "new_statement_detail_Yorck_Eysel_10942_Q12014.txt" # put the real path to your .txt file here
rows_per_chunk = 10000 # processes this many rows in each "chunk"
name_of_column = "Label Share Net Receipts"
number_format = "EU" # set this to "EU" or "US"
# end modify section

raise "Set number_format to either \"EU\" or \"US\"!" unless number_format == "US" or number_format == "EU"
col = name_of_column.downcase.gsub( /\s/, '_' ).gsub( "'", "" ).to_sym #  turn "Label Share Net Receipts" into :label_share_net_receipts

options = {
  :chunk_size => rows_per_chunk,
  :col_sep => "\t",
  :row_sep => "\n",
  :strip_chars_from_headers => /[\-']/, # if any header columns have apostrophes, this removes them so we don't get a MalformedCSV exception
  :force_simple_split => true,
  :key_mapping => {col => :receipts}, # this remaps something like :label_net_share_receipts to :receipts in the hash so it's less verbose
  :remove_unmapped_keys => true # this ignores columns that aren't mapped, speeds up processing
}

row_count = 0
receipts_total = 0
b = Benchmark.measure do
  chunk_num = 0
  puts "Using #{options[:chunk_size]} row chunks!"
  SmarterCSV.process(filename, options) do |chunk|
    # if we need to debug the chunks due to errors, uncomment the following line:
    #puts chunk.inspect
    chunk_num += 1
    puts "Processing chunk: #{chunk_num}" # this will print a line for each chunk that is processed, to see progress. You can comment this out.

    if chunk_num == 1 # logic to double-check number format, but only do it for the first chunk
      commas = chunk.inject(0) do |c, hash|
        c + hash[:receipts].to_s.scan(",").size # increment if value contains a comma
      end
      periods = chunk.inject(0) do |p, hash|
        p + hash[:receipts].to_s.scan(".").size # increment if value contains a comma
      end

      # Todo: change these puts to raises when we implement this in our test suite:
      puts "Found more periods than commas...are you sure number format is EU?" if number_format == "EU" && periods > commas
      puts "Found more commas than periods...are you sure number format is US?" if number_format == "US" && commas > periods
    end

    receipts_total += chunk.inject(0) do |sum, hash|
      row_count += 1

      if number_format == "EU"
        hash[:receipts].gsub!(".","_") # Ruby uses "_" as a locale-agnostic thousands separator
        hash[:receipts].gsub!(",",".") # use period as decimal delimiter
      end

      sum + hash[:receipts].to_d # convert each entry to BigDecimal
    end
  end
end
puts "-"*80
puts "File row count: #{row_count}"
puts "receipts_total final: #{receipts_total.to_s('F')}"
puts "Rounted to 2 decimals: #{receipts_total.round(2).to_s('F')}"
puts "-"*80
puts "Total time to process was #{b.total.round(0)} seconds"
	require 'benchmark'
	require 'smarter_csv'
	require 'bigdecimal' # using BigDecimal instead of Float
	require 'bigdecimal/util'

	# modify these variables
	filename = "new_statement_detail_Yorck_Eysel_10942_Q12014.txt" # put the real path to your .txt file here
	rows_per_chunk = 10000 # processes this many rows in each "chunk"
	name_of_column = "Label Share Net Receipts"
	number_format = "EU" # set this to "EU" or "US"
	# end modify section

	raise "Set number_format to either \"EU\" or \"US\"!" unless number_format == "US" or number_format == "EU"
	col = name_of_column.downcase.gsub( /\s/, '_' ).gsub( "'", "" ).to_sym # turn "Label Share Net Receipts" into :label_share_net_receipts

	options = {
	:chunk_size => rows_per_chunk,
	:col_sep => "\t",
	:row_sep => "\n",
	:strip_chars_from_headers => /[\-']/, # if any header columns have apostrophes, this removes them so we don't get a MalformedCSV exception
	:force_simple_split => true,
	:key_mapping => {col => :receipts}, # this remaps something like :label_net_share_receipts to :receipts in the hash so it's less verbose
	:remove_unmapped_keys => true # this ignores columns that aren't mapped, speeds up processing
	}

	row_count = 0
	receipts_total = 0
	b = Benchmark.measure do
	chunk_num = 0
	puts "Using #{options[:chunk_size]} row chunks!"
	SmarterCSV.process(filename, options) do \|chunk\|
	# if we need to debug the chunks due to errors, uncomment the following line:
	#puts chunk.inspect
	chunk_num += 1
	puts "Processing chunk: #{chunk_num}" # this will print a line for each chunk that is processed, to see progress. You can comment this out.

	if chunk_num == 1 # logic to double-check number format, but only do it for the first chunk
	commas = chunk.inject(0) do \|c, hash\|
	c + hash[:receipts].to_s.scan(",").size # increment if value contains a comma
	end
	periods = chunk.inject(0) do \|p, hash\|
	p + hash[:receipts].to_s.scan(".").size # increment if value contains a comma
	end

	# Todo: change these puts to raises when we implement this in our test suite:
	puts "Found more periods than commas...are you sure number format is EU?" if number_format == "EU" && periods > commas
	puts "Found more commas than periods...are you sure number format is US?" if number_format == "US" && commas > periods
	end

	receipts_total += chunk.inject(0) do \|sum, hash\|
	row_count += 1

	if number_format == "EU"
	hash[:receipts].gsub!(".","_") # Ruby uses "_" as a locale-agnostic thousands separator
	hash[:receipts].gsub!(",",".") # use period as decimal delimiter
	end

	sum + hash[:receipts].to_d # convert each entry to BigDecimal
	end
	end
	end
	puts "-"*80
	puts "File row count: #{row_count}"
	puts "receipts_total final: #{receipts_total.to_s('F')}"
	puts "Rounted to 2 decimals: #{receipts_total.round(2).to_s('F')}"
	puts "-"*80
	puts "Total time to process was #{b.total.round(0)} seconds"