sovetnik/dexter.rb

## dexter.rb
require 'date'

# Splits a time-series CSV file into one CSV file per calendar month or per
# ISO week, written next to the source file.
#
# The header line is read once at construction. Every split re-opens the
# source and streams it line by line, buffering only the rows of the bucket
# currently being collected, so by_week and by_month can be called in any
# order on the same instance.
class Dexter
  # file = "/home/Rfiles/EURUSD.csv"

  # Leading "YYYY?MM" of a row; the separator may be any single character.
  MONTH_STAMP = /\A(\d{4}).(\d{2})/
  # Leading "YYYY?MM?DD " of a row; the trailing space is required.
  DAY_STAMP = /\A(\d{4}).(\d{2}).(\d{2}) /

  # @param file [String] path of the source CSV; its first line is the header
  # @raise [SystemCallError] when the file cannot be opened
  def initialize(file)
    @pair = 'EU' # For output .csv name
    @path = file
    @directory = File.dirname(file)
    # to_s keeps an empty source file from raising on nil.
    @head = File.open(file, &:gets).to_s.chomp.split(',')
    @time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    p "Source file is #{@path}"
    p "directory is #{@directory}"
    p "pair is #{@pair}"
    p "head is #{@head}"
  end

  # Writes one "<pair>-<year>-<month>.csv" per run of consecutive rows that
  # share the same leading year and month.
  #
  # @return [nil]
  def by_month
    split_rows(method(:month_name)) do |row|
      stamp = row.scan(MONTH_STAMP)
      stamp unless stamp.empty?
    end
  end

  # Writes one "<pair>-<cwyear>-<cweek>.csv" per run of consecutive rows that
  # fall into the same ISO week of the same ISO year.
  #
  # @return [nil]
  def by_week
    split_rows(method(:week_name)) { |row| week_start(row) }
  end

  # @param moment [Date] any day of the week being named
  # @return [String] e.g. "EU-2007-1"
  def week_name(moment)
    [@pair, moment.cwyear, moment.cweek].join('-')
  end

  # @param moment [Array] year and month strings, possibly nested, as
  #   produced by String#scan (e.g. [["2007", "01"]]); it is not modified
  # @return [String] e.g. "EU-2007-01"
  def month_name(moment)
    [@pair, *moment].join('-')
  end

  # Writes the header followed by the given rows to "<directory>/<name>.csv"
  # and prints how long the bucket took since the previous one.
  #
  # @param name [String] output file name without the ".csv" extension
  # @param array [Array<String>] rows, each still carrying its line ending
  def create_csv(name, array)
    File.open("#{@directory}/#{name}.csv", 'w') do |csv_object|
      # The header always gets CRLF; rows keep whatever ending they had.
      csv_object << @head.join(',') << "\r\n"
      array.each { |row| csv_object << row }
    end
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    long = (now - @time).round(2)
    p "File #{name}.csv with #{array.size} rows created at #{long} seconds"
    @time = now
  end

  private

  # Streams the data rows of the source and writes one file per bucket.
  #
  # The block maps a row to its bucket key, or nil when the row carries no
  # usable date. Such rows stay in the bucket being collected, so a stray
  # blank line neither aborts the run nor creates a file of its own. Nothing
  # is written when no row yields a key.
  #
  # @param namer [#call] turns a bucket key into an output file name
  # @return [nil]
  def split_rows(namer)
    current = nil
    rows = []
    File.open(@path) do |source|
      source.gets # skip the header line, already kept in @head
      source.each_line do |row|
        moment = yield(row)
        if moment && moment != current
          unless current.nil?
            create_csv(namer.call(current), rows)
            rows = []
          end
          current = moment
        end
        rows << row
      end
    end
    create_csv(namer.call(current), rows) unless current.nil?
    p 'Mission complete!'
    nil
  end

  # @param row [String] one data line of the source
  # @return [Date, nil] Monday of the row's ISO week, or nil when the row
  #   does not start with a valid date
  def week_start(row)
    stamp = DAY_STAMP.match(row)
    return unless stamp

    year, month, day = stamp.captures.map(&:to_i)
    return unless Date.valid_date?(year, month, day)

    date = Date.new(year, month, day)
    # Keying on the Monday makes both ISO year and ISO week part of the key.
    date - (date.cwday - 1)
  end
end

## gistfile1.txt
Dexter - a class that splits a time-series CSV file, row by row, into one file
per month or per week. It is meant for CSV files larger than a gigabyte.

Processing takes less than 0.5 GB of memory.

for example: EURUSD.csv

Time,Ask,Bid,AskVolume,BidVolume
2007.01.31 16:56:41.560,1.29956,1.29941,17.5,8.8
2007.01.31 16:56:41.660,1.2996,1.29945,32.7,15.7
2007.01.31 16:56:41.670,1.29943,1.29938,22.9,23.2
2007.01.31 16:56:41.769,1.29952,1.29947,40.7,15
2007.01.31 16:56:41.907,1.2996,1.29945,20.8,27.7
2007.01.31 16:56:42.006,1.29948,1.29938,0.8,4.8

usage:

victim = "/home/Rfiles/EURUSD.csv"
carnage = Dexter.new victim
carnage.by_week # splits at week boundaries, i.e. over the weekend
carnage.by_month # yes, by month

Output (by week):

"File EU-2006-52.csv with 2847 rows created at 0.09 seconds"
"File EU-2007-1.csv with 310311 rows created at 6.81 seconds"
	require 'date'

	# Splits a time-series CSV file into one CSV file per calendar month or per
	# ISO week, written next to the source file.
	#
	# The header line is read once at construction. Every split re-opens the
	# source and streams it line by line, buffering only the rows of the bucket
	# currently being collected, so by_week and by_month can be called in any
	# order on the same instance.
	class Dexter
	  # file = "/home/Rfiles/EURUSD.csv"

	  # Leading "YYYY?MM" of a row; the separator may be any single character.
	  MONTH_STAMP = /\A(\d{4}).(\d{2})/
	  # Leading "YYYY?MM?DD " of a row; the trailing space is required.
	  DAY_STAMP = /\A(\d{4}).(\d{2}).(\d{2}) /

	  # @param file [String] path of the source CSV; its first line is the header
	  # @raise [SystemCallError] when the file cannot be opened
	  def initialize(file)
	    @pair = 'EU' # For output .csv name
	    @path = file
	    @directory = File.dirname(file)
	    # to_s keeps an empty source file from raising on nil.
	    @head = File.open(file, &:gets).to_s.chomp.split(',')
	    @time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
	    p "Source file is #{@path}"
	    p "directory is #{@directory}"
	    p "pair is #{@pair}"
	    p "head is #{@head}"
	  end

	  # Writes one "<pair>-<year>-<month>.csv" per run of consecutive rows that
	  # share the same leading year and month.
	  #
	  # @return [nil]
	  def by_month
	    split_rows(method(:month_name)) do |row|
	      stamp = row.scan(MONTH_STAMP)
	      stamp unless stamp.empty?
	    end
	  end

	  # Writes one "<pair>-<cwyear>-<cweek>.csv" per run of consecutive rows that
	  # fall into the same ISO week of the same ISO year.
	  #
	  # @return [nil]
	  def by_week
	    split_rows(method(:week_name)) { |row| week_start(row) }
	  end

	  # @param moment [Date] any day of the week being named
	  # @return [String] e.g. "EU-2007-1"
	  def week_name(moment)
	    [@pair, moment.cwyear, moment.cweek].join('-')
	  end

	  # @param moment [Array] year and month strings, possibly nested, as
	  #   produced by String#scan (e.g. [["2007", "01"]]); it is not modified
	  # @return [String] e.g. "EU-2007-01"
	  def month_name(moment)
	    [@pair, *moment].join('-')
	  end

	  # Writes the header followed by the given rows to "<directory>/<name>.csv"
	  # and prints how long the bucket took since the previous one.
	  #
	  # @param name [String] output file name without the ".csv" extension
	  # @param array [Array<String>] rows, each still carrying its line ending
	  def create_csv(name, array)
	    File.open("#{@directory}/#{name}.csv", 'w') do |csv_object|
	      # The header always gets CRLF; rows keep whatever ending they had.
	      csv_object << @head.join(',') << "\r\n"
	      array.each { |row| csv_object << row }
	    end
	    now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
	    long = (now - @time).round(2)
	    p "File #{name}.csv with #{array.size} rows created at #{long} seconds"
	    @time = now
	  end

	  private

	  # Streams the data rows of the source and writes one file per bucket.
	  #
	  # The block maps a row to its bucket key, or nil when the row carries no
	  # usable date. Such rows stay in the bucket being collected, so a stray
	  # blank line neither aborts the run nor creates a file of its own. Nothing
	  # is written when no row yields a key.
	  #
	  # @param namer [#call] turns a bucket key into an output file name
	  # @return [nil]
	  def split_rows(namer)
	    current = nil
	    rows = []
	    File.open(@path) do |source|
	      source.gets # skip the header line, already kept in @head
	      source.each_line do |row|
	        moment = yield(row)
	        if moment && moment != current
	          unless current.nil?
	            create_csv(namer.call(current), rows)
	            rows = []
	          end
	          current = moment
	        end
	        rows << row
	      end
	    end
	    create_csv(namer.call(current), rows) unless current.nil?
	    p 'Mission complete!'
	    nil
	  end

	  # @param row [String] one data line of the source
	  # @return [Date, nil] Monday of the row's ISO week, or nil when the row
	  #   does not start with a valid date
	  def week_start(row)
	    stamp = DAY_STAMP.match(row)
	    return unless stamp

	    year, month, day = stamp.captures.map(&:to_i)
	    return unless Date.valid_date?(year, month, day)

	    date = Date.new(year, month, day)
	    # Keying on the Monday makes both ISO year and ISO week part of the key.
	    date - (date.cwday - 1)
	  end
	end
	Dexter - a class that splits a time-series CSV file, row by row, into one file
	per month or per week. It is meant for CSV files larger than a gigabyte.

	Processing takes less than 0.5 GB of memory.

	for example: EURUSD.csv

	Time,Ask,Bid,AskVolume,BidVolume
	2007.01.31 16:56:41.560,1.29956,1.29941,17.5,8.8
	2007.01.31 16:56:41.660,1.2996,1.29945,32.7,15.7
	2007.01.31 16:56:41.670,1.29943,1.29938,22.9,23.2
	2007.01.31 16:56:41.769,1.29952,1.29947,40.7,15
	2007.01.31 16:56:41.907,1.2996,1.29945,20.8,27.7
	2007.01.31 16:56:42.006,1.29948,1.29938,0.8,4.8

	usage:

	victim = "/home/Rfiles/EURUSD.csv"
	carnage = Dexter.new victim
	carnage.by_week # splits at week boundaries, i.e. over the weekend
	carnage.by_month # yes, by month

	Output (by week):

	"File EU-2006-52.csv with 2847 rows created at 0.09 seconds"
	"File EU-2007-1.csv with 310311 rows created at 6.81 seconds"