Skip to content

Instantly share code, notes, and snippets.

@jhamon
Last active December 31, 2015 01:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jhamon/7916485 to your computer and use it in GitHub Desktop.
Save jhamon/7916485 to your computer and use it in GitHub Desktop.
#merge! will merge together time series data in O(m) time where m is the length of the shorter time series data set. Run specs with `rspec time_series_spec.rb --color --format doc`. See also `ruby time_series_benchmarks.rb` to see how #merge! performs on a very large receiver array. The non-bang #merge is always slow and has poor memory performa…
require 'date'
class TimeSeries
attr_reader :data, :date
def initialize(data, date)
raise ArgumentError unless data.is_a?(Array) && date.is_a?(Date)
raise ArgumentError, "TimeSeries cannot be empty" if data.empty?
@data = data
@date = date
end
def starts_before?(other_ts)
raise ArgumentError unless other_ts.is_a?(TimeSeries)
@date < other_ts.date
end
def dup
TimeSeries.new(@data.dup, @date.dup)
end
def merge(other_ts)
return self.dup if other_ts == self
raise ArgumentError unless other_ts.is_a?(TimeSeries)
smallest, largest = [self, other_ts].sort_by { |x| x.data.length }
dupped_large = largest.dup
dupped_large.merge!(smallest)
end
def merge!(other_ts)
return self if other_ts == self
raise ArgumentError unless other_ts.is_a?(TimeSeries)
if @data.size <= other_ts.data.size
@data.replace merge_self_onto_other(other_ts)
else
merge_other_onto_self!(other_ts)
end
@date = other_ts.date.dup unless starts_before?(other_ts)
self
end
private
def merge_other_onto_self!(other_ts)
time_delta = (@date - other_ts.date).to_i.abs
if other_ts.starts_before?(self)
base = pad_front_with_nils!(@data, time_delta)
time_delta = 0
else
base = @data
end
merge_onto_base_w_delta!(base, other_ts.data, time_delta)
end
def merge_self_onto_other(other_ts)
# This non-bang method is called becausethe receiver should never
# mutate its arguments.
time_delta = (@date - other_ts.date).to_i.abs
if starts_before?(other_ts)
base = pad_front_with_nils!(other_ts.data.dup, time_delta)
time_delta = 0
else
base = other_ts.data.dup
end
merge_onto_base_w_delta!(base, @data, time_delta)
end
def merge_onto_base_w_delta!(base_data, other_data, time_delta=0)
other_data.each_index do |i|
base_data[time_delta+i] = average(other_data[i], base_data[time_delta+i])
end
base_data
end
def average(a, b)
return a if b.nil?
return b if a.nil?
(a+b).fdiv(2)
end
def pad_front_with_nils!(arr, num_of_nils)
arr.unshift(*[nil]*num_of_nils)
end
end
require "benchmark"
require "./time_series.rb"
require "date"
today = Date.new(2013, 12, 11)
past_date = Date.new(2012, 1, 1)
future_date = Date.new(2016, 1, 1)
big_data = Array.new(10_000_000) { rand (1000) }
small_data = [1,2,3,4,5,6]
small_old = TimeSeries.new(small_data, past_date)
small_today = TimeSeries.new(small_data, today)
small_young = TimeSeries.new(small_data, future_date)
big_today = TimeSeries.new(big_data, today)
n = 100000
Benchmark.bmbm do |x|
x.report("#{n} runs: merge! small array into 10 million element array - temporally aligned") do
# Should be linear in length of small array
n.times do
big_today.merge!(small_today)
end
end
x.report("#{n} runs: merge! small array into 10 million element array - small data younger") do
# Should be linear in length of small array
n.times do
big_today.merge!(small_young)
end
end
x.report("#{n} runs: merge! small into 10 million element array - small data older") do
# Should be linear in length of small array
n.times do
big_today.merge!(small_old)
end
end
x.report("#{n} runs: merge! huge array into itself") do
# Should be constant time, since self merges just return self
n.times do
big_today.merge!(big_today)
end
end
end
require 'rspec'
require 'date'
require './time_series.rb'
require './time_series_spec_shared_stuff.rb'
describe "TimeSeries" do
let(:older) { Date.new(2013, 1, 1) }
let(:younger) { Date.new(2013, 1, 5) }
let(:long) { [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] }
let(:short) { [9, 8, 7, 6, 5, 4] }
let(:expected_merge_date) { older }
subject(:time_series) { TimeSeries.new([1,2,3,4], younger) }
it "should have a data array" do
expect(time_series).to respond_to(:data)
expect(time_series.data).to be_a(Array)
end
it "cannot have an empty data field" do
expect { TimeSeries.new([], older)}.to raise_error(ArgumentError)
end
it "should have a date" do
expect(time_series).to respond_to(:date)
expect(time_series.date).to be_a(Date)
end
describe "#starts_before?" do
let(:older_time_series) { TimeSeries.new(long, older)}
it "returns true if start date is earlier than other time series" do
expect(time_series.starts_before?(older_time_series)).to be_false
end
it "returns false if start date is compared with an older time series" do
expect(older_time_series.starts_before?(time_series)).to be_true
end
it "raises an error unless given another TimeSeries object" do
expect {time_series.starts_before?([1,2,3])}.to raise_error(ArgumentError)
end
end
describe "#dup" do
it "returns a copy with new object_id" do
expect(time_series.dup).not_to eq(time_series)
end
it "should also dup data and date attributes" do
dupped_series = time_series.dup
expect(dupped_series.date.object_id).not_to eq(time_series.date.object_id)
expect(dupped_series.data.object_id).not_to eq(time_series.data.object_id)
end
end
describe "#merge" do
let(:merge_method) { :merge }
context "when merging a TimeSeries with itself" do
include_context "self-merge"
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
context "when merging two non-overlapping series" do
include_context "non-overlapping"
it "uses nil to represent missing data" do
expect(ts1.merge(ts2).data).to include(nil)
end
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
context "when merging a shorter, younger series onto a longer, older one" do
include_context "shorter, younger onto longer, older"
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
context "when merging a shorter, older series onto a longer, younger one" do
include_context "shorter, older onto longer, younger"
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
context "when merging a longer, younger series onto a shorter, older one" do
include_context "longer, younger onto shorter, older"
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
context "when merging a longer, older series onto a shorter, younger one" do
include_context "longer, older onto shorter, younger"
it_behaves_like "a proper merge function"
it_behaves_like "a non-bang method"
end
end
describe "#merge!" do
let(:merge_method) { :merge! }
context "when merging a TimeSeries with itself" do
include_context "self-merge"
it "should return itself" do
expect(ts1.merge!(ts1)).to be(ts1)
end
it "should maintain a reference to the same data array" do
expect(ts1.data).to be(ts1.merge!(ts1).data)
end
it "should maintain a reference to the same date object" do
expect(ts1.date).to be(ts1.merge!(ts1).date)
end
end
context "when merging two non-overlapping series" do
include_context "non-overlapping"
it "uses nil to represent missing data" do
expect(ts1.merge!(ts2).data).to include(nil)
end
it_behaves_like "a proper merge function"
it_behaves_like "a dangerous (bang) method"
end
context "when merging a shorter, younger series onto a longer, older one" do
include_context "shorter, younger onto longer, older"
it_behaves_like "a proper merge function"
it_behaves_like "a dangerous (bang) method"
end
context "when merging a shorter, older series onto a longer, younger one" do
include_context "shorter, older onto longer, younger"
it_behaves_like "a proper merge function"
it_behaves_like "a dangerous (bang) method"
end
context "when merging a longer, younger series onto a shorter, older one" do
include_context "longer, younger onto shorter, older"
it_behaves_like "a proper merge function"
it_behaves_like "a dangerous (bang) method"
end
context "when merging a longer, older series onto a shorter, younger one" do
include_context "longer, older onto shorter, younger"
it_behaves_like "a proper merge function"
it_behaves_like "a dangerous (bang) method"
end
end
end
shared_examples "a non-bang method" do
it "should not mutate the receiver TimeSeries" do
copied_obj = ts1.dup
ts1.send(merge_method, ts2)
expect(ts1.date).to eq(copied_obj.date)
expect(ts1.data).to eq(copied_obj.data)
end
it "should not mutate the argument TimeSeries" do
copied_obj = ts2.dup
ts1.send(merge_method, ts2)
expect(ts2.date).to eq(copied_obj.date)
expect(ts2.data).to eq(copied_obj.data)
end
it "should produce a new object" do
expect(ts1.send(merge_method, ts2).object_id).not_to eq(ts1.object_id)
expect(ts1.send(merge_method, ts2).data.object_id).not_to eq(ts1.data.object_id)
expect(ts1.send(merge_method, ts2).date.object_id).not_to eq(ts1.date.object_id)
expect(ts1.send(merge_method, ts2).data.object_id).not_to eq(ts2.data.object_id)
expect(ts1.send(merge_method, ts2).date.object_id).not_to eq(ts2.date.object_id)
end
end
shared_examples "a dangerous (bang) method" do
it "should keep a reference to the same data array" do
expect(ts1.data.object_id).to eq(ts1.send(merge_method, ts2).data.object_id)
end
it "should mutate the receiver TimeSeries" do
copied_obj = ts1.dup
ts1.send(merge_method, ts2)
expect(ts1.data).not_to eq(copied_obj.data)
end
it "should not mutate the argument TimeSeries" do
copied_obj = ts2.dup
ts1.merge(ts2)
expect(ts2.date).to eq(copied_obj.date)
expect(ts2.data).to eq(copied_obj.data)
end
end
shared_examples "a proper merge function" do
it "returns a new TimeSeries" do
expect(ts1.send(merge_method, ts1)).to be_a(TimeSeries)
end
it "should be commutative" do
right_dup = ts1.dup
left_to_right_data = ts1.send(merge_method, ts2).data
right_to_left_data = ts2.send(merge_method, right_dup).data
expect(left_to_right_data).to eq(right_to_left_data)
end
it "should give the correct start date" do
expect(ts1.send(merge_method, ts2).date).to eq(expected_merge_date)
end
it "returns a TimeSeries that averages redundant data" do
expect(ts1.send(merge_method, ts2).data).to eq(expected_merge_result)
end
it "should not reference same data array as argument" do
expect(ts1.send(merge_method, ts2).data.object_id).not_to eq(ts2.data.object_id)
end
it "should not refence the same date object as the argument" do
expect(ts1.send(merge_method, ts2).date.object_id).not_to eq(ts2.date.object_id)
end
end
shared_context "self-merge" do
let(:ts1) { TimeSeries.new(short, older) }
let(:ts2) { ts1 }
let(:expected_merge_result) { ts1.data.dup }
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
# 9, 8, 7, 6, 5, 4
# 1, 2, 3, 4, 7, 7, 7, 7, 7, 7
end
shared_context "shorter, younger onto longer, older" do
let(:ts1) { TimeSeries.new(short, younger) }
let(:ts2) { TimeSeries.new(long, older) }
let(:expected_merge_result) {[1, 2, 3, 4, 7, 7, 7, 7, 7, 7]}
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
# 9, 8, 7, 6, 5, 4
# 1, 2, 3, 4, 7, 7, 7, 7, 7, 7
end
shared_context "shorter, older onto longer, younger" do
let(:ts1) { TimeSeries.new(short, older) }
let(:ts2) { TimeSeries.new(long, younger) }
let(:expected_merge_result) {[9, 8, 7, 6, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10]}
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
# 9, 8, 7, 6, 5, 4
# 9, 8, 7, 6, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10
end
shared_context "longer, younger onto shorter, older" do
let(:ts1) { TimeSeries.new(long, younger) }
let(:ts2) { TimeSeries.new(short, older) }
let(:expected_merge_result) {[9, 8, 7, 6, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10]}
# 9, 8, 7, 6, 5, 4
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
# 9, 8, 7, 6, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10
end
shared_context "longer, older onto shorter, younger" do
let(:ts1) { TimeSeries.new(long, older) }
let(:ts2) { TimeSeries.new(short, younger) }
let(:expected_merge_result) {[1, 2, 3, 4, 7, 7, 7, 7, 7, 7]}
# 9, 8, 7, 6, 5, 4
# 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
# 1, 2, 3, 4, 7, 7, 7, 7, 7, 7
end
shared_context "non-overlapping" do
let(:ts1) { TimeSeries.new([1, 2, 3, 4], Date.new(2013, 3, 1)) }
let(:ts2) { TimeSeries.new([1, 2, 3, 4], Date.new(2013, 3, 6)) }
let(:expected_merge_result) {[1,2,3,4,nil,1,2,3,4]}
let(:expected_merge_date) { Date.new(2013, 3, 1) }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment