Last active
December 31, 2015 01:49
-
-
Save jhamon/7916485 to your computer and use it in GitHub Desktop.
#merge! will merge together time series data in O(m) time where m is the length of the shorter time series data set. Run specs with `rspec time_series_spec.rb --color --format doc`. See also `ruby time_series_benchmarks.rb` to see how #merge! performs on a very large receiver array. The non-bang #merge is always slow and has poor memory performa…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'date' | |
class TimeSeries | |
attr_reader :data, :date | |
def initialize(data, date) | |
raise ArgumentError unless data.is_a?(Array) && date.is_a?(Date) | |
raise ArgumentError, "TimeSeries cannot be empty" if data.empty? | |
@data = data | |
@date = date | |
end | |
def starts_before?(other_ts) | |
raise ArgumentError unless other_ts.is_a?(TimeSeries) | |
@date < other_ts.date | |
end | |
def dup | |
TimeSeries.new(@data.dup, @date.dup) | |
end | |
def merge(other_ts) | |
return self.dup if other_ts == self | |
raise ArgumentError unless other_ts.is_a?(TimeSeries) | |
smallest, largest = [self, other_ts].sort_by { |x| x.data.length } | |
dupped_large = largest.dup | |
dupped_large.merge!(smallest) | |
end | |
def merge!(other_ts) | |
return self if other_ts == self | |
raise ArgumentError unless other_ts.is_a?(TimeSeries) | |
if @data.size <= other_ts.data.size | |
@data.replace merge_self_onto_other(other_ts) | |
else | |
merge_other_onto_self!(other_ts) | |
end | |
@date = other_ts.date.dup unless starts_before?(other_ts) | |
self | |
end | |
private | |
def merge_other_onto_self!(other_ts) | |
time_delta = (@date - other_ts.date).to_i.abs | |
if other_ts.starts_before?(self) | |
base = pad_front_with_nils!(@data, time_delta) | |
time_delta = 0 | |
else | |
base = @data | |
end | |
merge_onto_base_w_delta!(base, other_ts.data, time_delta) | |
end | |
def merge_self_onto_other(other_ts) | |
# This non-bang method is called becausethe receiver should never | |
# mutate its arguments. | |
time_delta = (@date - other_ts.date).to_i.abs | |
if starts_before?(other_ts) | |
base = pad_front_with_nils!(other_ts.data.dup, time_delta) | |
time_delta = 0 | |
else | |
base = other_ts.data.dup | |
end | |
merge_onto_base_w_delta!(base, @data, time_delta) | |
end | |
def merge_onto_base_w_delta!(base_data, other_data, time_delta=0) | |
other_data.each_index do |i| | |
base_data[time_delta+i] = average(other_data[i], base_data[time_delta+i]) | |
end | |
base_data | |
end | |
def average(a, b) | |
return a if b.nil? | |
return b if a.nil? | |
(a+b).fdiv(2) | |
end | |
def pad_front_with_nils!(arr, num_of_nils) | |
arr.unshift(*[nil]*num_of_nils) | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "benchmark" | |
require "./time_series.rb" | |
require "date" | |
today = Date.new(2013, 12, 11) | |
past_date = Date.new(2012, 1, 1) | |
future_date = Date.new(2016, 1, 1) | |
big_data = Array.new(10_000_000) { rand (1000) } | |
small_data = [1,2,3,4,5,6] | |
small_old = TimeSeries.new(small_data, past_date) | |
small_today = TimeSeries.new(small_data, today) | |
small_young = TimeSeries.new(small_data, future_date) | |
big_today = TimeSeries.new(big_data, today) | |
n = 100000 | |
Benchmark.bmbm do |x| | |
x.report("#{n} runs: merge! small array into 10 million element array - temporally aligned") do | |
# Should be linear in length of small array | |
n.times do | |
big_today.merge!(small_today) | |
end | |
end | |
x.report("#{n} runs: merge! small array into 10 million element array - small data younger") do | |
# Should be linear in length of small array | |
n.times do | |
big_today.merge!(small_young) | |
end | |
end | |
x.report("#{n} runs: merge! small into 10 million element array - small data older") do | |
# Should be linear in length of small array | |
n.times do | |
big_today.merge!(small_old) | |
end | |
end | |
x.report("#{n} runs: merge! huge array into itself") do | |
# Should be constant time, since self merges just return self | |
n.times do | |
big_today.merge!(big_today) | |
end | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rspec' | |
require 'date' | |
require './time_series.rb' | |
require './time_series_spec_shared_stuff.rb' | |
describe "TimeSeries" do | |
let(:older) { Date.new(2013, 1, 1) } | |
let(:younger) { Date.new(2013, 1, 5) } | |
let(:long) { [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] } | |
let(:short) { [9, 8, 7, 6, 5, 4] } | |
let(:expected_merge_date) { older } | |
subject(:time_series) { TimeSeries.new([1,2,3,4], younger) } | |
it "should have a data array" do | |
expect(time_series).to respond_to(:data) | |
expect(time_series.data).to be_a(Array) | |
end | |
it "cannot have an empty data field" do | |
expect { TimeSeries.new([], older)}.to raise_error(ArgumentError) | |
end | |
it "should have a date" do | |
expect(time_series).to respond_to(:date) | |
expect(time_series.date).to be_a(Date) | |
end | |
describe "#starts_before?" do | |
let(:older_time_series) { TimeSeries.new(long, older)} | |
it "returns true if start date is earlier than other time series" do | |
expect(time_series.starts_before?(older_time_series)).to be_false | |
end | |
it "returns false if start date is compared with an older time series" do | |
expect(older_time_series.starts_before?(time_series)).to be_true | |
end | |
it "raises an error unless given another TimeSeries object" do | |
expect {time_series.starts_before?([1,2,3])}.to raise_error(ArgumentError) | |
end | |
end | |
describe "#dup" do | |
it "returns a copy with new object_id" do | |
expect(time_series.dup).not_to eq(time_series) | |
end | |
it "should also dup data and date attributes" do | |
dupped_series = time_series.dup | |
expect(dupped_series.date.object_id).not_to eq(time_series.date.object_id) | |
expect(dupped_series.data.object_id).not_to eq(time_series.data.object_id) | |
end | |
end | |
describe "#merge" do | |
let(:merge_method) { :merge } | |
context "when merging a TimeSeries with itself" do | |
include_context "self-merge" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
context "when merging two non-overlapping series" do | |
include_context "non-overlapping" | |
it "uses nil to represent missing data" do | |
expect(ts1.merge(ts2).data).to include(nil) | |
end | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
context "when merging a shorter, younger series onto a longer, older one" do | |
include_context "shorter, younger onto longer, older" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
context "when merging a shorter, older series onto a longer, younger one" do | |
include_context "shorter, older onto longer, younger" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
context "when merging a longer, younger series onto a shorter, older one" do | |
include_context "longer, younger onto shorter, older" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
context "when merging a longer, older series onto a shorter, younger one" do | |
include_context "longer, older onto shorter, younger" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a non-bang method" | |
end | |
end | |
describe "#merge!" do | |
let(:merge_method) { :merge! } | |
context "when merging a TimeSeries with itself" do | |
include_context "self-merge" | |
it "should return itself" do | |
expect(ts1.merge!(ts1)).to be(ts1) | |
end | |
it "should maintain a reference to the same data array" do | |
expect(ts1.data).to be(ts1.merge!(ts1).data) | |
end | |
it "should maintain a reference to the same date object" do | |
expect(ts1.date).to be(ts1.merge!(ts1).date) | |
end | |
end | |
context "when merging two non-overlapping series" do | |
include_context "non-overlapping" | |
it "uses nil to represent missing data" do | |
expect(ts1.merge!(ts2).data).to include(nil) | |
end | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a dangerous (bang) method" | |
end | |
context "when merging a shorter, younger series onto a longer, older one" do | |
include_context "shorter, younger onto longer, older" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a dangerous (bang) method" | |
end | |
context "when merging a shorter, older series onto a longer, younger one" do | |
include_context "shorter, older onto longer, younger" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a dangerous (bang) method" | |
end | |
context "when merging a longer, younger series onto a shorter, older one" do | |
include_context "longer, younger onto shorter, older" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a dangerous (bang) method" | |
end | |
context "when merging a longer, older series onto a shorter, younger one" do | |
include_context "longer, older onto shorter, younger" | |
it_behaves_like "a proper merge function" | |
it_behaves_like "a dangerous (bang) method" | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment