zverok/good_data_frame.rb

## good_data_frame.rb
# This is a show-the-point demo for my article
# "On DataFrame datatype in Ruby": http://zverok.github.io/blog/2016-01-10-dataframe.html

require 'good_data_frame' # `require': cannot load such file -- good_data_frame (LoadError)

# Initialization (with default index): a hash of column name => values.
# Values of each column are homogeneous.
# The first and most important thing is "what the columns are".
table = GDF.new(
  manager: ['Tom', 'Jerry', 'Magda'],
  salary:  [1000.0, 800.0,  960.0],
  sales:   [20_000, 6_000, 15_000]
)
# => prints in IRB:
#   | manager | salary | sales    |
# --+---------+--------+----------+
# 0 | Tom     | 1000.0 | 20'000.0 |
# 1 | Jerry   |  800.0 |  6'000.0 |
# 2 | Magda   |  960.0 | 15'000.0 |
# ...and if there are 5+ rows, it just prints "...", and then the last 2-3 rows

# Initialization: with an index other than 0,1,2...
require 'time' # Time.parse comes from the stdlib 'time' library, not from core Time
table2 = GDF.new(
  salary:  [1000.0, 800.0,  960.0],
  sales:   [20_000, 6_000, 15_000],

  # from "human" point-of-view, index is just a special column, isn't it?
  index: [Time.parse('2015-01-01'), Time.parse('2015-02-01'), Time.parse('2015-03-01')]
)

# DataFrame like your olde good Array/Enumerable
table.count # => 3 (rows)

table.first(2)
# =>
#   | manager | salary | sales    |
# --+---------+--------+----------+
# 0 | Tom     | 1000.0 | 20'000.0 |
# 1 | Jerry   |  800.0 |  6'000.0 |

table.last(1)
# =>
#   | manager | salary | sales    |
# --+---------+--------+----------+
# 2 | Magda   |  960.0 | 15'000.0 |

table.each do |row|
  # GDF::Row, unlike a column, is a virtual object that exists for iteration only.
  # It is more like Struct/Hash/Hashie, with fields corresponding to columns.
  p [row.class, row.index, row.manager]
end

# #sort, #select and #reject also work as usual, but return an instance of
# GoodDataFrame
table.reject { |row| row.salary < 900 }
#   | manager | salary | sales    |
# --+---------+--------+----------+
# 0 | Tom     | 1000.0 | 20'000.0 |
# 2 | Magda   |  960.0 | 15'000.0 |

# Adding data: performs check for column count and data type for each column
table << ['Jim', 950.0, 18_000]
#   | manager | salary | sales    |
# --+---------+--------+----------+
# 0 | Tom     | 1000.0 | 20'000.0 |
# 1 | Jerry   |  800.0 |  6'000.0 |
# 2 | Magda   |  960.0 | 15'000.0 |
# 3 | Jim     |  950.0 | 18'000.0 |

# Working with columns: column IS an object
table.columns # => GDF::Columns proxy array

table.columns.push(bonus: [300.0, 120.0, 230.0])
# =>
#   | manager | salary | sales    | bonus |
# --+---------+--------+----------+-------+
# 0 | Tom     | 1000.0 | 20'000.0 | 300.0 |
# 1 | Jerry   |  800.0 |  6'000.0 | 120.0 |
# 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

# or even:
table.columns[:bonus] = [300.0, 120.0, 230.0]

table.columns.delete(:salary)
# =>
#   | manager | sales    | bonus |
# --+---------+----------+-------+
# ....

table.columns.reorder(:bonus, :salary)
# =>
#   | manager | bonus | salary |
# --+---------+-------+--------+
# ....

bonus = table.columns[:bonus]
# =>
#     0 |     1 |     2
# 300.0 | 120.0 | 230.0
#
# It's GDF::Column type, Array-ish, yet not necessarily indexed 0..N, and
# with additional operations, making it more math-y:

bonus * 2
# =>
#     0 |     1 |     2
# 600.0 | 240.0 | 460.0

table.columns[:bonus] / table.columns[:salary]
# =>
#    0 |    1 |    2
# 0.30 | 0.15 | 0.24

# "Views":
table.view(0..1, :salary)
#   | salary |
# --+--------+
# 0 | 1000.0 |
# 1 |  800.0 |

# A method call with arguments is not assignable in Ruby
# (`table.view(...) = 950.0` does not parse), so assign through #values=.
table.view(0..1, :salary).values = 950.0 # equality forever!
table
# =>
#   | manager | salary | sales    | bonus |
# --+---------+--------+----------+-------+
# 0 | Tom     |  950.0 | 20'000.0 | 300.0 |
# 1 | Jerry   |  950.0 |  6'000.0 | 120.0 |
# 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

table = GDF.new(
  manager: ['Tom', 'Jerry', 'Magda'],
  salary:  [1000.0, nil,  960.0],
  sales:   [20_000, 6_000, nil]
)
table.view(&:nil?)
#   | salary | sales |
# --+--------+-------+
# 1 |    nil |       |
# 2 |        |   nil |

table.view(&:nil?).values = 0.0

table.view(nil, :salary){|s| s < 1000}.values += 100
# or?
table.select{|r| r.salary < 1000}.columns[:salary].values += 100
# neither is cool :(
table.view{|val, r, c| c == :salary && val < 1000}.values += 100
# ↑ ok, that's cute!

# Composite indexes
table = GoodDataFrame.new(
  age:    [30, 23, 38, 40, 22, 28],
  salary: [1000.0, 980.0, 860.0, 950.0, 720.0, 1050.0],

  # let's try the simplest thing possible
  index: {
    department_A: [:John, :Jeff, :Maggy],
    department_B: [:Jake, :Mary, :Andrew]
  }
)
#                       | age | salary |
# -------------+--------+-----+--------+
# department_A | John   |  30 | 1000.0 |
#              | Jeff   |  23 |  980.0 |
#              | Maggy  |  38 |  860.0 |
# -------------+--------+-----+--------+
# department_B | Jake   |  40 |  950.0 |
#              | Mary   |  22 |  720.0 |
#              | Andrew |  28 | 1050.0 |

table[:department_A]
#                       | age | salary |
# -------------+--------+-----+--------+
# department_A | John   |  30 | 1000.0 |
#              | Jeff   |  23 |  980.0 |
#              | Maggy  |  38 |  860.0 |

table[:department_A, :John]
#                       | age | salary |
# -------------+--------+-----+--------+
# department_A | John   |  30 | 1000.0 |


# Enough for most of cool stats and pivots and stuff
	# This is a show-the-point demo for my article
	# "On DataFrame datatype in Ruby": http://zverok.github.io/blog/2016-01-10-dataframe.html

	require 'good_data_frame' # `require': cannot load such file -- good_data_frame (LoadError)

	# Initialization (with default index): a hash of column name => values.
	# Values of each column are homogeneous.
	# The first and most important thing is "what the columns are".
	table = GDF.new(
	manager: ['Tom', 'Jerry', 'Magda'],
	salary: [1000.0, 800.0, 960.0],
	sales: [20_000, 6_000, 15_000]
	)
	# => prints in IRB:
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |
	# ...and if there are 5+ rows, it just prints "...", and then the last 2-3 rows

	# Initialization: with an index other than 0,1,2...
	require 'time' # Time.parse comes from the stdlib 'time' library, not from core Time
	table2 = GDF.new(
	  salary:  [1000.0, 800.0,  960.0],
	  sales:   [20_000, 6_000, 15_000],

	  # from "human" point-of-view, index is just a special column, isn't it?
	  index: [Time.parse('2015-01-01'), Time.parse('2015-02-01'), Time.parse('2015-03-01')]
	)

	# DataFrame like your olde good Array/Enumerable
	table.count # => 3 (rows)

	table.first(2)
	# =>
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |

	table.last(1)
	# =>
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 2 | Magda   |  960.0 | 15'000.0 |

	# Row-wise iteration: the block receives one row object at a time
	table.each do |row|
	  p [row.class,  # GDF::Row, unlike column, is a virtual object for iteration only
	    row.index,   # It is more like Struct/Hash/Hashie,
	    row.manager] # with fields corresponding to columns
	end

	# Also #sort, #select, #reject work as usual, but return an instance of
	# GoodDataFrame
	table.reject do |row|
	  row.salary < 900
	end
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |

	# Adding data: performs a check of column count and data type for each column
	table << ['Jim', 950.0, 18_000]
	#   | manager | salary | sales    |
	# --+---------+--------+----------+
	# 0 | Tom     | 1000.0 | 20'000.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 |
	# 2 | Magda   |  960.0 | 15'000.0 |
	# 3 | Jim     |  950.0 | 18'000.0 |

	# Working with columns: column IS an object
	table.columns # => GDF::Columns proxy array

	table.columns.push(bonus: [300.0, 120.0, 230.0])
	# =>
	#   | manager | salary | sales    | bonus |
	# --+---------+--------+----------+-------+
	# 0 | Tom     | 1000.0 | 20'000.0 | 300.0 |
	# 1 | Jerry   |  800.0 |  6'000.0 | 120.0 |
	# 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

	# or even:
	table.columns[:bonus] = [300.0, 120.0, 230.0]

	table.columns.delete(:salary)
	# =>
	#   | manager | sales    | bonus |
	# --+---------+----------+-------+
	# ....

	table.columns.reorder(:bonus, :salary)
	# =>
	#   | manager | bonus | salary |
	# --+---------+-------+--------+
	# ....

	bonus = table.columns[:bonus]
	# =>
	#     0 |     1 |     2
	# 300.0 | 120.0 | 230.0
	#
	# It's GDF::Column type, Array-ish, yet not necessarily indexed 0..N, and
	# with additional operations, making it more math-y:

	bonus * 2
	# =>
	#     0 |     1 |     2
	# 600.0 | 240.0 | 460.0

	table.columns[:bonus] / table.columns[:salary]
	# =>
	#    0 |    1 |    2
	# 0.30 | 0.15 | 0.24

	# "Views":
	table.view(0..1, :salary)
	#   | salary |
	# --+--------+
	# 0 | 1000.0 |
	# 1 |  800.0 |

	# A method call with arguments is not assignable in Ruby
	# (`table.view(...) = 950.0` does not parse), so assign through #values=.
	table.view(0..1, :salary).values = 950.0 # equality forever!
	table
	# =>
	#   | manager | salary | sales    | bonus |
	# --+---------+--------+----------+-------+
	# 0 | Tom     |  950.0 | 20'000.0 | 300.0 |
	# 1 | Jerry   |  950.0 |  6'000.0 | 120.0 |
	# 2 | Magda   |  960.0 | 15'000.0 | 230.0 |

	# Views selected by a block instead of by row/column coordinates
	table = GDF.new(
	  manager: ['Tom', 'Jerry', 'Magda'],
	  salary:  [1000.0, nil,  960.0],
	  sales:   [20_000, 6_000, nil]
	)
	table.view(&:nil?)
	#   | salary | sales |
	# --+--------+-------+
	# 1 |    nil |       |
	# 2 |        |   nil |

	table.view(&:nil?).values = 0.0

	table.view(nil, :salary){|s| s < 1000}.values += 100
	# or?
	table.select{|r| r.salary < 1000}.columns[:salary].values += 100
	# neither is cool :(
	table.view{|val, r, c| c == :salary && val < 1000}.values += 100
	# ↑ ok, that's cute!

	# Composite indexes
	table = GoodDataFrame.new(
	age: [30, 23, 38, 40, 22, 28],
	salary: [1000.0, 980.0, 860.0, 950.0, 720.0, 1050.0],

	# let's try the simplest thing possible
	index: {
	department_A: [:John, :Jeff, :Maggy],
	department_B: [:Jake, :Mary, :Andrew]
	}
	)
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |
	#              | Jeff   |  23 |  980.0 |
	#              | Maggy  |  38 |  860.0 |
	# -------------+--------+-----+--------+
	# department_B | Jake   |  40 |  950.0 |
	#              | Mary   |  22 |  720.0 |
	#              | Andrew |  28 | 1050.0 |

	table[:department_A]
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |
	#              | Jeff   |  23 |  980.0 |
	#              | Maggy  |  38 |  860.0 |

	table[:department_A, :John]
	#                       | age | salary |
	# -------------+--------+-----+--------+
	# department_A | John   |  30 | 1000.0 |


	# Enough for most of cool stats and pivots and stuff