Skip to content

Instantly share code, notes, and snippets.

@ephekt
Created May 21, 2013 16:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ephekt/5621337 to your computer and use it in GitHub Desktop.
Save ephekt/5621337 to your computer and use it in GitHub Desktop.
# encoding: utf-8
require 'tmpdir'
require 'stringio'
require 'zip/zipfilesystem'
# Base class for all other types of spreadsheets
class Roo::GenericSpreadsheet
include Enumerable
TEMP_PREFIX = "oo_"
attr_reader :default_sheet, :headers
# sets the line with attribute names (default: 1)
attr_accessor :header_line
protected
def self.split_coordinate(str)
letter,number = Roo::GenericSpreadsheet.split_coord(str)
x = letter_to_number(letter)
y = number
return y, x
end
def self.split_coord(s)
if s =~ /([a-zA-Z]+)([0-9]+)/
letter = $1
number = $2.to_i
else
raise ArgumentError
end
return letter, number
end
public
def initialize(filename, packed=nil, file_warning=:error, tmpdir=nil)
@cell = Hash.new{|h,k| h[k] = {}}
@cell_type = Hash.new{|h,k| h[k] = {}}
@cells_read = {}
@first_row = {}
@last_row = {}
@first_column = {}
@last_column = {}
@style = {}
@style_defaults = Hash.new { |h,k| h[k] = [] }
@style_definitions = {}
@default_sheet = self.sheets.first
@formula = {}
@header_line = 1
end
# sets the working sheet in the document
# 'sheet' can be a number (1 = first sheet) or the name of a sheet.
def default_sheet=(sheet)
validate_sheet!(sheet)
@default_sheet = sheet
@first_row[sheet] = @last_row[sheet] = @first_column[sheet] = @last_column[sheet] = nil
@cells_read[sheet] = false
end
# first non-empty column as a letter
def first_column_as_letter(sheet=nil)
Roo::GenericSpreadsheet.number_to_letter(first_column(sheet))
end
# last non-empty column as a letter
def last_column_as_letter(sheet=nil)
Roo::GenericSpreadsheet.number_to_letter(last_column(sheet))
end
# returns the number of the first non-empty row
def first_row(sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
if @first_row[sheet]
return @first_row[sheet]
end
impossible_value = 999_999 # more than a spreadsheet can hold
result = impossible_value
@cell[sheet].each_pair {|key,value|
y = key.first.to_i # _to_string(key).split(',')
result = [result, y].min if value
} if @cell[sheet]
result = nil if result == impossible_value
@first_row[sheet] = result
result
end
# returns the number of the last non-empty row
def last_row(sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
if @last_row[sheet]
return @last_row[sheet]
end
impossible_value = 0
result = impossible_value
@cell[sheet].each_pair {|key,value|
y = key.first.to_i # _to_string(key).split(',')
result = [result, y].max if value
} if @cell[sheet]
result = nil if result == impossible_value
@last_row[sheet] = result
result
end
# returns the number of the first non-empty column
def first_column(sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
if @first_column[sheet]
return @first_column[sheet]
end
impossible_value = 999_999 # more than a spreadsheet can hold
result = impossible_value
@cell[sheet].each_pair {|key,value|
x = key.last.to_i # _to_string(key).split(',')
result = [result, x].min if value
} if @cell[sheet]
result = nil if result == impossible_value
@first_column[sheet] = result
result
end
# returns the number of the last non-empty column
def last_column(sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
if @last_column[sheet]
return @last_column[sheet]
end
impossible_value = 0
result = impossible_value
@cell[sheet].each_pair {|key,value|
x = key.last.to_i # _to_string(key).split(',')
result = [result, x].max if value
} if @cell[sheet]
result = nil if result == impossible_value
@last_column[sheet] = result
result
end
# returns a rectangular area (default: all cells) as yaml-output
# you can add additional attributes with the prefix parameter like:
# oo.to_yaml({"file"=>"flightdata_2007-06-26", "sheet" => "1"})
def to_yaml(prefix={}, from_row=nil, from_column=nil, to_row=nil, to_column=nil,sheet=nil)
sheet ||= @default_sheet
result = "--- \n"
return '' unless first_row # empty result if there is no first_row in a sheet
(from_row||first_row(sheet)).upto(to_row||last_row(sheet)) do |row|
(from_column||first_column(sheet)).upto(to_column||last_column(sheet)) do |col|
unless empty?(row,col,sheet)
result << "cell_#{row}_#{col}: \n"
prefix.each {|k,v|
result << " #{k}: #{v} \n"
}
result << " row: #{row} \n"
result << " col: #{col} \n"
result << " celltype: #{self.celltype(row,col,sheet)} \n"
if self.celltype(row,col,sheet) == :time
result << " value: #{Roo::GenericSpreadsheet.integer_to_timestring( self.cell(row,col,sheet))} \n"
else
result << " value: #{self.cell(row,col,sheet)} \n"
end
end
end
end
result
end
# write the current spreadsheet to stdout or into a file
def to_csv(filename=nil,sheet=nil)
sheet ||= @default_sheet
if filename
File.open(filename,"w") do |file|
write_csv_content(file,sheet)
end
return true
else
sio = StringIO.new
write_csv_content(sio,sheet)
sio.rewind
return sio.read
end
end
# returns a matrix object from the whole sheet or a rectangular area of a sheet
def to_matrix(from_row=nil, from_column=nil, to_row=nil, to_column=nil,sheet=nil)
require 'matrix'
sheet ||= @default_sheet
return Matrix.empty unless first_row
Matrix.rows((from_row||first_row(sheet)).upto(to_row||last_row(sheet)).map do |row|
(from_column||first_column(sheet)).upto(to_column||last_column(sheet)).map do |col|
cell(row,col)
end
end)
end
# find a row either by row number or a condition
# Caution: this works only within the default sheet -> set default_sheet before you call this method
# (experimental. see examples in the test_roo.rb file)
def find(*args) # :nodoc
options = (args.last.is_a?(Hash) ? args.pop : {})
result_array = options[:array]
header_for = Hash[1.upto(last_column).map do |col|
[col, cell(@header_line,col)]
end]
#-- id
if args[0].class == Fixnum
rownum = args[0]
if @header_line
[Hash[1.upto(self.row().size).map {|j|
[header_for.fetch(j), cell(rownum,j)]
}]]
else
self.row(rownum).size.times.map {|j|
cell(rownum,j + 1)
}
end
#-- :all
elsif args[0] == :all
rows = first_row.upto(last_row)
# are all conditions met?
if (conditions = options[:conditions]) && !conditions.empty?
column_with = header_for.invert
rows = rows.select do |i|
conditions.all? { |key,val| cell(i,column_with[key]) == val }
end
end
rows.map do |i|
if result_array
self.row(i)
else
Hash[1.upto(self.row(i).size).map do |j|
[header_for.fetch(j), cell(i,j)]
end]
end
end
end
end
# returns all values in this row as an array
# row numbers are 1,2,3,... like in the spreadsheet
def row(rownumber,sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
first_column(sheet).upto(last_column(sheet)).map do |col|
cell(rownumber,col,sheet)
end
end
# returns all values in this column as an array
# column numbers are 1,2,3,... like in the spreadsheet
def column(columnnumber,sheet=nil)
if columnnumber.class == String
columnnumber = Roo::Excel.letter_to_number(columnnumber)
end
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
first_row(sheet).upto(last_row(sheet)).map do |row|
cell(row,columnnumber,sheet)
end
end
# set a cell to a certain value
# (this will not be saved back to the spreadsheet file!)
def set(row,col,value,sheet=nil) #:nodoc:
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
row, col = normalize(row,col)
cell_type = case value
when Fixnum then :float
when String, Float then :string
else
raise ArgumentError, "Type for #{value} not set"
end
set_value(row,col,value,sheet)
set_type(row,col,cell_type,sheet)
end
# reopens and read a spreadsheet document
def reload
# von Abfrage der Klasse direkt auf .to_s == '..' umgestellt
ds = @default_sheet
if self.class.to_s == 'Google'
initialize(@spreadsheetkey,@user,@password)
else
initialize(@filename)
end
self.default_sheet = ds
#@first_row = @last_row = @first_column = @last_column = nil
end
# true if cell is empty
def empty?(row, col, sheet=nil)
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet] or self.class == Roo::Excel
row,col = normalize(row,col)
contents = cell(row, col, sheet)
!contents || (celltype(row, col, sheet) == :string && contents.empty?) \
|| (row < first_row(sheet) || row > last_row(sheet) || col < first_column(sheet) || col > last_column(sheet))
end
# returns information of the spreadsheet document and all sheets within
# this document.
def info
result = "File: #{File.basename(@filename)}\n"+
"Number of sheets: #{sheets.size}\n"+
"Sheets: #{sheets.join(', ')}\n"
n = 1
sheets.each {|sheet|
self.default_sheet = sheet
result << "Sheet " + n.to_s + ":\n"
unless first_row
result << " - empty -"
else
result << " First row: #{first_row}\n"
result << " Last row: #{last_row}\n"
result << " First column: #{Roo::GenericSpreadsheet.number_to_letter(first_column)}\n"
result << " Last column: #{Roo::GenericSpreadsheet.number_to_letter(last_column)}"
end
result << "\n" if sheet != sheets.last
n += 1
}
result
end
# returns an XML representation of all sheets of a spreadsheet file
def to_xml
Nokogiri::XML::Builder.new do |xml|
xml.spreadsheet {
self.sheets.each do |sheet|
self.default_sheet = sheet
xml.sheet(:name => sheet) { |x|
if first_row and last_row and first_column and last_column
# sonst gibt es Fehler bei leeren Blaettern
first_row.upto(last_row) do |row|
first_column.upto(last_column) do |col|
unless empty?(row,col)
x.cell(cell(row,col),
:row =>row,
:column => col,
:type => celltype(row,col))
end
end
end
end
}
end
}
end.to_xml
end
# when a method like spreadsheet.a42 is called
# convert it to a call of spreadsheet.cell('a',42)
def method_missing(m, *args)
# #aa42 => #cell('aa',42)
# #aa42('Sheet1') => #cell('aa',42,'Sheet1')
if m =~ /^([a-z]+)(\d)$/
col = Roo::GenericSpreadsheet.letter_to_number($1)
row = $2.to_i
if args.empty?
cell(row,col)
else
cell(row,col,args.first)
end
else
super
end
end
=begin
#TODO: hier entfernen
# returns each formula in the selected sheet as an array of elements
# [row, col, formula]
def formulas(sheet=nil)
theformulas = Array.new
sheet ||= @default_sheet
read_cells(sheet) unless @cells_read[sheet]
return theformulas unless first_row(sheet) # if there is no first row then
# there can't be formulas
first_row(sheet).upto(last_row(sheet)) {|row|
first_column(sheet).upto(last_column(sheet)) {|col|
if formula?(row,col,sheet)
theformulas << [row, col, formula(row,col,sheet)]
end
}
}
theformulas
end
=end
# FestivalBobcats fork changes begin here
# access different worksheets by calling spreadsheet.sheet(1)
# or spreadsheet.sheet('SHEETNAME')
def sheet(index,name=false)
@default_sheet = String === index ? index : self.sheets[index]
name ? [@default_sheet,self] : self
end
# iterate through all worksheets of a document
def each_with_pagename
self.sheets.each do |s|
yield sheet(s,true)
end
end
# by passing in headers as options, this method returns
# specific columns from your header assignment
# for example:
# xls.sheet('New Prices').parse(:upc => 'UPC', :price => 'Price') would return:
# [{:upc => 123456789012, :price => 35.42},..]
# the queries are matched with regex, so regex options can be passed in
# such as :price => '^(Cost|Price)'
# case insensitive by default
# by using the :header_search option, you can query for headers
# and return a hash of every row with the keys set to the header result
# for example:
# xls.sheet('New Prices').parse(:header_search => ['UPC*SKU','^Price*\sCost\s'])
# that example searches for a column titled either UPC or SKU and another
# column titled either Price or Cost (regex characters allowed)
# * is the wildcard character
# you can also pass in a :clean => true option to strip the sheet of
# odd unicode characters and white spaces around columns
def each(options={})
if options.empty?
1.upto(last_row) do |line|
yield row(line)
end
else
if options[:clean]
options.delete(:clean)
@cleaned ||= {}
@cleaned[@default_sheet] || clean_sheet(@default_sheet)
end
if options[:header_search]
@headers = nil
@header_line = row_with(options[:header_search])
elsif [:first_row,true].include?(options[:headers])
@headers = []
row(first_row).each_with_index {|x,i| @headers << [x,i + 1]}
else
set_headers(options)
end
headers = @headers ||
Hash[(first_column..last_column).map do |col|
[cell(@header_line,col), col]
end]
@header_line.upto(last_row) do |line|
yield(Hash[headers.map {|k,v| [k,cell(line,v)]}])
end
end
end
def parse(options={})
ary = []
if block_given?
each(options) {|row| ary << yield(row)}
else
each(options) {|row| ary << row}
end
ary
end
def row_with(query,return_headers=false)
query.map! {|x| Array(x.split('*'))}
line_no = 0
each do |row|
line_no += 1
# makes sure headers is the first part of wildcard search for priority
# ex. if UPC and SKU exist for UPC*SKU search, UPC takes the cake
headers = query.map do |q|
q.map {|i| row.grep(/#{i}/i)[0]}.compact[0]
end.compact
if headers.length == query.length
@header_line = line_no
return return_headers ? headers : line_no
elsif line_no > 100
raise "Couldn't find header row."
end
end
end
# this method lets you find the worksheet with the most data
def longest_sheet
sheet(@workbook.worksheets.inject {|m,o|
o.row_count > m.row_count ? o : m
}.name)
end
protected
def file_type_check(filename, ext, name, warning_level, packed=nil)
new_expression = {
'.ods' => 'Roo::Openoffice.new',
'.xls' => 'Roo::Excel.new',
'.xlsx' => 'Roo::Excelx.new',
'.csv' => 'Roo::Csv.new',
'.xml' => 'Roo::Excel2003XML.new',
}
if packed == :zip
# lalala.ods.zip => lalala.ods
# hier wird KEIN unzip gemacht, sondern nur der Name der Datei
# getestet, falls es eine gepackte Datei ist.
filename = File.basename(filename,File.extname(filename))
end
case ext
when '.ods', '.xls', '.xlsx', '.csv', '.xml'
correct_class = "use #{new_expression[ext]} to handle #{ext} spreadsheet files. This has #{File.extname(filename).downcase}"
else
raise "unknown file type: #{ext}"
end
if uri?(filename) && qs_begin = filename.rindex('?')
filename = filename[0..qs_begin-1]
end
if File.extname(filename).downcase != ext
case warning_level
when :error
warn correct_class
raise TypeError, "#{filename} is not #{name} file"
when :warning
warn "are you sure, this is #{name} spreadsheet file?"
warn correct_class
when :ignore
# ignore
else
raise "#{warning_level} illegal state of file_warning"
end
end
end
# konvertiert einen Key in der Form "12,45" (=row,column) in
# ein Array mit numerischen Werten ([12,45])
# Diese Methode ist eine temp. Loesung, um zu erforschen, ob der
# Zugriff mit numerischen Keys schneller ist.
def key_to_num(str)
r,c = str.split(',')
[r.to_i,c.to_i]
end
# see: key_to_num
def key_to_string(arr)
"#{arr[0]},#{arr[1]}"
end
private
def make_tmpdir(tmp_root = nil)
Dir.mktmpdir(TEMP_PREFIX, tmp_root || ENV['ROO_TMP']) do |tmpdir|
yield tmpdir
end
end
def clean_sheet(sheet)
read_cells(sheet) unless @cells_read[sheet]
@cell[sheet].each_pair do |coord,value|
if String === value
@cell[sheet][coord] = sanitize_value(value)
end
end
@cleaned[sheet] = true
end
def sanitize_value(v)
v.strip.unpack('U*').select {|b| b < 127}.pack('U*')
end
def set_headers(hash={})
# try to find header row with all values or give an error
# then create new hash by indexing strings and keeping integers for header array
@headers = row_with(hash.values,true)
@headers = Hash[hash.keys.zip(@headers.map {|x| header_index(x)})]
end
def header_index(query)
row(@header_line).index(query) + first_column
end
def set_value(row,col,value,sheet=nil)
sheet ||= @default_sheet
@cell[sheet][[row,col]] = value
end
def set_type(row,col,type,sheet=nil)
sheet ||= @default_sheet
@cell_type[sheet][[row,col]] = type
end
# converts cell coordinate to numeric values of row,col
def normalize(row,col)
if row.class == String
if col.class == Fixnum
# ('A',1):
# ('B', 5) -> (5, 2)
row, col = col, row
else
raise ArgumentError
end
end
if col.class == String
col = Roo::GenericSpreadsheet.letter_to_number(col)
end
return row,col
end
def uri?(filename)
filename.start_with?("http://", "https://")
end
def open_from_uri(uri, tmpdir)
require 'open-uri'
response = ''
begin
open(uri, "User-Agent" => "Ruby/#{RUBY_VERSION}") { |net|
response = net.read
tempfilename = File.join(tmpdir, File.basename(uri))
File.open(tempfilename,"wb") do |file|
file.write(response)
end
}
rescue OpenURI::HTTPError
raise "could not open #{uri}"
end
File.join(tmpdir, File.basename(uri))
end
def open_from_stream(stream, tmpdir)
tempfilename = File.join(tmpdir, "spreadsheet")
File.open(tempfilename,"wb") do |file|
file.write(stream[7..-1])
end
File.join(tmpdir, "spreadsheet")
end
LETTERS = %w{A B C D E F G H I J K L M N O P Q R S T U V W X Y Z}
# convert a number to something like 'AB' (1 => 'A', 2 => 'B', ...)
def self.number_to_letter(n)
letters=""
if n > 26
while n % 26 == 0 && n != 0
letters << 'Z'
n = (n - 26) / 26
end
while n > 0
num = n%26
letters = LETTERS[num-1] + letters
n = (n / 26)
end
else
letters = LETTERS[n-1]
end
letters
end
# convert letters like 'AB' to a number ('A' => 1, 'B' => 2, ...)
def self.letter_to_number(letters)
result = 0
while letters && letters.length > 0
character = letters[0,1].upcase
num = LETTERS.index(character)
raise ArgumentError, "invalid column character '#{letters[0,1]}'" if num == nil
num += 1
result = result * 26 + num
letters = letters[1..-1]
end
result
end
def unzip(filename, tmpdir)
Zip::ZipFile.open(filename) do |zip|
process_zipfile_packed(zip, tmpdir)
end
end
# check if default_sheet was set and exists in sheets-array
def validate_sheet!(sheet)
case sheet
when nil
raise ArgumentError, "Error: sheet 'nil' not valid"
when Fixnum
self.sheets.fetch(sheet-1) do
raise RangeError, "sheet index #{sheet} not found"
end
when String
if !sheets.include? sheet
raise RangeError, "sheet '#{sheet}' not found"
end
else
raise TypeError, "not a valid sheet type: #{sheet.inspect}"
end
end
def process_zipfile_packed(zip, tmpdir, path='')
if zip.file.file? path
# extract and return filename
File.open(File.join(tmpdir, path),"wb") do |file|
file.write(zip.read(path))
end
File.join(tmpdir, path)
else
ret=nil
path += '/' unless path.empty?
zip.dir.foreach(path) do |filename|
ret = process_zipfile_packed(zip, tmpdir, path + filename)
end
ret
end
end
# Write all cells to the csv file. File can be a filename or nil. If the this
# parameter is nil the output goes to STDOUT
def write_csv_content(file=nil,sheet=nil)
file ||= STDOUT
if first_row(sheet) # sheet is not empty
1.upto(last_row(sheet)) do |row|
1.upto(last_column(sheet)) do |col|
file.print(",") if col > 1
file.print cell_to_csv(row,col,sheet)
end
file.print("\n")
end # sheet not empty
end
end
# The content of a cell in the csv output
def cell_to_csv(row, col, sheet)
if empty?(row,col,sheet)
''
else
onecell = cell(row,col,sheet)
case celltype(row,col,sheet)
when :string
unless onecell.empty?
%{"#{onecell.gsub(/"/,'""')}"}
end
when :float, :percentage
if onecell == onecell.to_i
onecell.to_i.to_s
else
onecell.to_s
end
when :formula
case onecell
when String
unless onecell.empty?
%{"#{onecell.gsub(/"/,'""')}"}
end
when Float
if onecell == onecell.to_i
onecell.to_i.to_s
else
onecell.to_s
end
when DateTime
onecell.to_s
else
raise "unhandled onecell-class #{onecell.class}"
end
when :date, :datetime
onecell.to_s
when :time
Roo::GenericSpreadsheet.integer_to_timestring(onecell)
else
raise "unhandled celltype #{celltype(row,col,sheet)}"
end || ""
end
end
# converts an integer value to a time string like '02:05:06'
def self.integer_to_timestring(content)
h = (content/3600.0).floor
content = content - h*3600
m = (content/60.0).floor
content = content - m*60
s = content
sprintf("%02d:%02d:%02d",h,m,s)
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment