Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Smart Spreadsheet reading with Ruby
### IMPORTS
 
require 'roo'
require 'pp'
 
### IMPLEMENTATION ###
 
# A Excel spreadsheet reader that can clean up column names and convert data.
#
# Assumptions: The data is read off the first sheet of the workbook. The sheet
# has column headers. Column names are cleaned up, so as to tolerate small
# errors (see "read_headers"). Each row has the same number of entries /
# columns.
#
# This happens Excel95 (xls) and Excel XML (xlsx).
#
# @example How to use this class
#    rdr = XlsReader('my-excel.xls')
#    rdr.read() { |rec| print rec }
#
class ExcelReader
 
   attr_accessor :wbook, :syn_dict, :headers
 
   # Initialise the reader.
   #
   # @param [String] infile   the path to the spreadsheet to be read in.
   # @param [Hash] syn_dict   a remapping of column names, to allow synonyms
   #
   # @example How to create a reader
   #    rdr = XlsReader('my-excel.xls')
   #    rdr = XlsReader('my-excel.xls', {'foo_bar'=> 'foobar'})
   #
   def initialize(infile, file_type)
      # NOTE: roo insists that files end with the right extension
      tmpfile = Tempfile.new(['excel', ".#{file_type}"])
      tmpfile_hndl = tmpfile.open()
      tmpfile_hndl.write (open(infile, 'r').read)
      tmpfile_hndl.close()
 
      if file_type == 'xls'
         @wbook = Excel.new(tmpfile.path)
      elsif file_type == 'xlsx'
         @wbook = Excelx.new(tmpfile.path)
      else
         raise ArgumentError("unrecognised filetype '#{file_type}'")
      end
      # NOTE: in roo, you don't select a worksheet, you name the current one
      @wbook.default_sheet = wbook.sheets.first
      @syn_dict = {}
   end
 
   # Read the sheet, and return each non-header row to the passed block.
   #
   # This first reads the headers and cleans them up (see "read_headers"). When
   # each row is read, the values are stored in a hash with the cleaned up
   # column names as keys. If a method exists called "method_", this is
   # used to convert the value. Spreadsheet seems to convert floats but dates
   # are returned as Excel serial values.
   #
   # @param [Block] blk  An executable block
   #
   def read(&blk)
      ## Preconditions:
      # NOTE: you can't grab a row or just read a line in roo, you have to ask
      # about the bounds and explcitly iter over the cell contents.
      row_start = @wbook.first_row
      row_stop = @wbook.last_row
      col_start = @wbook.first_column
      col_stop = @wbook.last_column
      if row_start != 1
         raise RuntimeError, "data must start on the first row not #{row_start}"
      end
      if col_start != 1
         raise RuntimeError, "data must start in the first column not #{col_start}"
      end
      # Main:
      # grab and parse headers
      @headers = read_headers(col_stop)
      # read each row
      (2..row_stop).each { |i|
         row_data = (1..col_stop).collect { |j| @wbook.cell(i,j) }
         row_zip = @headers.zip(row_data).flatten()
         row_hash = Hash[*row_zip]
         row_hash.each_pair { |k,v|
            meth_name = "convert_#{k}"
            if (self.respond_to?(meth_name))
               row_hash[k] = self.send(meth_name, v)
            else
               row_hash[k] = convert(v)
            end
         }
         blk.call(row_hash)
      }
   end
 
   # Clean up the title of a column to something reasonable
   #
   # If the header is recognised or a synonym, return it in the canonical
   # i.e. a symbol. Otherwise send back the raw form (a string with space
   # stripped from flanks)
   #
   def clean_col_header (hdr)
      # Conversion is:
      # - strip flanking space
      # - convert all internal non-alphanumerics to underscores
      # reduce consequetive underscores to a single one
      # TODO: strip flanking underscores
      clean_str = hdr.downcase.strip.gsub(/\W+/, '_')
      return clean_str.gsub(/_+/, '_')
   end
 
   # Return the canonical set of headers.
   #
   # This makes everything lowercase, strips flankning space, and substitutes
   # underscores for spaces. Override to validate or for further process headers.
   #
   def read_headers (col_stop)
      # collect header row
      headers = (1..col_stop).collect { |j|
         @wbook.cell(1,j)
      }
      # drop case, strip flanking spaces, replace gaps with underscores
      return headers.collect { |h|
         raw_val = h.strip()
         h_str = clean_col_header (raw_val)
         @syn_dict.fetch(h_str, raw_val)
      }
   end
 
   # General conversion of spreadsheet cell values
   #
   def convert(val)
      # clean up strings and return nil for
      if val.class == String
         val.strip!()
         if ["?", "unknown"].member?(val.downcase())
            return nil
         end
      end
      return val
   end
 
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.