thinkerbot/.gitignore

## .gitignore
data
results
config

## Rapfile
require 'tap/task'
require 'tap/tasks/dump/csv'

# Extract::task extract results from Excel file
#
# Inputs an array of result files and extracts identification information
# for each accession number specified in the configurations.  The result
# is a hash of (accession, data) pairs, where the data is an array of
# extracted information; the information at each index in data corresponds
# to the result_file.
#
# Note the index is NOT the index of the result_file, rather it is extracted
# from the name of the result file:
#
#   707_band1_protein_export   => index 1
#   707_band12_protein_export  => index 12
#
# No data is registered when a particular accession was not found in a given
# file.
#
#   % rap extract 707_band1_protein_export.csv --accessions IPI0012767 --: inspect
#
class Extract < Tap::Task
  config :accessions, [], &c.list

  # Builds the results hash for the configured accessions.
  #
  # result_files:: paths of the exported csv files.  Each path must contain
  #                "_band<m>"; <m> is the index the file's data is stored at.
  #
  # Returns a hash of (accession, data) pairs.  data[0] is the header row
  # returned by extract_data; every other index that no file supplied is
  # filled with an array of nils as wide as the header row.
  def process(*result_files)
    results = {}
    accessions.each do |accession|
      results[accession] = Array.new(result_files.length)
    end

    result_files.each do |result_file|
      log :extract, result_file
      extract(result_file, results)
    end

    # add headers and square data; note the same headers array object is
    # stored at index 0 for every accession
    headers = extract_data
    results.each_value do |data|
      data[0] = headers
      data.each_index do |index|
        data[index] ||= Array.new(headers.length)
      end
    end

    results
  end

  # Reads one result file and stores the extracted values of every
  # configured accession it contains.
  #
  # result_file:: path to the csv file; must match /_band(\d+)/
  # results::     hash of (accession, data) pairs, updated in place
  #
  # Raises a RuntimeError when the path carries no band number.
  def extract(result_file, results)
    # determine index of extracted information
    unless result_file =~ /_band(\d+)/
      raise "expected result file path like: <n>_band<m>"
    end
    data_index = $1.to_i

    # parse data for each accession
    #
    # CSV.foreach yields one row per line under both the 1.8 and the 1.9+
    # csv libraries.  CSV.open with a block only yields rows on 1.8; on
    # 1.9+ it yields the CSV object itself.
    parsing = false
    CSV.foreach(result_file) do |row|
      unless parsing
        # Advance to accessions: rows are skipped up to and including the
        # one whose first cell is "Accession"
        parsing = (row[0] == "Accession")
        next
      end

      if accessions.include?(row[0])
        results[row[0]][data_index] = extract_data(row)
      end
    end
  end

  # Returns the values of columns G through O for the row, in that order.
  # Without a row, returns the matching array of column headers.
  def extract_data(row=nil)
    return [
      'Norm (V)',
      'Norm (I)',
      'Norm (IIB)',
      'Raw (V)',
      'Raw (I)',
      'Raw (IIB)',
      'Spec (V)',
      'Spec (I)',
      'Spec (IIB)'
    ] unless row

    %w[G H I J K L M N O].map { |letter| row[col(letter)] }
  end

  # Column letters in order; a letter's position is its 0-based row index.
  ALPHABET = ('A'..'Z').to_a

  # Translates a single column letter ('A'..'Z') into a 0-based row index.
  # Returns nil for anything that is not in ALPHABET.
  def col(letter)
    ALPHABET.index(letter)
  end
end

# Workflow::task runs the extract workflow
#
# Runs the extract task for each set of data in the data directories,
# and then collates the results into csv format.
#
# Inputs:
#
#   data/gel1/.._band1..csv
#            /.._band2..csv
#
#   data/gel2/.._band1..csv
#            /.._band2..csv
#
# Results format:
#
#               gel1            gel2     ...
#   Accession   header  header  header  header
#   1           ...
#   2
#   ...
#
# Command:
#
#   % rap workflow data/* --config config/workflow.yml > results/results.csv
#
class Workflow < Tap::Task
  define :extract, Extract
  define :format, Tap::Tasks::Dump::Csv

  # Runs the extract task on the contents of each data directory and passes
  # the collated table to the format task one row at a time.
  #
  # data_dirs:: directories (one per gel) whose entries are all handed to
  #             the extract task
  #
  # Always returns nil; the rows given to format are the output.
  def process(*data_dirs)
    max_bands = 0
    widest_band = 0

    # gels[i] is the extract result for data_dirs[i]:
    #   accession => [[band], [band], [band], ...]
    gels = data_dirs.map do |dir|
      gel = extract.call(*Dir.glob("#{dir}/*"))

      # track the largest band count and the longest band seen so far;
      # both are needed to square the data for csv
      gel.each_value do |bands|
        max_bands = [max_bands, bands.length].max

        bands.each do |band|
          widest_band = [widest_band, band.length].max
        end
      end

      gel
    end

    # one extra, empty column separates neighbouring datasets
    block_width = widest_band + 1

    # title row: a blank leading cell, then for each directory its basename
    # followed by blank cells filling out the rest of its block
    titles = [nil]
    data_dirs.each do |dir|
      titles << File.basename(dir)
      titles.concat(Array.new(block_width - 1))
    end

    format.call(titles)

    extract.accessions.each do |accession|
      squared = gels.map do |gel|
        bands = gel[accession]

        # pad (in place) until every gel has the same number of bands
        bands << [] until bands.length >= max_bands

        # pad (in place) until every band has the same number of cells
        bands.each do |band|
          band << nil until band.length >= block_width
        end

        bands
      end

      # transposing groups band <index> of every gel into one output row
      squared.transpose.each_with_index do |bands_at_index, index|
        label = index.zero? ? accession : index
        format.call([label] + bands_at_index.flatten)
      end

      # blank row after each accession
      format.call([])
    end

    nil
  end
end
	require 'tap/task'
	require 'tap/tasks/dump/csv'

	# Extract::task extract results from Excel file
	#
	# Inputs an array of result files and extracts identification information
	# for each accession number specified in the configurations. The result
	# is a hash of (accession, data) pairs, where the data is an array of
	# extracted information; the information at each index in data corresponds
	# to the result_file.
	#
	# Note the index is NOT the index of the result_file, rather it is extracted
	# from the name of the result file:
	#
	#   707_band1_protein_export   => index 1
	#   707_band12_protein_export  => index 12
	#
	# No data is registered when a particular accession was not found in a given
	# file.
	#
	# % rap extract_results 707_band1_protein_export.csv --accession IPI0012767 --: inspect
	#
	class Extract < Tap::Task
	  config :accessions, [], &c.list

	  # Builds the results hash for the configured accessions.
	  #
	  # result_files:: paths of the exported csv files.  Each path must contain
	  #                "_band<m>"; <m> is the index the file's data is stored at.
	  #
	  # Returns a hash of (accession, data) pairs.  data[0] is the header row
	  # returned by extract_data; every other index that no file supplied is
	  # filled with an array of nils as wide as the header row.
	  def process(*result_files)
	    results = {}
	    accessions.each do |accession|
	      results[accession] = Array.new(result_files.length)
	    end

	    result_files.each do |result_file|
	      log :extract, result_file
	      extract(result_file, results)
	    end

	    # add headers and square data
	    headers = extract_data
	    results.each_value do |data|
	      data[0] = headers
	      data.each_index do |index|
	        data[index] ||= Array.new(headers.length)
	      end
	    end

	    results
	  end

	  # Reads one result file and stores the extracted values of every
	  # configured accession it contains.
	  #
	  # result_file:: path to the csv file; must match /_band(\d+)/
	  # results::     hash of (accession, data) pairs, updated in place
	  #
	  # Raises a RuntimeError when the path carries no band number.
	  def extract(result_file, results)
	    # determine index of extracted information
	    unless result_file =~ /_band(\d+)/
	      raise "expected result file path like: <n>_band<m>"
	    end
	    data_index = $1.to_i

	    # parse data for each accession
	    #
	    # CSV.foreach yields one row per line under both the 1.8 and the 1.9+
	    # csv libraries.  CSV.open with a block only yields rows on 1.8; on
	    # 1.9+ it yields the CSV object itself.
	    parsing = false
	    CSV.foreach(result_file) do |row|
	      unless parsing
	        # Advance to accessions: rows are skipped up to and including the
	        # one whose first cell is "Accession"
	        parsing = (row[0] == "Accession")
	        next
	      end

	      if accessions.include?(row[0])
	        results[row[0]][data_index] = extract_data(row)
	      end
	    end
	  end

	  # Returns the values of columns G through O for the row, in that order.
	  # Without a row, returns the matching array of column headers.
	  def extract_data(row=nil)
	    return [
	      'Norm (V)',
	      'Norm (I)',
	      'Norm (IIB)',
	      'Raw (V)',
	      'Raw (I)',
	      'Raw (IIB)',
	      'Spec (V)',
	      'Spec (I)',
	      'Spec (IIB)'
	    ] unless row

	    [
	      row[col('G')],
	      row[col('H')],
	      row[col('I')],
	      row[col('J')],
	      row[col('K')],
	      row[col('L')],
	      row[col('M')],
	      row[col('N')],
	      row[col('O')]
	    ]
	  end

	  # Column letters in order; a letter's position is its 0-based row index.
	  ALPHABET = ('A'..'Z').to_a

	  # Translates a single column letter ('A'..'Z') into a 0-based row index.
	  # Returns nil for anything that is not in ALPHABET.
	  def col(letter)
	    ALPHABET.index(letter)
	  end
	end

	# Workflow::task runs the extract workflow
	#
	# Runs the extract task for each set of data in the data directories,
	# and then collates the results into csv format.
	#
	# Inputs:
	#
	#   data/gel1/.._band1..csv
	#            /.._band2..csv
	#
	#   data/gel2/.._band1..csv
	#            /.._band2..csv
	#
	# Results format:
	#
	#               gel1            gel2     ...
	#   Accession   header  header  header  header
	#   1           ...
	#   2
	#   ...
	#
	# Command:
	#
	# % rap workflow data/* --config config/workflow.yml > results/results.csv
	#
	class Workflow < Tap::Task
	  define :extract, Extract
	  define :format, Tap::Tasks::Dump::Csv

	  # Runs the extract task on the contents of each data directory and passes
	  # the collated table to the format task one row at a time.
	  #
	  # data_dirs:: directories (one per gel) whose entries are all handed to
	  #             the extract task
	  #
	  # Always returns nil; the rows given to format are the output.
	  def process(*data_dirs)
	    # == data
	    # accession: [[band], [band], [band], ...]
	    n_bands = 0
	    n_data_points = 0
	    gels = data_dirs.collect do |dir|
	      # every entry of the directory becomes one argument to extract
	      gel_data = extract.call(*Dir.glob("#{dir}/*"))

	      # determine the maximum number of bands
	      # and the maximum number of datapoints
	      # per band.  These are used in squaring
	      # the data for csv
	      gel_data.each_value do |bands|
	        if n_bands < bands.length
	          n_bands = bands.length
	        end

	        bands.each do |data|
	          if n_data_points < data.length
	            n_data_points = data.length
	          end
	        end
	      end

	      gel_data
	    end

	    # adds a space between datasets
	    n_data_points += 1

	    # adds headers: a blank leading cell, then one block per directory
	    # holding its basename followed by blank cells
	    headers = data_dirs.collect do |dir|
	      header = Array.new(n_data_points)
	      header[0] = File.basename(dir)
	      header
	    end
	    headers = headers.flatten
	    headers.unshift(nil)

	    format.call(headers)

	    extract.accessions.each do |accession|
	      gels.collect do |data|
	        bands = data[accession]

	        # ensure each gel data has the same
	        # number of bands
	        while bands.length < n_bands
	          bands << []
	        end

	        # ensure each band has the same number
	        # of datapoints
	        bands.each do |band|
	          while band.length < n_data_points
	            band << nil
	          end
	        end

	        bands
	      end.transpose.each_with_index do |row, index|
	        # row holds band <index> of every gel; the first row is labelled
	        # with the accession, later rows with their band index
	        row = row.flatten
	        row.unshift(index == 0 ? accession : index)
	        format.call(row)
	      end

	      # blank row after each accession
	      format.call([])
	    end

	    nil
	  end
	end