Skip to content

Instantly share code, notes, and snippets.

@thinkerbot
Created June 19, 2009 18:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkerbot/132792 to your computer and use it in GitHub Desktop.
Save thinkerbot/132792 to your computer and use it in GitHub Desktop.
20090619 ECM data extraction
data
results
config
require 'tap/task'
require 'tap/tasks/dump/csv'
# Extract::task extracts results from CSV result files
#
# Inputs an array of result files and extracts identification information
# for each accession number specified in the configurations. The result
# is a hash of (accession, data) pairs, where the data is an array of
# extracted information; the information at each index in data corresponds
# to the result_file.
#
# Note the index is NOT the index of the result_file, rather it is extracted
# from the name of the result file:
#
# 707_band1_protein_export => index 1
# 707_band12_protein_export => index 12
#
# No data is registered when a particular accession was not found in a given
# file.
#
# % rap extract_results 707_band1_protein_export.csv --accession IPI0012767 --: inspect
#
class Extract < Tap::Task
  # Spreadsheet column letters in order; the position of a letter is the
  # zero-based index of that column within a parsed row ('A' => 0).
  ALPHABET = ('A'..'Z').to_a.freeze

  config :accessions, [], &c.list

  # Extracts data for each configured accession from each result file.
  #
  # Returns a hash of (accession, data) pairs.  data[0] is the header row
  # and data[n] is the row extracted from the file named ..._band<n>...;
  # slots with no extracted data are filled with an array of nils as wide
  # as the header row.
  #
  # Note slot 0 is always taken by the header row, so anything extracted
  # from a file named ..._band0... is replaced by the headers.
  def process(*result_files)
    results = {}
    accessions.each do |accession|
      results[accession] = Array.new(result_files.length)
    end

    result_files.each do |result_file|
      log :extract, result_file
      extract(result_file, results)
    end

    # add headers and square data
    headers = extract_data
    results.each_value do |data|
      data[0] = headers
      data.each_index do |index|
        data[index] ||= Array.new(headers.length)
      end
    end

    results
  end

  # Reads one result file and stores the extracted data for each configured
  # accession found in it at results[accession][band index].
  #
  # The band index is the number following "_band" in result_file.  Raises
  # a RuntimeError when result_file has no such number.
  def extract(result_file, results)
    # determine index of extracted information
    unless result_file =~ /_band(\d+)/
      raise "expected result file path like: <n>_band<m>"
    end
    data_index = $1.to_i

    # Rows up to and including the one whose first cell is "Accession" are
    # skipped; only the rows after it are checked against the accessions.
    #
    # CSV.foreach yields one parsed row at a time under both the Ruby 1.8
    # CSV library and the current one.  The block form of CSV.open does so
    # only under 1.8; the current library yields the CSV object instead.
    parsing = false
    CSV.foreach(result_file) do |row|
      unless parsing
        parsing = (row[0] == "Accession")
        next
      end

      if accessions.include?(row[0])
        results[row[0]][data_index] = extract_data(row)
      end
    end
  end

  # Returns the cells in columns G through O of row, in column order.
  # Without a row, returns the header labels for those nine columns.
  #
  # A new array is built on every call, so the caller is free to modify
  # the result.
  def extract_data(row=nil)
    unless row
      return [
        'Norm (V)',
        'Norm (I)',
        'Norm (IIB)',
        'Raw (V)',
        'Raw (I)',
        'Raw (IIB)',
        'Spec (V)',
        'Spec (I)',
        'Spec (IIB)'
      ]
    end

    ('G'..'O').collect { |letter| row[col(letter)] }
  end

  # Returns the zero-based row index for a single-letter column name, or
  # nil when letter is not one of 'A'..'Z'.
  def col(letter)
    ALPHABET.index(letter)
  end
end
# Workflow::task runs the extract workflow
#
# Runs the extract task for each set of data in the data directories,
# and then collates the results into csv format.
#
# Inputs:
#
# data/gel1/.._band1..csv
# /.._band2..csv
#
# data/gel2/.._band1..csv
# /.._band2..csv
#
# Results format:
#
# gel1 gel2 ...
# Accession header header header header
# 1 ...
# 2
# ...
#
# Command:
#
# % rap workflow data/* --config config/workflow.yml > results/results.csv
#
class Workflow < Tap::Task
  define :extract, Extract
  define :format, Tap::Tasks::Dump::Csv

  # Runs the extract task on the files in each data directory, then sends
  # the collated rows to the format task one call at a time: first a title
  # row naming each directory, then one group of rows per accession, each
  # group followed by an empty row.  Returns nil.
  def process(*data_dirs)
    # Largest number of bands per accession and of data points per band
    # seen in any gel; every gel is padded out to these sizes below.
    band_count = 0
    point_count = 0

    # gel == {accession => [[band], [band], [band], ...]}
    gels = data_dirs.map do |dir|
      gel = extract.call(*Dir.glob("#{dir}/*"))

      gel.each_value do |bands|
        band_count = [band_count, bands.length].max
        bands.each do |band|
          point_count = [point_count, band.length].max
        end
      end

      gel
    end

    # one extra, always-empty cell per band leaves a gap between gels
    point_count += 1

    # title row: an empty first cell, then each directory name at the
    # start of its own point_count-wide span
    title_row = data_dirs.inject([nil]) do |cells, dir|
      cells << File.basename(dir)
      cells.concat(Array.new(point_count - 1))
    end
    format.call(title_row)

    extract.accessions.each do |accession|
      padded = gels.map do |gel|
        bands = gel[accession]

        # pad in place so that every gel has band_count bands and every
        # band has point_count cells
        bands << [] until bands.length >= band_count
        bands.each do |band|
          band << nil until band.length >= point_count
        end

        bands
      end

      # after the transpose each entry holds the same band from every gel
      padded.transpose.each_with_index do |cells, index|
        line = cells.flatten
        # the first row of a group is labelled with the accession, the
        # remaining rows with their band index
        line.unshift(index.zero? ? accession : index)
        format.call(line)
      end

      format.call([])
    end

    nil
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment