Created
June 19, 2009 18:57
-
-
Save thinkerbot/132792 to your computer and use it in GitHub Desktop.
20090619 ECM data extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data | |
results | |
config |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'tap/task'
require 'tap/tasks/dump/csv'
# Extract::task extract results from Excel file | |
# | |
# Takes a list of result files and extracts identification information
# for each accession number listed in the accessions config. The result
# is a hash of (accession, data) pairs, where data is an array of
# extracted rows; the row at a given index in data holds the values
# extracted from the result file for the band with that number.
# | |
# Note the index is NOT the position of the result file in the inputs;
# it is parsed from the band number in the name of the result file:
#
#   707_band1_protein_export  => index 1
#   707_band12_protein_export => index 12
# | |
# No data is registered when a particular accession was not found in a given | |
# file. | |
# | |
# % rap extract_results 707_band1_protein_export.csv --accession IPI0012767 --: inspect | |
# | |
class Extract < Tap::Task
  # Spreadsheet-style column letters; the position of a letter in this list
  # is the zero-based index of that column within a parsed row.
  ALPHABET = ('A'..'Z').to_a.freeze

  # Labels for the extracted values, in the same order as DATA_COLUMNS.
  HEADERS = [
    'Norm (V)',
    'Norm (I)',
    'Norm (IIB)',
    'Raw (V)',
    'Raw (I)',
    'Raw (IIB)',
    'Spec (V)',
    'Spec (I)',
    'Spec (IIB)'
  ].freeze

  # Column letters of the values pulled out of each matching row.
  DATA_COLUMNS = %w[G H I J K L M N O].freeze

  # Accession numbers to look for in each result file.
  config :accessions, [], &c.list

  # Extracts data for every configured accession from each result file.
  #
  # Returns a hash of (accession, data) pairs. data[0] holds the header
  # labels and data[n] holds the values extracted from the file for band n;
  # entries for which nothing was extracted are filled with an all-nil row
  # of the same width as the headers.
  #
  # Index 0 is reserved for the headers, so anything extracted from a file
  # named like <n>_band0 is overwritten by the header row.
  def process(*result_files)
    results = {}
    accessions.each do |accession|
      results[accession] = Array.new(result_files.length)
    end

    result_files.each do |result_file|
      log :extract, result_file
      extract(result_file, results)
    end

    # add headers and square data
    headers = extract_data
    results.each_value do |data|
      # each accession gets its own copy so that in-place padding done by a
      # caller on one header row cannot leak into the others
      data[0] = headers.dup
      data.each_index do |index|
        data[index] ||= Array.new(headers.length)
      end
    end

    results
  end

  # Parses one result file and stores the extracted values for each
  # configured accession at results[accession][band], where band is the
  # number following "_band" in the file name.
  #
  # Raises a RuntimeError when the file name carries no band number.
  def extract(result_file, results)
    # determine index of extracted information; only the file name is
    # inspected so that a directory named like "x_band3" cannot supply it
    unless File.basename(result_file) =~ /_band(\d+)/
      raise "expected result file path like: <n>_band<m>"
    end
    data_index = Regexp.last_match(1).to_i

    # parse data for each accession; rows are skipped up to and including
    # the row whose first cell is "Accession"
    parsing = false
    # CSV.foreach yields one row at a time on every Ruby version, whereas
    # block-form CSV.open yields the CSV object itself on Ruby >= 1.9
    CSV.foreach(result_file) do |row|
      unless parsing
        parsing = (row[0] == "Accession")
        next
      end

      if accessions.include?(row[0])
        results[row[0]][data_index] = extract_data(row)
      end
    end
  end

  # Returns the values in DATA_COLUMNS of row, or a fresh copy of the header
  # labels when no row is given.
  def extract_data(row=nil)
    return HEADERS.dup unless row

    row.values_at(*DATA_COLUMNS.collect { |letter| col(letter) })
  end

  # Returns the zero-based row index for a single column letter ('A' => 0),
  # or nil when the letter is not in ALPHABET.
  def col(letter)
    ALPHABET.index(letter)
  end
end
# Workflow::task runs the extract workflow | |
# | |
# Runs the extract task for each set of data in the data directories, | |
# and then collates the results into csv format. | |
# | |
# Inputs: | |
# | |
#   data/gel1/.._band1..csv
#            /.._band2..csv
# | |
# data/gel2/.._band1..csv | |
# /.._band2..csv | |
# | |
# Results format:
#
#              gel1           gel2           ...
#   Accession  header header  header header
#   1          ...
#   2
#   ...
# | |
# Command: | |
# | |
# % rap workflow data/* --config config/workflow.yml > results/results.csv | |
# | |
class Workflow < Tap::Task
  define :extract, Extract
  define :format, Tap::Tasks::Dump::Csv

  # Runs extract on the files of each data directory and passes the collated
  # rows, one at a time, to format. Always returns nil.
  #
  # Each element of gels has the shape:
  #
  #   accession: [[band], [band], [band], ...]
  #
  def process(*data_dirs)
    max_bands = 0
    max_points = 0

    gels = data_dirs.collect do |dir|
      gel = extract.call(*Dir.glob("#{dir}/*"))

      # track the largest band count and the widest band seen so far;
      # both are needed to square the data for csv
      gel.each_value do |bands|
        max_bands = bands.length if bands.length > max_bands
        bands.each do |band|
          max_points = band.length if band.length > max_points
        end
      end

      gel
    end

    # one extra cell per band leaves a blank column between datasets
    width = max_points + 1

    # title row: a blank leading cell, then each directory name followed by
    # enough blanks to span that dataset
    titles = [nil]
    data_dirs.each do |dir|
      titles << File.basename(dir)
      (width - 1).times { titles << nil }
    end
    format.call(titles)

    extract.accessions.each do |accession|
      squared = gels.collect do |gel|
        bands = gel[accession]

        # every gel needs the same number of bands...
        bands << [] while bands.length < max_bands

        # ...and every band the same number of cells
        bands.each do |band|
          band << nil while band.length < width
        end

        bands
      end

      # after transposing, each entry holds one band from every gel
      squared.transpose.each_with_index do |cells, band_index|
        line = cells.flatten
        label = band_index == 0 ? accession : band_index
        line.unshift(label)
        format.call(line)
      end

      # blank row between accessions
      format.call([])
    end

    nil
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment