Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created August 23, 2013 14:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregcaporaso/6320252 to your computer and use it in GitHub Desktop.
Save gregcaporaso/6320252 to your computer and use it in GitHub Desktop.
Example files used while developing pyqi's Getting Started tutorials.

These files were used while developing pyqi's Getting Started tutorials. See those documents for usage examples.

>s1
ACCTTTAACC
>s2
CCGG
>s3
AAAAAAAAAAAAAAAAAAAAAAAAAAA
#!/usr/bin/env python
from __future__ import division
from pyqi.core.command import Command, Parameter, ParameterCollection
# Module metadata: authorship, copyright, license, version and maintainer
# contact details for this command module.
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, Greg Caporaso"
__credits__ = ["Greg Caporaso"]
__license__ = "BSD"
__version__ = "0.0.1"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
class SequenceCollectionSummarizer(Command):
    """Count the sequences in a collection and report their length range."""

    BriefDescription = "Generate summary statistics on a collection of sequences."
    LongDescription = "Provided the number of sequences, the minimum sequence length, and the maximum sequence length given a collection of sequences. Sequences should be provided as a list (or generator) of tuples of (sequence id, sequence) pairs."
    Parameters = ParameterCollection([
        Parameter(Name='seqs', DataType=list,
                  Description='sequences to be summarized', Required=True),
        Parameter(Name='suppress_length_summary', DataType=bool,
                  Description='do not generate summary information on the sequence lengths',
                  Required=False, Default=False)
    ])

    def run(self, **kwargs):
        """Summarize the sequences passed as kwargs['seqs'].

        kwargs['seqs']: iterable of (sequence id, sequence) pairs; it is
         iterated exactly once, so a generator is acceptable.
        kwargs['suppress_length_summary']: if true, the minimum and
         maximum lengths are not computed.

        Returns a dict with the keys 'num-seqs', 'min-length' and
        'max-length'. Both length values are None when the length summary
        is suppressed or when no sequences were provided.
        """
        # Only the lengths are needed; the sequence ids are ignored.
        sequence_lengths = [len(seq) for _, seq in kwargs['seqs']]
        num_seqs = len(sequence_lengths)

        # min() and max() raise ValueError on an empty list, so an empty
        # collection is reported the same way as a suppressed summary.
        if kwargs['suppress_length_summary'] or not sequence_lengths:
            min_length = None
            max_length = None
        else:
            min_length = min(sequence_lengths)
            max_length = max(sequence_lengths)

        return {'num-seqs': num_seqs,
                'min-length': min_length,
                'max-length': max_length}
# Module-level alias for the command class; the optparse interface
# configuration imports the command under this name.
CommandConstructor = SequenceCollectionSummarizer
#!/usr/bin/env python
from pyqi.core.interfaces.optparse import (OptparseUsageExample,
OptparseOption, OptparseResult)
from pyqi.core.command import make_parameter_collection_lookup_f
from sequence_collection_summarizer import CommandConstructor
from pyqi.core.exception import IncompetentDeveloperError
import os
# Lookup function built from the command. It is called below with a
# parameter name ('seqs', 'suppress_length_summary') and its result is
# passed as the Parameter of the corresponding OptparseOption.
param_lookup = make_parameter_collection_lookup_f(CommandConstructor)
def parse_fasta(fp):
    """Yield (sequence id, sequence) pairs from a fasta-formatted file.

    fp: path to a fasta-formatted file

    A line starting with '>' begins a new record whose id is the rest of
    that line. The non-blank lines that follow, up to the next '>' line,
    are stripped and joined to form the sequence. A record without any
    sequence lines is yielded with '' as its sequence, and an empty file
    yields nothing. Sequence lines found before the first '>' line are
    yielded with None as the id.

    NO FURTHER ERROR CHECKING IS PERFORMED!
    """
    import sys

    # Python 2 needs mode 'U' to handle the different types of line
    # breaks. Python 3 text mode does that by default and rejects 'U'
    # from version 3.11 on.
    mode = 'U' if sys.version_info[0] < 3 else 'r'

    seq_id = None
    seq = []
    # The with statement closes the file once the generator is exhausted
    # or closed.
    with open(fp, mode) as f:
        for line in f:
            line = line.strip()
            if not line:
                # blank lines carry no sequence data
                continue
            if line.startswith('>'):
                if seq_id is not None or seq:
                    # we've completed a fasta record
                    yield seq_id, ''.join(seq)
                seq_id = line[1:]
                seq = []
            else:
                seq.append(line)

    # Emit the final record, unless the file contained no records at all.
    if seq_id is not None or seq:
        yield seq_id, ''.join(seq)
def append_datum_to_file(result_key, data, option_value=None):
    """Append one tab-separated "result_key, data" line to a file.

    result_key: label written in the first column
    data: integer value written in the second column; when it is None
     nothing is written and the file is left untouched
    option_value: path of the file to append to

    Raises IncompetentDeveloperError if there is data to write but
    option_value is None.
    """
    if data is not None:
        # Reaching this point without a path means the output handler was
        # not associated with an output filepath option when the
        # OptparseResults were defined; tell the interface developer.
        if option_value is None:
            raise IncompetentDeveloperError(
                "Cannot write output without a filepath.")

        # Append mode, so successive results accumulate in one file.
        with open(option_value, 'a') as out_f:
            out_f.write('%s\t%d\n' % (result_key, data))
# Usage examples for the command-line interface: one that writes the full
# summary, and one that leaves out the sequence length information.
usage_examples = [
    OptparseUsageExample(
        Ex="%prog -i seqs.fna -o seqs.summary.txt",
        ShortDesc="Summarize the input sequence collection and write the result to file.",
        LongDesc="Read the file specified by -i, and compute the number of sequences in the file, and the minimum and maximum sequence lengths. Write all of that information to path specified by -o.",
    ),
    OptparseUsageExample(
        Ex="%prog -i seqs.fna -o seqs.summary.txt --suppress-length-summary",
        ShortDesc="Summarize the input sequence collection and write the result to file, excluding information on sequence lengths.",
        LongDesc="Read the file specified by -i, compute the number of sequences in the file, and write that information to path specified by -o.",
    ),
]
# Command-line options accepted by the interface.
inputs = [
    # -i: existing fasta file, turned into (id, sequence) pairs by
    # parse_fasta before being handed to the command as 'seqs'.
    OptparseOption(
        Parameter=param_lookup('seqs'),
        ShortName='i',
        InputType='existing_filepath',
        InputAction='store',
        InputHandler=parse_fasta,
    ),
    # Boolean flag with no short name and no input handler.
    OptparseOption(
        Parameter=param_lookup('suppress_length_summary'),
        ShortName=None,
        InputType=None,
        InputAction='store_true',
        InputHandler=None,
    ),
    # -o: output path. It has no corresponding command Parameter, so its
    # name, help text and required status are given here directly.
    OptparseOption(
        Parameter=None,
        Name='output-fp',
        ShortName='o',
        InputType='new_filepath',
        InputAction='store',
        Required=True,
        Help='path where output should be written',
    ),
]
# Each result key returned by the command is handled by
# append_datum_to_file and associated with the 'output-fp' option.
outputs = [
    OptparseResult(
        OptionName='output-fp',
        ResultKey='num-seqs',
        OutputHandler=append_datum_to_file,
    ),
    OptparseResult(
        OptionName='output-fp',
        ResultKey='min-length',
        OutputHandler=append_datum_to_file,
    ),
    OptparseResult(
        OptionName='output-fp',
        ResultKey='max-length',
        OutputHandler=append_datum_to_file,
    ),
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment