Tony walterst

## filter_fastq.py
#!/usr/bin/env python

# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching an
# index fastq to the PEAR-assembled subset fastq
# Usage:  python filter_fastq.py input_fastq target_fastq output_fastq

from sys import argv

from cogent.parse.fastq import MinimalFastqParser

## add_taxa_to_fasta.py
#!/usr/bin/env python

""" Usage:
python add_taxa_to_fasta.py input_taxa_file input_fasta_file output_fasta
"""

from sys import argv

from cogent.parse.fasta import MinimalFastaParser

## collapse_rare_taxa.py
#!/usr/bin/env python

__author__ = "William Walters"
__copyright__ = "NA"
__credits__ = ["William Walters"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "William Walters"
__email__ = "william.a.walters@gmail.com"

## filter_otu_mapping_from_otu_table.py
#!/usr/bin/env python

__author__ = "William Walters"
__copyright__ = "Copyright 2011"
__credits__ = ["William Walters"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "William Walters"
__email__ = "William.A.Walters@colorado.edu"


## strip_primers_exclude.py
#!/usr/bin/env python

# USAGE: python strip_primers_exclude.py Mapping_file input_fasta output_fasta log_filename

from sys import argv
from string import upper
from re import compile

from cogent.parse.fasta import MinimalFastaParser
from skbio.sequence import DNA

## workflow_genus_distances.txt
We want to ask how different sequences are within certain genera. In this case, I was looking at the Prevotella,
Bacteroides, and Porphyromonas genera within Bacteroidetes, and the distance between two sequences is the count of nucleotide differences
divided by the length of the sequence considered.

To do this, I used the 99% OTUs (16S only) from the SILVA 123 release, available here:
http://www.arb-silva.de/no_cache/download/archive/qiime/

We want to minimize the number of sequences included that may be erroneously labeled as the target taxa, but fall on other parts of
the Bacteroidetes tree with other taxa, rather than grouping with the target genus. My goal is to find a node within a Bacteroidetes
tree whose descendants are all or mostly the target genus while retaining the most possible tips that contain the

## remove_short_reads.py
#!/usr/bin/env python


from sys import argv

from cogent.parse.fasta import MinimalFastaParser

min_len = int(argv[2])

for label,seq in MinimalFastaParser(open(argv[1], "U")):

## remove_short_reads.py
#!/usr/bin/env python


from sys import argv

from cogent.parse.fasta import MinimalFastaParser

min_len = int(argv[2])

for label,seq in MinimalFastaParser(open(argv[1], "U")):

## strip_primers_forward_only.py
#!/usr/bin/env python

# USAGE: python strip_primers_forward_only.py Mapping_file input_fasta output_fasta log_filename

from sys import argv
from string import upper
from re import compile

from cogent.parse.fasta import MinimalFastaParser
from skbio.sequence import DNA

## generate_taxa_scatter_plots.py
#!/usr/bin/env python

__author__ = "William Walters"
__copyright__ = "Copyright 2011"
__credits__ = ["William Walters"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "William Walters"
__email__ = "William.A.Walters@colorado.edu"
	#!/usr/bin/env python

	# Used to filter a fastq to match another fastq that is a subset of the query one, e.g. matching an
	# index fastq to the PEAR-assembled subset fastq
	# Usage: python filter_fastq.py input_fastq target_fastq output_fastq

	from sys import argv

	from cogent.parse.fastq import MinimalFastqParser
	#!/usr/bin/env python

	""" Usage:
	python add_taxa_to_fasta.py input_taxa_file input_fasta_file output_fasta
	"""

	from sys import argv

	from cogent.parse.fasta import MinimalFastaParser
	#!/usr/bin/env python

	__author__ = "William Walters"
	__copyright__ = "NA"
	__credits__ = ["William Walters"]
	__license__ = "GPL"
	__version__ = "1.0"
	__maintainer__ = "William Walters"
	__email__ = "william.a.walters@gmail.com"
	#!/usr/bin/env python

	__author__ = "William Walters"
	__copyright__ = "Copyright 2011"
	__credits__ = ["William Walters"]
	__license__ = "GPL"
	__version__ = "1.0"
	__maintainer__ = "William Walters"
	__email__ = "William.A.Walters@colorado.edu"
	#!/usr/bin/env python

	# USAGE: python strip_primers_exclude.py Mapping_file input_fasta output_fasta log_filename

	from sys import argv
	from string import upper
	from re import compile

	from cogent.parse.fasta import MinimalFastaParser
	from skbio.sequence import DNA
	We want to ask how different sequences are within certain genera. In this case, I was looking at the Prevotella,
	Bacteroides, and Porphyromonas genera within Bacteroidetes, and the distance between two sequences is the count of nucleotide differences
	divided by the length of the sequence considered.

	To do this, I used the 99% OTUs (16S only) from the SILVA 123 release, available here:
	http://www.arb-silva.de/no_cache/download/archive/qiime/

	We want to minimize the number of sequences included that may be erroneously labeled as the target taxa, but fall on other parts of
	the Bacteroidetes tree with other taxa, rather than grouping with the target genus. My goal is to find a node within a Bacteroidetes
	tree whose descendants are all or mostly the target genus while retaining the most possible tips that contain the
	#!/usr/bin/env python

	# USAGE: python strip_primers.py Mapping_file input_fasta output_fasta log_filename

	from sys import argv
	from string import upper
	from re import compile

	from cogent.parse.fasta import MinimalFastaParser
	from skbio.sequence import DNA