Marcus Fedarko fedarko

## filter_gfa_lowcov.py
#! /usr/bin/env python
# Filters low-coverage segments from a jumboDBG GFA file.
# This assumes that the second-from-the-last entry in each segment line will be
# length, and that the last entry in each segment line will be k-mer coverage.
# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the
# conventional "coverage" of a segment can be computed as KC / (length - K).

# K as used in the coverage formula above, KC / (length - K).
# NOTE(review): presumably the k-mer size jumboDBG was run with -- confirm.
K = 5001
# NOTE(review): presumably the minimum coverage a segment needs in order to be
# kept by the filter -- confirm against the filtering code.
MINCOV = 5
# NOTE(review): presumably how often (in processed lines or segments) a
# progress update is emitted -- confirm against the main loop.
UPDATE_FREQ = 1000

## read_stats.py
#! /usr/bin/env python3
#
# Computes the total number of reads, total read length, and average read
# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
# library (https://github.com/lmdu/pyfastx). I designed this in the context of
# computing read statistics, but if you have a set of other sequences (e.g.
# contigs) then I guess this would still work for that.
#
# USAGE:
# ./read_stats.py file1.fa [file2.fa ...]

## shorten_edge_labels.py
#! /usr/bin/env python
#
# Shortens edge labels in a DOT file output by LJA to just show the first line
# and then a count of how many other lines are omitted. (If an edge's label
# spans exactly one or two lines, then the entire label is preserved.)
#
# USAGE:
# ./shorten_edge_labels.py in.dot out.dot

import sys

## check_for_conflicting_node_ids.py
#! /usr/bin/env python
#
# Scans through a jumboDBG / LJA output DOT file; looks for cases where
# the same node is "defined" on multiple lines. This can be caused by the
# same truncated node ID being misused across lines.
#
# USAGE:
# ./check_for_conflicting_node_ids.py graph.dot
#
# Note that this assumes that the input graph was output by jumboDBG / LJA --

## rm_seqs_from_gfa.py
#! /usr/bin/env python3
#
# SUMMARY
# =======
# Outputs a copy of a GFA 1 file with each segment (S) line that contains a
# sequence (not just a "*" character) altered to have an LN:i: tag describing
# the length of the sequence, and the sequence replaced with a "*" character.
#
# All other lines (including S lines that already do not contain a sequence,
# and other types of lines [e.g. H, L, ...]) will be included unchanged in the

## sort-rmdup-bbl.py
#! /usr/bin/env python3
# NOTE: this is a hack, so it will probably break if you have BBL files that
# don't look like the natbib-generated ones I'm used to. It is also pretty
# unintelligent about *how* it sorts entries (it defers most of the work
# to python), so if you have cases where some of your references are by
# the same person or whatever then that might cause the output to not match
# your expectations.

import sys

## gfa-to-fasta.py
#! /usr/bin/env python3
# Converts a GFA assembly graph to a FASTA file of all sequences
# within the graph. Notably, this ignores connections between sequences
# in the graph.
#
# Depends on Python 3.6 or later.
#
# Usage:
# $ ./gfa-to-fasta.py mygraph.gfa contigs.fasta

## handle_duplicate_sample_ids.py
#! /usr/bin/env python3
import os
from collections import Counter
from math import ceil
import re
from numpy import argmax
import pandas as pd
from qiime2 import Metadata

# "Parameters" of this script

## find_missing_dates.py
#! /usr/bin/env python3
from dateutil.parser import parse
import pandas as pd


# Load the metadata table: tab-separated, with the first column used as the
# DataFrame index.
df = pd.read_csv("20191209_metadata.txt", sep="\t", index_col=0)

# Subset to a certain host subject ID, if desired
# (keeps only the rows whose "host_subject_id" column equals "M03").
df = df[df["host_subject_id"] == "M03"]

## negative_control_stats.py
#! /usr/bin/env python3
"""
This is a small script that looks through the annotated taxonomies of all
features present in a dataset's negative control samples. It's handy for
checking that certain features are (for the most part) absent from these
samples.

This obviously isn't a very formal way of accounting for contamination,
but it is useful for quickly verifying that certain taxa are probably not
the product of contamination. (Better approaches include e.g. the decontam
	#! /usr/bin/env python
	# Filters low-coverage segments from a jumboDBG GFA file.
	# This assumes that the second-from-the-last entry in each segment line will be
	# length, and that the last entry in each segment line will be k-mer coverage.
	# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the
	# conventional "coverage" of a segment can be computed as KC / (length - K).

	K = 5001
	MINCOV = 5
	UPDATE_FREQ = 1000
	#! /usr/bin/env python3
	#
	# Computes the total number of reads, total read length, and average read
	# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
	# library (https://github.com/lmdu/pyfastx). I designed this in the context of
	# computing read statistics, but if you have a set of other sequences (e.g.
	# contigs) then I guess this would still work for that.
	#
	# USAGE:
	# ./read_stats.py file1.fa [file2.fa ...]
	#! /usr/bin/env python
	#
	# Shortens edge labels in a DOT file output by LJA to just show the first line
	# and then a count of how many other lines are omitted. (If an edge's label
	# spans exactly one or two lines, then the entire label is preserved.)
	#
	# USAGE:
	# ./shorten_edge_labels.py in.dot out.dot

	import sys
	#! /usr/bin/env python
	#
	# Scans through a jumboDBG / LJA output DOT file; looks for cases where
	# the same node is "defined" on multiple lines. This can be caused by the
	# same truncated node ID being misused across lines.
	#
	# USAGE:
	# ./check_for_conflicting_node_ids.py graph.dot
	#
	# Note that this assumes that the input graph was output by jumboDBG / LJA --
	#! /usr/bin/env python3
	#
	# SUMMARY
	# =======
	# Outputs a copy of a GFA 1 file with each segment (S) line that contains a
	# sequence (not just a "*" character) altered to have an LN:i: tag describing
	# the length of the sequence, and the sequence replaced with a "*" character.
	#
	# All other lines (including S lines that already do not contain a sequence,
	# and other types of lines [e.g. H, L, ...]) will be included unchanged in the
	#! /usr/bin/env python3
	# NOTE: this is a hack, so it will probably break if you have BBL files that
	# don't look like the natbib-generated ones I'm used to. It is also pretty
	# unintelligent about how it sorts entries (it defers most of the work
	# to python), so if you have cases where some of your references are by
	# the same person or whatever then that might cause the output to not match
	# your expectations.

	import sys
	#! /usr/bin/env python3
	# Converts a GFA assembly graph to a FASTA file of all sequences
	# within the graph. Notably, this ignores connections between sequences
	# in the graph.
	#
	# Depends on Python 3.6 or later.
	#
	# Usage:
	# $ ./gfa-to-fasta.py mygraph.gfa contigs.fasta
	#! /usr/bin/env python3
	import os
	from collections import Counter
	from math import ceil
	import re
	from numpy import argmax
	import pandas as pd
	from qiime2 import Metadata

	# "Parameters" of this script
	#! /usr/bin/env python3
	from dateutil.parser import parse
	import pandas as pd


	df = pd.read_csv("20191209_metadata.txt", sep="\t", index_col=0)

	# Subset to a certain host subject ID, if desired
	df = df[df["host_subject_id"] == "M03"]
	#! /usr/bin/env python3
	"""
	This is a small script that looks through the annotated taxonomies of all
	features present in a dataset's negative control samples. It's handy for
	checking that certain features are (for the most part) absent from these
	samples.

	This obviously isn't a very formal way of accounting for contamination,
	but it is useful for quickly verifying that certain taxa are probably not
	the product of contamination. (Better approaches include e.g. the decontam