Skip to content

Instantly share code, notes, and snippets.

@Krailon
Last active May 23, 2020 05:47
Show Gist options
  • Save Krailon/3ae4f43460292910b3aa to your computer and use it in GitHub Desktop.
Save Krailon/3ae4f43460292910b3aa to your computer and use it in GitHub Desktop.
Modified version of Lee's script to convert multiline FASTA to single-line. A "coverage threshold" option has been added to allow discarding of low-coverage sequences.
#!/usr/bin/env python
# Created by: Lee Bergstrand
# Modified by: Matt McInnes
# Descript: Converts multiline FASTAs to single line FASTAs
#
# Usage: FastaMLtoSL.py <sequences.faa>
# Example: FastaMLtoSL.py mySeqs.faa
#----------------------------------------------------------------------------------------
#===========================================================================================================
#Imports:
import sys
import re
#===========================================================================================================
# Functions:
# 1: Checks if in proper number of arguments are passed gives instructions on proper use.
def argsCheck(numArgs):
if len(sys.argv) != numArgs:
print("Converts multiline FASTAs to single line FASTAs")
print("By Lee Bergstrand\n")
print("Usage: " + sys.argv[0] + " <sequences.fasta> <coverage threshold>")
print("Examples: " + sys.argv[0] + " mySeqs.fasta 150")
exit(1) # Aborts program. (exit(1) indicates that an error occurred)
#===========================================================================================================
# Main program code:
# House keeping...
argsCheck(3) # Checks if the number of arguments are correct.
try:
# Stores file one for input checking.
inFile = sys.argv[1]
outFile = inFile + ".out"
coverage_threshold = float(sys.argv[2])
print(">> Using coverage threshold of %f" % coverage_threshold)
# Reads sequence file list and stores it as a string object. Safely closes file:
print(">> Opening FASTA file...")
with open(inFile, "r") as fasta_file:
fasta_data = fasta_file.read()
sequence_blocks = re.findall(">[^>]+", fasta_data)
except IOError:
print("Error: failed to open " + inFile)
exit(1)
print(">> Converting FASTA file from multiline to single line and writing to file.")
# Conversts multiline fasta to single line. Writes new fasta to file.
try:
with open(outFile, "w") as newFasta:
for block in sequence_blocks:
try:
header, sequence = block.split("\n", 1) # Split each fasta into header and sequence.
except ValueError:
print("Error: failed to split header and sequence")
header = header + "\n" # Replace ">" lost in ">" split, Replace "\n" lost in split directly above.
sequence = sequence.replace("\n", "") + "\n" # Replace newlines in sequence, remember to add one to the end.
match = re.search("node_(\d+)_length_\d+_cov_([\d\.]+)_id_(\d+)", header, re.IGNORECASE)
if not match:
print("Warning: failed to extract header values from sequence:\n%s" % sequence)
continue
sequence_node = int(match.groups()[0])
sequence_coverage = float(match.groups()[1])
sequence_id = int(match.groups()[2])
if sequence_coverage < coverage_threshold:
print(">> Skipping sequence {NODE: %d, id: %d} with coverage %d" % (sequence_node, sequence_id, sequence_coverage))
continue
newFasta.write(header + sequence)
except IOError:
print("Error: failed to open " + inFile)
exit(1)
newFasta.close()
print(">> Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment