Krailon/FastaMLtoSL.py

## FastaMLtoSL.py
#!/usr/bin/env python
# Created by: Lee Bergstrand
# Modified by: Matt McInnes
# Descript: Converts multiline FASTAs to single line FASTAs
#
# Usage: FastaMLtoSL.py <sequences.faa>
# Example: FastaMLtoSL.py mySeqs.faa
#----------------------------------------------------------------------------------------
#===========================================================================================================
#Imports:

import sys
import re
#===========================================================================================================
# Functions:

# 1: Checks if in proper number of arguments are passed gives instructions on proper use.
def argsCheck(numArgs):
  if len(sys.argv) != numArgs:
    print("Converts multiline FASTAs to single line FASTAs")
    print("By Lee Bergstrand\n")
    print("Usage: " + sys.argv[0] + " <sequences.fasta> <coverage threshold>")
    print("Examples: " + sys.argv[0] + " mySeqs.fasta 150")
    exit(1) # Aborts program. (exit(1) indicates that an error occurred)
#===========================================================================================================
# Main program code:

# House keeping...
argsCheck(3) # Checks if the number of arguments are correct.

try:
  # Stores file one for input checking.
  inFile  = sys.argv[1]
  outFile = inFile + ".out"
  coverage_threshold = float(sys.argv[2])
  print(">> Using coverage threshold of %f" % coverage_threshold)

  # Reads sequence file list and stores it as a string object. Safely closes file:
  print(">> Opening FASTA file...")
  with open(inFile, "r") as fasta_file:
    fasta_data = fasta_file.read()
  sequence_blocks = re.findall(">[^>]+", fasta_data)
except IOError:
  print("Error: failed to open " + inFile)
  exit(1)

print(">> Converting FASTA file from multiline to single line and writing to file.")
# Conversts multiline fasta to single line. Writes new fasta to file.
try:
  with open(outFile, "w") as newFasta:
    for block in sequence_blocks:
      try:
        header, sequence = block.split("\n", 1) # Split each fasta into header and sequence.
      except ValueError:
        print("Error: failed to split header and sequence")

      header = header + "\n" # Replace ">" lost in ">" split, Replace "\n" lost in split directly above.
      sequence = sequence.replace("\n", "") + "\n" # Replace newlines in sequence, remember to add one to the end.

      match = re.search("node_(\d+)_length_\d+_cov_([\d\.]+)_id_(\d+)", header, re.IGNORECASE)
      if not match:
        print("Warning: failed to extract header values from sequence:\n%s" % sequence)
        continue

      sequence_node = int(match.groups()[0])
      sequence_coverage = float(match.groups()[1])
      sequence_id = int(match.groups()[2])

      if sequence_coverage < coverage_threshold:
        print(">> Skipping sequence {NODE: %d, id: %d} with coverage %d" % (sequence_node, sequence_id, sequence_coverage))
        continue

      newFasta.write(header + sequence)
except IOError:
  print("Error: failed to open " + inFile)
  exit(1)

newFasta.close()
print(">> Done!")
	#!/usr/bin/env python
	# Created by: Lee Bergstrand
	# Modified by: Matt McInnes
	# Descript: Converts multiline FASTAs to single line FASTAs
	#
	# Usage: FastaMLtoSL.py <sequences.faa>
	# Example: FastaMLtoSL.py mySeqs.faa
	#----------------------------------------------------------------------------------------
	#===========================================================================================================
	#Imports:

	import sys
	import re
	#===========================================================================================================
	# Functions:

	# 1: Checks if in proper number of arguments are passed gives instructions on proper use.
	def argsCheck(numArgs):
	if len(sys.argv) != numArgs:
	print("Converts multiline FASTAs to single line FASTAs")
	print("By Lee Bergstrand\n")
	print("Usage: " + sys.argv[0] + " <sequences.fasta> <coverage threshold>")
	print("Examples: " + sys.argv[0] + " mySeqs.fasta 150")
	exit(1) # Aborts program. (exit(1) indicates that an error occurred)
	#===========================================================================================================
	# Main program code:

	# House keeping...
	argsCheck(3) # Checks if the number of arguments are correct.

	try:
	# Stores file one for input checking.
	inFile = sys.argv[1]
	outFile = inFile + ".out"
	coverage_threshold = float(sys.argv[2])
	print(">> Using coverage threshold of %f" % coverage_threshold)

	# Reads sequence file list and stores it as a string object. Safely closes file:
	print(">> Opening FASTA file...")
	with open(inFile, "r") as fasta_file:
	fasta_data = fasta_file.read()
	sequence_blocks = re.findall(">[^>]+", fasta_data)
	except IOError:
	print("Error: failed to open " + inFile)
	exit(1)

	print(">> Converting FASTA file from multiline to single line and writing to file.")
	# Conversts multiline fasta to single line. Writes new fasta to file.
	try:
	with open(outFile, "w") as newFasta:
	for block in sequence_blocks:
	try:
	header, sequence = block.split("\n", 1) # Split each fasta into header and sequence.
	except ValueError:
	print("Error: failed to split header and sequence")

	header = header + "\n" # Replace ">" lost in ">" split, Replace "\n" lost in split directly above.
	sequence = sequence.replace("\n", "") + "\n" # Replace newlines in sequence, remember to add one to the end.

	match = re.search("node_(\d+)_length_\d+_cov_([\d\.]+)_id_(\d+)", header, re.IGNORECASE)
	if not match:
	print("Warning: failed to extract header values from sequence:\n%s" % sequence)
	continue

	sequence_node = int(match.groups()[0])
	sequence_coverage = float(match.groups()[1])
	sequence_id = int(match.groups()[2])

	if sequence_coverage < coverage_threshold:
	print(">> Skipping sequence {NODE: %d, id: %d} with coverage %d" % (sequence_node, sequence_id, sequence_coverage))
	continue

	newFasta.write(header + sequence)
	except IOError:
	print("Error: failed to open " + inFile)
	exit(1)

	newFasta.close()
	print(">> Done!")