Skip to content

Instantly share code, notes, and snippets.

@NsAveek
Created December 8, 2018 15:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NsAveek/e3a200f95569ec7bf3b382b81554d98d to your computer and use it in GitHub Desktop.
Save NsAveek/e3a200f95569ec7bf3b382b81554d98d to your computer and use it in GitHub Desktop.
(Python) Detect whether a file is in GenBank, FASTA, or cluster format and print its nucleotide sequences
import os.path
import re
# Cluster records start with "seq", exactly one digit, and a TAB. The pattern
# is compiled once here so the helper no longer depends on a global that only
# exists after the script's __main__ section has run.
_CLUSTER_LINE_PATTERN = re.compile(r'^seq\d\t')


def is_cluster_format(line):
    """Tell whether *line* begins like a cluster-format record.

    Args:
        line: A single line of text (a ``str``).

    Returns:
        True if the line starts with ``seq``, one digit and a tab character;
        False otherwise (including for an empty string).
    """
    return _CLUSTER_LINE_PATTERN.search(line) is not None
def is_fasta_format(line):
    """Tell whether *line* is a FASTA header, i.e. starts with ``>``.

    A plain prefix test replaces the former regex lookup, so the helper no
    longer needs a pattern global that is only defined when the file is run
    as a script.

    Args:
        line: A single line of text (a ``str``).

    Returns:
        True if the first character of *line* is ``>``; False otherwise
        (including for an empty string).
    """
    return line.startswith('>')
def is_gen_bank_format(line):
    """Tell whether any of the given lines carries the GenBank ORIGIN marker.

    Despite its singular name, *line* is iterated: each element is checked
    for the text ``ORIGIN`` immediately followed by a newline character.
    The marker is spelled out locally so the helper works without the
    pattern global that the script only defines under ``__main__``.

    Args:
        line: An iterable of text lines (for example the result of
            ``file.readlines()``).

    Returns:
        True as soon as one element contains the marker; False if none does
        or the iterable is empty.
    """
    # The marker has no regex metacharacters, so a substring test is
    # equivalent to the regex search it replaces.
    origin_marker = 'ORIGIN\n'
    return any(origin_marker in entry for entry in line)
def get_the_genbank_lines(lines):
    """Extract the nucleotide string that follows the GenBank ORIGIN line.

    Every line after the first one containing ``ORIGIN`` plus a newline is
    collected. From that text the ``//`` terminator, all standalone numbers
    (the position offsets at the start of each sequence row) and all
    whitespace are removed, leaving only the sequence letters.

    Args:
        lines: An iterable of text lines, newline characters included.

    Returns:
        The concatenated sequence as a single ``str``; an empty string when
        no ORIGIN line is present or nothing follows it.
    """
    origin_marker = 'ORIGIN\n'
    collected = []
    origin_seen = False
    for line in lines:
        if origin_seen:
            collected.append(line)
        elif origin_marker in line:
            origin_seen = True

    # Join once instead of growing a string inside the loop.
    sequence_text = ''.join(collected).replace('//', '')
    # Drop the numeric offsets while the rows are still separated, so \b
    # sees them as standalone tokens.
    sequence_text = re.sub(r'\b\d+\b', '', sequence_text)
    # split() with no argument breaks on any whitespace run, so spaces, tabs,
    # newlines and carriage returns are all removed. The previous
    # str.replace('\s', '') looked for a literal backslash-s and removed none
    # of them, letting '\r' leak into the result on CRLF files.
    return ''.join(sequence_text.split())
def count_characters(string_data):
    """Return how many characters *string_data* contains."""
    total = len(string_data)
    return total
def _strip_line_breaks(text):
    """Return *text* with every newline and carriage return removed."""
    return text.replace('\n', '').replace('\r', '')


def _report_sequence(number, sequence):
    """Print the numbered length header and then the sequence itself."""
    # Single-argument print calls behave identically on Python 2 and 3; the
    # doubled spaces reproduce the comma-separated print statement's output.
    print(" Sequence :  %s  Counter :  %s" % (number, count_characters(sequence)))
    print(sequence)


if __name__ == '__main__':
    # Interactive driver: repeatedly ask for a file, detect whether it is
    # FASTA, cluster or GenBank formatted, and print each sequence with its
    # length. The loop ends on end-of-input or Ctrl-C.

    # Patterns read as module-level globals by the format helpers. They never
    # change, so they are assigned once instead of on every loop iteration.
    faFormat = '^>'
    genBankFormat = 'ORIGIN\n'
    clusterFormat = '^seq\\d\t'

    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    while True:
        try:
            fileName = read_input("\nPlease enter a file name : ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input (or the user interrupted): leave the prompt loop
            # instead of dying with a traceback.
            break

        try:
            inputFile = open(fileName)
        except IOError:
            print("File does not exist. Please give a correct file path")
            continue

        # The context manager guarantees the handle is closed after reading.
        with inputFile:
            print(inputFile.name)
            read_lines_from_file = tuple(inputFile.readlines())

        if not read_lines_from_file:
            # An empty file has no first line to inspect.
            print("File format not supported, Please retry")
            continue

        first_line = read_lines_from_file[0].strip("\n")

        if is_fasta_format(first_line):
            print("is Fasta Format \n")
            sequenceCounter = 1
            pieces = []
            for raw_line in read_lines_from_file[1:]:
                if is_fasta_format(raw_line.strip("\n")):
                    # A new header closes the sequence gathered so far.
                    _report_sequence(sequenceCounter,
                                     _strip_line_breaks(''.join(pieces)))
                    sequenceCounter += 1
                    pieces = []
                else:
                    pieces.append(raw_line)
            # The last sequence has no following header to trigger its report.
            _report_sequence(sequenceCounter,
                             _strip_line_breaks(''.join(pieces)))
        elif is_cluster_format(first_line):
            print("is Cluster Format")
            for sequenceCounter, raw_line in enumerate(read_lines_from_file, 1):
                record = _strip_line_breaks(raw_line)
                # Greedy match: drops everything up to the last tab, leaving
                # only the nucleotide column.
                _report_sequence(sequenceCounter, re.sub(r'^.*\t', r'', record))
        elif is_gen_bank_format(read_lines_from_file):
            print("is gen bank format\n")
            sequence = get_the_genbank_lines(read_lines_from_file)
            print("Sequence :  %s" % count_characters(sequence))
            print(sequence)
        else:
            print("File format not supported, Please retry")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment