Created
December 8, 2018 15:08
-
-
Save NsAveek/e3a200f95569ec7bf3b382b81554d98d to your computer and use it in GitHub Desktop.
(Python) Detect whether a file is in GenBank, FASTA, or cluster format and print its nucleotide sequences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path | |
import re | |
def is_cluster_format(line):
    """Return True when ``line`` matches the cluster-format pattern.

    The pattern is read from the module-level ``clusterFormat`` regex
    string, which must be defined before this function is called.
    """
    # A single search is enough: we only care whether any match exists.
    return re.search(clusterFormat, line) is not None
def is_fasta_format(line):
    """Return True when ``line`` matches the FASTA header pattern.

    The pattern is read from the module-level ``faFormat`` regex string,
    which must be defined before this function is called.
    """
    # A single search is enough: we only care whether any match exists.
    return re.search(faFormat, line) is not None
def is_gen_bank_format(line):
    """Return True when any element of ``line`` matches the GenBank marker.

    Despite its name, ``line`` is iterated element by element (the caller
    passes the sequence of all lines read from the file). Each element is
    checked against the module-level ``genBankFormat`` regex string.
    """
    return any(re.search(genBankFormat, entry) is not None for entry in line)
def get_the_genbank_lines(lines):
    """Extract the bare nucleotide string from the lines of a GenBank file.

    Everything after the first line matching the module-level
    ``genBankFormat`` regex is treated as sequence data, up to (and not
    including) the ``//`` record terminator line. Standalone numbers
    (the position counters at the start of each row) and all whitespace
    are removed from the result.

    :param lines: iterable of text lines, newline characters included.
    :return: the concatenated sequence characters, or ``''`` when no line
        matches ``genBankFormat``.
    """
    collected = []
    marker_seen = False
    for line in lines:
        if marker_seen:
            # '//' on a line of its own closes the record; stopping here
            # keeps any following record's header out of the sequence.
            if line.strip() == '//':
                break
            collected.append(line)
        elif re.search(genBankFormat, line):
            marker_seen = True

    # Drop any stray terminator text, as the original implementation did.
    raw_text = ''.join(collected).replace('//', '')
    # Remove the standalone position counters while whitespace still
    # separates them from the bases (so the \b word boundaries apply).
    without_counters = re.sub(r'\b\d+\b', '', raw_text)
    # Strip every kind of whitespace (spaces, tabs, CR, LF). The original
    # used str.replace('\s', ''), which only removes a literal backslash-s.
    return re.sub(r'\s+', '', without_counters)
def count_characters(string_data):
    """Return the number of characters in ``string_data``."""
    total = len(string_data)
    return total
# Regex patterns read as module-level globals by is_fasta_format,
# is_gen_bank_format, get_the_genbank_lines and is_cluster_format.
# The names must stay exactly as they are. They are defined once here
# instead of being reassigned on every pass of the prompt loop.
faFormat = '^>'
genBankFormat = 'ORIGIN\n'
clusterFormat = '^seq\\d\t'


def _print_sequence(index, sequence):
    """Print the header line (index and length) followed by the sequence."""
    print(" Sequence :  %d  Counter :  %d" % (index, count_characters(sequence)))
    print(sequence)


def _report_fasta(lines):
    """Print every sequence of a FASTA file whose first line is a header."""
    print("is Fasta Format \n")
    index = 1
    parts = []
    for line in lines[1:]:
        # Remove line-ending characters before counting so that files with
        # CRLF endings report the same length as the sequence printed.
        cleaned = line.replace('\n', '').replace('\r', '')
        if is_fasta_format(cleaned):
            # A new header closes the sequence collected so far.
            _print_sequence(index, ''.join(parts))
            index += 1
            parts = []
        else:
            parts.append(cleaned)
    # The last sequence has no following header to flush it.
    _print_sequence(index, ''.join(parts))


def _report_cluster(lines):
    """Print the sequence found after the last tab of every line."""
    print("is Cluster Format")
    for index, line in enumerate(lines, 1):
        cleaned = line.replace('\n', '').replace('\r', '')
        # Greedy match: everything up to and including the last tab is the
        # label part of the row and is dropped.
        _print_sequence(index, re.sub(r'^.*\t', r'', cleaned))


def _report_genbank(lines):
    """Print the sequence that follows the ORIGIN marker of a GenBank file."""
    print("is gen bank format\n")
    sequence = get_the_genbank_lines(lines)
    print("Sequence :  %d" % count_characters(sequence))
    print(sequence)


def _process_file(file_name):
    """Open ``file_name``, detect its format and print its sequences."""
    try:
        input_file = open(file_name)
    except IOError:
        print("File does not exist. Please give a correct file path")
        return

    # The context manager closes the handle even if reading fails.
    with input_file:
        print(input_file.name)
        lines = tuple(input_file.readlines())

    if not lines:
        # An empty file has no first line to inspect.
        print("File format not supported, Please retry")
    elif is_fasta_format(lines[0].strip("\n")):
        _report_fasta(lines)
    elif is_cluster_format(lines[0].strip("\n")):
        _report_cluster(lines)
    elif is_gen_bank_format(lines):
        _report_genbank(lines)
    else:
        print("File format not supported, Please retry")


if __name__ == '__main__':
    try:
        read_input = raw_input  # Python 2
    except NameError:
        read_input = input  # Python 3

    while True:
        try:
            fileName = read_input("\nPlease enter a file name : ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the prompt loop cleanly.
            print("")
            break
        _process_file(fileName)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment