Skip to content

Instantly share code, notes, and snippets.

@klprint
Created August 9, 2016 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klprint/936adc536f828011f1ea39346427b0a7 to your computer and use it in GitHub Desktop.
Save klprint/936adc536f828011f1ea39346427b0a7 to your computer and use it in GitHub Desktop.
Searches in files with the same extension for sequences being provided by a fasta file. The script picks 15-16 base-pairs in the middle of each sequence in the fasta file and uses grep to find the number of occurrences.
import subprocess
import glob
# Functions
def read_multifasta(file_path):
is_entry = False
fasta_dict = {}
sequence = []
key = None
with open(file_path, 'r') as fasta:
for line in fasta:
if line[0] == '>':
if is_entry:
fasta_dict[key] = ''.join(sequence)
sequence = []
key = line.split(' ')[0][1:]
is_entry = True
else:
if line[0] != '\n':
sequence.append(line.rstrip())
fasta_dict[key] = ''.join(sequence)
return(fasta_dict)
def subset_seq(sequence, q_len):
mid = int(len(sequence)/2)
start = mid - int(q_len/2)
end = mid + int(q_len/2)
sub_seq = sequence[start-1:end]
return(sub_seq)
def grep(subj_file, querry):
cmd = ['grep', '-c', querry, subj_file]
count = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
count = count.decode('utf-8')
count = int(count)
print(count)
return(count)
def main():
ext = input('Extension of Read-Files (without \'.\'): ')
files = glob.glob('*.'+ext)
seqs = input('Path to querry fasta: ')
seqs = read_multifasta(seqs)
all_keys = len(seqs.keys())
print(str(all_keys) + ' sequences loaded.')
querry_seqs = {}
for seqID in seqs.keys():
full_seq = seqs[seqID]
sub_seq = subset_seq(full_seq, 15)
querry_seqs[seqID] = sub_seq
with open('countfile.out', 'w') as outfile:
outfile.write('ReadFile\t')
id_count = 0
qIDs = querry_seqs.keys() # determines correct order
for id in qIDs:
id_count += 1
outfile.write(id)
if id_count == len(qIDs):
outfile.write('\n')
else:
outfile.write('\t')
searched_keys = 1
for file in files:
print('Started search for sequences in ' + file)
outfile.write(file + '\t')
for querryID in qIDs:
querry = querry_seqs[querryID]
print(querryID)
count = grep(file, querry=querry)
outfile.write(str(count) + '\t')
print(str((searched_keys/all_keys)*100) +'% searched')
searched_keys += 1
searched_keys = 0
outfile.write('\n')
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment