Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active September 10, 2022 16:07
Show Gist options
  • Save dinovski/f01f281db77005e8e47e960b75c27bc9 to your computer and use it in GitHub Desktop.
Save dinovski/f01f281db77005e8e47e960b75c27bc9 to your computer and use it in GitHub Desktop.
case insensitive motif search of a FASTA file (eg. find all 'GATC' sites for binning damID reads).
#!/usr/bin/env python
from Bio import SeqIO
import re
import os
import sys
import csv
usage="""
Perform case insensitive motif search of a FASTA file and output 3 BED files: motif_sites.bed (motif coordinates), motif_fragments.bed (coordinates between each motif), and motif_fragments_full.bed (coordinates between and including each motif)
python fasta2bed.py <motif> <fasta.fa> <out_path>
eg. python fasta2bed.py GATC hg19.fa odir/
"""
if len(sys.argv) < 4:
print(usage)
pattern = sys.argv[1]
fasta_path = sys.argv[2]
out_path = sys.argv[3]
motif_sites = os.path.join(out_path, pattern + "_sites.bed")
motif_frags_full = os.path.join(out_path, pattern + "_fragments_full.bed")
motif_frags = os.path.join(out_path, pattern + "_fragments.bed")
outfile_sites = open(motif_sites, 'w')
def search_fasta(pattern, fasta_path):
for record in SeqIO.parse(open(fasta_path, 'r'), "fasta"):
chrom = record.id
for match in re.finditer(pattern, str(record.seq), re.IGNORECASE):
start_pos = match.start() + 1
end_pos = match.end()
outfile_sites.write(chrom + "\t" + str(start_pos) + "\t" + str(end_pos) + "\n")
search_fasta(pattern, fasta_path)
outfile_sites.close()
print("Converting sites to fragments")
## convert sites to fragments: coordinates are from motif1 start to motif2 end
infile = motif_sites
outfile_frags_full = open(motif_frags_full, 'w')
outfile_frags = open(motif_frags, 'w')
with open(infile,'r') as f:
reader = csv.reader(f, delimiter='\t')
#previous = reader.next() #python2
previous = next(reader)
for line in reader:
if line[0] == previous[0]: #to process by chr
chr = previous[0]
start = int(previous[1])
end = int(line[2])
#print chr, start, end
previous = line
outfile_frags_full.write(chr + "\t" + str(start) + "\t" + str(end) + "\n")
outfile_frags.write(chr + "\t" + str(start + len(pattern)) + "\t" + str(end - len(pattern)) + "\n")
else:
print("processing %s" % (line[0]))
previous = line
outfile_frags_full.close()
outfile_frags.close()
if __name__ == "__main__":
print('Module is run directly')
@dinovski
Copy link
Author

updated to python 3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment