Last active
June 9, 2019 20:56
-
-
Save mr-eyes/4ecabbe816df38aa9212a49e9dbad8ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Exclude CDHIT Clusters that contains number of sequences below a certain threshold | |
""" | |
import sys | |
args = sys.argv | |
if len(args) < 3: | |
exit("run: python cdhit_filter.py <input_file> <threshold>") | |
else: | |
ORIGINAL_FILE = args[1] | |
NEW_FILE = "filtered_" + ORIGINAL_FILE | |
THRESHOLD = int(args[2]) | |
cluster_buffer = "" | |
with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered: | |
first_line = next(cdhit) # Add the first line to the buffer | |
cluster_buffer += first_line | |
# Iterate over each line | |
for line in cdhit: | |
# Check if line starts with ">" that means a new cluster started, so analyse the previous one | |
if line[0] == ">": | |
# Count how many "at" in the cluster | |
count = cluster_buffer.count("at") + 1 | |
# Check if the count more than the predefined threshold | |
# If yes, write it to the new file | |
if count >= THRESHOLD: | |
filtered.write(cluster_buffer) | |
# Empty the cluster_buffer for buffering the new cluster | |
cluster_buffer = "" | |
cluster_buffer = line | |
else: | |
cluster_buffer += line | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Bio import SeqIO | |
import sys | |
import re | |
args = sys.argv | |
if len(args) < 3: | |
exit("run: python extract_seqs.py <fasta_seqs> <clstrs_file>") | |
else: | |
FASTA_FILE = args[1] | |
CLUSTERS = args[2] | |
EXTRACTED = "extracted_" + FASTA_FILE | |
reps_seqs_headers = [] | |
with open(CLUSTERS, 'r') as headers: | |
for line in headers: | |
if "*" in line: | |
fasta_header_id = re.findall(r">(.*)\.\.\.", line)[0] | |
reps_seqs_headers.append(fasta_header_id) | |
fasta_sequences = SeqIO.parse(open(FASTA_FILE),'fasta') | |
with open(EXTRACTED, 'w') as extraced: | |
for seq in fasta_sequences: | |
for header in reps_seqs_headers: | |
if header in seq.description: | |
SeqIO.write(seq, extraced, "fasta") | |
continue |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment