Skip to content

Instantly share code, notes, and snippets.

@mr-eyes
Last active June 9, 2019 20:56
Show Gist options
  • Save mr-eyes/4ecabbe816df38aa9212a49e9dbad8ef to your computer and use it in GitHub Desktop.
Save mr-eyes/4ecabbe816df38aa9212a49e9dbad8ef to your computer and use it in GitHub Desktop.
"""
Exclude CDHIT Clusters that contains number of sequences below a certain threshold
"""
import sys
args = sys.argv
if len(args) < 3:
exit("run: python cdhit_filter.py <input_file> <threshold>")
else:
ORIGINAL_FILE = args[1]
NEW_FILE = "filtered_" + ORIGINAL_FILE
THRESHOLD = int(args[2])
cluster_buffer = ""
with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered:
first_line = next(cdhit) # Add the first line to the buffer
cluster_buffer += first_line
# Iterate over each line
for line in cdhit:
# Check if line starts with ">" that means a new cluster started, so analyse the previous one
if line[0] == ">":
# Count how many "at" in the cluster
count = cluster_buffer.count("at") + 1
# Check if the count more than the predefined threshold
# If yes, write it to the new file
if count >= THRESHOLD:
filtered.write(cluster_buffer)
# Empty the cluster_buffer for buffering the new cluster
cluster_buffer = ""
cluster_buffer = line
else:
cluster_buffer += line
from Bio import SeqIO
import sys
import re
args = sys.argv
if len(args) < 3:
exit("run: python extract_seqs.py <fasta_seqs> <clstrs_file>")
else:
FASTA_FILE = args[1]
CLUSTERS = args[2]
EXTRACTED = "extracted_" + FASTA_FILE
reps_seqs_headers = []
with open(CLUSTERS, 'r') as headers:
for line in headers:
if "*" in line:
fasta_header_id = re.findall(r">(.*)\.\.\.", line)[0]
reps_seqs_headers.append(fasta_header_id)
fasta_sequences = SeqIO.parse(open(FASTA_FILE),'fasta')
with open(EXTRACTED, 'w') as extraced:
for seq in fasta_sequences:
for header in reps_seqs_headers:
if header in seq.description:
SeqIO.write(seq, extraced, "fasta")
continue
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment