mr-eyes/cdhit_filter.py

## cdhit_filter.py
"""
Exclude CDHIT Clusters that contains number of sequences below a certain threshold
"""


import sys

args = sys.argv

if len(args) < 3:
    exit("run: python cdhit_filter.py <input_file> <threshold>")

else:
    ORIGINAL_FILE = args[1]
    NEW_FILE = "filtered_" + ORIGINAL_FILE
    THRESHOLD = int(args[2])

cluster_buffer = ""

with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered:
    first_line = next(cdhit) # Add the first line to the buffer
    cluster_buffer += first_line

    # Iterate over each line
    for line in cdhit:
        # Check if line starts with ">" that means a new cluster started, so analyse the previous one
        if line[0] == ">":
            # Count how many "at" in the cluster
            count = cluster_buffer.count("at")  + 1

            # Check if the count more than the predefined threshold
            # If yes, write it to the new file
            if count >= THRESHOLD:
                filtered.write(cluster_buffer)

            # Empty the cluster_buffer for buffering the new cluster
            cluster_buffer = ""
            cluster_buffer = line

        else:
            cluster_buffer += line


## extract_seqs.py
from Bio import SeqIO
import sys
import re

args = sys.argv

if len(args) < 3:
    exit("run: python extract_seqs.py <fasta_seqs> <clstrs_file>")

else:
    FASTA_FILE = args[1]
    CLUSTERS = args[2]
    EXTRACTED = "extracted_" + FASTA_FILE


reps_seqs_headers = []
with open(CLUSTERS, 'r') as headers:
    for line in headers:
         if "*" in line:
            fasta_header_id = re.findall(r">(.*)\.\.\.", line)[0]
            reps_seqs_headers.append(fasta_header_id)


fasta_sequences = SeqIO.parse(open(FASTA_FILE),'fasta')

with open(EXTRACTED, 'w') as extraced:
    for seq in fasta_sequences:
        for header in reps_seqs_headers:
            if header in seq.description:
                SeqIO.write(seq, extraced, "fasta")
                continue
	"""
	Exclude CDHIT Clusters that contains number of sequences below a certain threshold
	"""


	import sys

	args = sys.argv

	if len(args) < 3:
	exit("run: python cdhit_filter.py <input_file> <threshold>")

	else:
	ORIGINAL_FILE = args[1]
	NEW_FILE = "filtered_" + ORIGINAL_FILE
	THRESHOLD = int(args[2])

	cluster_buffer = ""

	with open(ORIGINAL_FILE, 'r') as cdhit, open(NEW_FILE, 'w') as filtered:
	first_line = next(cdhit) # Add the first line to the buffer
	cluster_buffer += first_line

	# Iterate over each line
	for line in cdhit:
	# Check if line starts with ">" that means a new cluster started, so analyse the previous one
	if line[0] == ">":
	# Count how many "at" in the cluster
	count = cluster_buffer.count("at") + 1

	# Check if the count more than the predefined threshold
	# If yes, write it to the new file
	if count >= THRESHOLD:
	filtered.write(cluster_buffer)

	# Empty the cluster_buffer for buffering the new cluster
	cluster_buffer = ""
	cluster_buffer = line

	else:
	cluster_buffer += line
	from Bio import SeqIO
	import sys
	import re

	args = sys.argv

	if len(args) < 3:
	exit("run: python extract_seqs.py <fasta_seqs> <clstrs_file>")

	else:
	FASTA_FILE = args[1]
	CLUSTERS = args[2]
	EXTRACTED = "extracted_" + FASTA_FILE


	reps_seqs_headers = []
	with open(CLUSTERS, 'r') as headers:
	for line in headers:
	if "*" in line:
	fasta_header_id = re.findall(r">(.*)\.\.\.", line)[0]
	reps_seqs_headers.append(fasta_header_id)


	fasta_sequences = SeqIO.parse(open(FASTA_FILE),'fasta')

	with open(EXTRACTED, 'w') as extraced:
	for seq in fasta_sequences:
	for header in reps_seqs_headers:
	if header in seq.description:
	SeqIO.write(seq, extraced, "fasta")
	continue