-
-
Save gkarthik/661f4232304cf989ffd890941b01feff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bjorn_support as bs | |
import mutations as bm | |
import pandas as pd | |
# FASTA must include reference NC_045512.2 (e.g. use cat to add the reference) | |
fasta_filepath = '2021-02-08_release.fa' | |
# specify name for output alignment | |
msa_filepath = 'msa.fa' | |
# run alignment (uses MAFFT but can be changed from bjorn_support.py) | |
bs.align_fasta(fasta_filepath, msa_filepath); | |
# load alignment | |
msa_data = bs.load_fasta(msa_filepath, is_aligned=True) | |
# identify variants for each sample | |
# must identify insertions before anything else, otherwise information is lost | |
try: | |
insertions, _ = bm.identify_insertions_per_sample(msa_data) | |
except: | |
insertions = None | |
substitutions, _ = bm.identify_replacements_per_sample(msa_data) | |
deletions, _ = bm.identify_deletions_per_sample(msa_data) | |
# Read in pangolin lineage calls | |
lineages = pd.read_csv("lineage_report.csv") | |
# Merge lineages with deletions and insertions | |
merged_deletions = pd.merge(deletions, lineages, right_on="taxon", left_on="idx") | |
merged_insertions = pd.merge(insertions, lineages, right_on="taxon", left_on="idx") | |
# Export to csv | |
merged_deletions.to_csv("deletions.csv") | |
merged_insertions.to_csv("insertions.csv") | |
# Get seq_ids of those sequences with indel len %3 not a multiple of 3. | |
seq_ids = merged_deletions[merged_deletions["del_len"] % 3 != 0]["idx"].tolist() | |
seq_ids.extend(merged_insertions[merged_insertions["ins_len"] % 3 != 0]["idx"].tolist()) | |
# Merge insertions and deletions to get final set of sequence ids | |
tmp = pd.concat([merged_deletions, merged_insertions]) | |
ids = tmp[((tmp["del_len"] %3 != 0) & (~tmp["del_len"].isna())) | ((tmp["ins_len"] %3 != 0) & (~tmp["ins_len"].isna()))][["idx", "lineage"]].groupby("idx").first() | |
# Print number of sequenes flagged | |
print(len(list(set(seq_ids)))) | |
# Print list of sequence IDs | |
print("\n".join(list(set(seq_ids)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
could you please tell me how to install this, thanks.
nts.py", line 1, in
import bjorn_support as bs
ModuleNotFoundError: No module named 'bjorn_support'