Skip to content

Instantly share code, notes, and snippets.

@gkarthik
Forked from AlaaALatif/tabulate_variants.py
Last active April 13, 2022 09:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gkarthik/661f4232304cf989ffd890941b01feff to your computer and use it in GitHub Desktop.
Save gkarthik/661f4232304cf989ffd890941b01feff to your computer and use it in GitHub Desktop.
import bjorn_support as bs
import mutations as bm
import pandas as pd
# FASTA must include reference NC_045512.2 (e.g. use cat to add the reference)
fasta_filepath = '2021-02-08_release.fa'
# specify name for output alignment
msa_filepath = 'msa.fa'
# run alignment (uses MAFFT but can be changed from bjorn_support.py)
bs.align_fasta(fasta_filepath, msa_filepath);
# load alignment
msa_data = bs.load_fasta(msa_filepath, is_aligned=True)
# identify variants for each sample
# must identify insertions before anything else, otherwise information is lost
try:
insertions, _ = bm.identify_insertions_per_sample(msa_data)
except:
insertions = None
substitutions, _ = bm.identify_replacements_per_sample(msa_data)
deletions, _ = bm.identify_deletions_per_sample(msa_data)
# Read in pangolin lineage calls
lineages = pd.read_csv("lineage_report.csv")
# Merge lineages with deletions and insertions
merged_deletions = pd.merge(deletions, lineages, right_on="taxon", left_on="idx")
merged_insertions = pd.merge(insertions, lineages, right_on="taxon", left_on="idx")
# Export to csv
merged_deletions.to_csv("deletions.csv")
merged_insertions.to_csv("insertions.csv")
# Get seq_ids of those sequences with indel len %3 not a multiple of 3.
seq_ids = merged_deletions[merged_deletions["del_len"] % 3 != 0]["idx"].tolist()
seq_ids.extend(merged_insertions[merged_insertions["ins_len"] % 3 != 0]["idx"].tolist())
# Merge insertions and deletions to get final set of sequence ids
tmp = pd.concat([merged_deletions, merged_insertions])
ids = tmp[((tmp["del_len"] %3 != 0) & (~tmp["del_len"].isna())) | ((tmp["ins_len"] %3 != 0) & (~tmp["ins_len"].isna()))][["idx", "lineage"]].groupby("idx").first()
# Print number of sequenes flagged
print(len(list(set(seq_ids))))
# Print list of sequence IDs
print("\n".join(list(set(seq_ids))))
@liamxg
Copy link

liamxg commented Apr 13, 2022

could you please tell me how to install this, thanks.
nts.py", line 1, in
import bjorn_support as bs
ModuleNotFoundError: No module named 'bjorn_support'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment