Skip to content

Instantly share code, notes, and snippets.

@sanxiyn
Created December 31, 2020 17:58
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sanxiyn/fddd1f18074076fb47e04733e6b62865 to your computer and use it in GitHub Desktop.
Save sanxiyn/fddd1f18074076fb47e04733e6b62865 to your computer and use it in GitHub Desktop.
BNT162b2
import csv
import python_codon_tables
def read_csv_file(file_path):
with open(file_path) as f:
reader = csv.reader(f)
records = list(reader)
return records
def use_most_frequent(species):
substitutions = {}
codons_table = python_codon_tables.get_codons_table(species)
for amino_acid in codons_table:
frequency_table = codons_table[amino_acid]
codons = sorted(frequency_table)
max_frequency = 0
most_frequent_codon = None
for codon in codons:
frequency = frequency_table[codon]
if frequency > max_frequency:
max_frequency = frequency
most_frequent_codon = codon
for codon in codons:
substitutions[codon] = most_frequent_codon
return substitutions
if __name__ == "__main__":
# Use codon usage table of Homo sapiens
substitutions = use_most_frequent("h_sapiens_9606")
virvac = read_csv_file("side-by-side.csv")[1:]
matches = 0
for element in virvac:
_, vir, vac = element
our = substitutions[vir]
if vac == our:
matches += 1
print("{:.1f}%".format(100*matches/len(virvac)))
@alexomics
Copy link

I've made some changes to include the proline substitutions and the other ways of calculating the score.

import python_codon_tables


def reader(s):
    return s.strip().split(",")


def codons(*args, n=3):
    yield from (tuple(s[i : i + n] for s in args) for i in range(0, len(args[0]), n))


def pc_match(s1, s2, as_codons=False):
    div = 3 if as_codons else 1
    it = codons(s1, s2, n=div)
    return sum(a == b for a, b in it) / (len(s1) / div)


if __name__ == "__main__":
    res = [["species", "bases", "codons"]]
    for species in ["h_sapiens_9606", "m_musculus_10090"]:
        freq = python_codon_tables.get_codons_table(species)
        # Map codons to most frequent codon in this species this
        # does the same as the  `use_most_frequent` function
        subs = {k: max(d, key=d.get) for d in freq.values() for k in d.keys()}
        conv = {k: r for r, t in freq.items() for k in t.keys()}

        new_seq, vac_seq = [], []
        with open("side-by-side.csv") as fh:
            _ = next(fh)  # Skip headers
            for _, vir, vac in map(reader, fh):
                if conv[vir] != conv[vac]:
                    vir = vac
                new_seq.append(subs[vir])
                vac_seq.append(vac)

        new_prot = "".join(map(conv.get, new_seq))
        vac_prot = "".join(map(conv.get, vac_seq))
        new_seq = "".join(new_seq)
        vac_seq = "".join(vac_seq)
        assert new_prot == vac_prot
        res.append(
            [
                f"{species}",
                f"{pc_match(vac_seq, new_seq):.2%}",
                f"{pc_match(vac_seq, new_seq, True):.2%}",
            ]
        )
    m = list(max(map(len, x)) + 2 for x in list(map(list, zip(*res))))
    m[0] -= 2
    s = "{:>{}}" * len(m)
    for x in res:
        c = x + m
        c[::2] = x
        c[1::2] = m
        print(s.format(*c))

When comparing bases instead of codons the score goes up to 90.97% and is slightly better using Mus musculus at 91.08%

         species   bases  codons
  h_sapiens_9606  90.97%  78.26%
m_musculus_10090  91.08%  78.57%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment