Skip to content

Instantly share code, notes, and snippets.

@spazm
Created March 4, 2019 03:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save spazm/a85f04ebbbf4f3defca4dd6d6bba5490 to your computer and use it in GitHub Desktop.
Save spazm/a85f04ebbbf4f3defca4dd6d6bba5490 to your computer and use it in GitHub Desktop.
use itertools::Itertools;
use std::collections::HashMap;
/// Break `source` into consecutive pieces of `sub_size` characters each.
///
/// Pieces are counted in `char`s rather than bytes. When the number of
/// characters is not a multiple of `sub_size`, the last piece holds the
/// leftover characters and is therefore shorter than the others.
///
/// NOTE(review): `itertools` documents `chunks` as panicking for a size of
/// zero - confirm that callers never pass `sub_size == 0`.
fn sub_strings(source: &str, sub_size: usize) -> Vec<String> {
    // `chunks` is provided by `itertools::Itertools`. The grouping value has
    // to stay alive (and be borrowed) while its groups are being consumed.
    let grouped = source.chars().chunks(sub_size);
    let mut pieces = Vec::new();
    for group in &grouped {
        pieces.push(group.collect::<String>());
    }
    pieces
}
/// Lookup table from RNA codons to amino acid names.
///
/// Keys are codon strings and values are the names they translate to. The
/// table borrows both from the data it was built from (see `parse`).
#[derive(Debug, Clone)]
pub struct CodonsInfo<'a> {
    names: HashMap<&'a str, &'a str>,
}

impl<'a> CodonsInfo<'a> {
    /// Look up the name registered for `codon`.
    ///
    /// Returns `Some(name)` when `codon` is a key of the table and `None`
    /// otherwise. The lookup is an exact string match, so the case of `codon`
    /// must match the case of the key that was registered.
    ///
    /// e.g. with `("AUG", "methionine")` registered, `"AUG"` => `Some("methionine")`.
    ///
    /// The table this type is typically loaded with looks like the following
    /// (the actual contents depend entirely on the pairs handed to `parse`):
    ///
    /// | Codon               | Protein       |
    /// |---------------------|---------------|
    /// | AUG                 | Methionine    |
    /// | UUU, UUC            | Phenylalanine |
    /// | UUA, UUG            | Leucine       |
    /// | UCU, UCC, UCA, UCG  | Serine        |
    /// | UAU, UAC            | Tyrosine      |
    /// | UGU, UGC            | Cysteine      |
    /// | UGG                 | Tryptophan    |
    /// | UAA, UAG, UGA       | STOP          |
    pub fn name_for(&self, codon: &str) -> Option<&'a str> {
        self.names.get(codon).copied()
    }

    /// Translate an RNA strand into the list of amino acid names it encodes.
    ///
    /// The strand is read three characters at a time. Each codon is looked up
    /// with [`CodonsInfo::name_for`]; translation ends early, without error,
    /// as soon as a codon maps to the name `"stop codon"`. Anything after that
    /// codon is never inspected.
    ///
    /// Returns `None` when:
    /// * a codon read before any stop codon is not in the table (this
    ///   includes a trailing fragment shorter than three characters), or
    /// * no amino acid was translated at all (for example an empty strand, or
    ///   a strand that starts with a stop codon).
    pub fn of_rna(&self, rna: &str) -> Option<Vec<&'a str>> {
        let mut proteins = Vec::new();
        for codon in sub_strings(rna, 3) {
            // `?` rejects the whole strand on an unknown codon instead of
            // silently returning whatever had been translated so far.
            match self.name_for(&codon)? {
                "stop codon" => break,
                amino_name => proteins.push(amino_name),
            }
        }
        if proteins.is_empty() {
            None
        } else {
            Some(proteins)
        }
    }
}
/// Consume a collection of pairs of (RNA protein sequence, Amino acid name) to
/// create a CodonsInfo object that can map from RNA protein sequence to Amino acid name
pub fn parse<'a>(pairs: Vec<(&'a str, &'a str)>) -> CodonsInfo<'a> {
let mut names = HashMap::new();
for (rna, amino) in pairs {
names.insert(rna, amino);
}
CodonsInfo { names }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment