{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Normalizing Ref Alleles"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Supporting Functions"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from collections import namedtuple",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## Normalization step 1: Trim\ndef trim(allele1, allele2):\n def find_common_prefix(seq1, seq2):\n l = min(len(seq1), len(seq2))\n mod = 0\n i = 0\n for i in range(l):\n if seq1[i] == seq2[i]:\n mod = 1\n continue\n mod = 0\n break\n i += mod\n common_prefix = seq1[:i]\n seq1 = seq1[i:]\n seq2 = seq2[i:]\n return (common_prefix, seq1, seq2)\n \n Resp = namedtuple('TrimData', \n ['trimmed_allele_1', 'trimmed_allele_2',\n 'common_prefix', 'common_suffix'])\n \n ra1 = allele1[::-1] # reversed Allele 1\n ra2 = allele2[::-1] # reversed Allele 2\n \n rev_suffix, ra1, ra2 = find_common_prefix(ra1, ra2)\n allele1 = ra1[::-1]\n allele2 = ra2[::-1]\n common_suffix = rev_suffix[::-1]\n \n common_prefix, allele1, allele2 = find_common_prefix(allele1, allele2)\n \n return(Resp(allele1, allele2, common_prefix, common_suffix))\n\n# Sanity checks\ntrim_data = trim('AAACTA', 'AAGTACTA')\nassert trim_data.common_prefix == 'AA'\nassert trim_data.common_suffix == 'ACTA'\nassert trim_data.trimmed_allele_1 == ''\nassert trim_data.trimmed_allele_2 == 'GT'\n\ntrim_data = trim('AAGTACTA', 'AAACTA')\nassert trim_data.common_prefix == 'AA'\nassert trim_data.common_suffix == 'ACTA'\nassert trim_data.trimmed_allele_1 == 'GT'\nassert trim_data.trimmed_allele_2 == ''\n\ntrim_data = trim('AAAA', 'AAAA')\nassert trim_data.common_prefix == ''\nassert trim_data.common_suffix == 'AAAA'\nassert trim_data.trimmed_allele_1 == ''\nassert trim_data.trimmed_allele_2 == ''\n\ntrim_data = trim('ACTGA', 'AGGGA')\nassert trim_data.common_prefix == 'A'\nassert trim_data.common_suffix == 'GA'\nassert trim_data.trimmed_allele_1 == 'CT'\nassert trim_data.trimmed_allele_2 == 'GG'\n\ntrim_data = trim('CA', 'CAGCA')\nassert trim_data.common_prefix == ''\nassert trim_data.common_suffix == 'CA'\nassert trim_data.trimmed_allele_1 == ''\nassert trim_data.trimmed_allele_2 == 'CAG'",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## Normalization step 2: bounds of ambiguity\nfrom itertools import cycle\n\ndef find_ambiguity_interval(ref, start, stop, roller_seq):\n assert start <= stop\n right_roller = cycle(roller_seq)\n left_roller = cycle(roller_seq[::-1])\n left_bound = start\n right_bound = stop\n while left_bound > 0:\n left_residue = ref[left_bound - 1]\n right_cycle_residue = next(left_roller)\n if left_residue == right_cycle_residue:\n left_bound -= 1\n else:\n break\n while right_bound < len(ref):\n right_residue = ref[right_bound]\n left_cycle_residue = next(right_roller)\n if right_residue == left_cycle_residue:\n right_bound += 1\n else:\n break\n return(left_bound - start, right_bound - stop)",
"execution_count": 3,
"outputs": []
},
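{
"metadata": {},
"cell_type": "markdown",
"source": "A quick sanity check on the rolling logic (illustrative values, using the `TCAGCAGCT` sequence from the VRS example later in this workbook): `find_ambiguity_interval` returns a non-positive left step and a non-negative right step describing how far the interval can roll along the reference."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Sanity check: an insertion of 'CAG' at position 4 of 'TCAGCAGCT'\n# can roll left to position 1 and right to position 8.\nsteps = find_ambiguity_interval('TCAGCAGCT', 4, 4, 'CAG')\nassert steps == (-3, 4)  # (left step <= 0, right step >= 0)",
"execution_count": null,
"outputs": []
},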
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Python class to define \n# colored text and background \nclass colors: \n reset='\\033[0m'\n class fg: \n black='\\033[30m'\n red='\\033[31m'\n green='\\033[32m'\n orange='\\033[33m'\n blue='\\033[34m'\n purple='\\033[35m'\n cyan='\\033[36m'\n lightgrey='\\033[37m'\n darkgrey='\\033[90m'\n lightred='\\033[91m'\n lightgreen='\\033[92m'\n yellow='\\033[93m'\n lightblue='\\033[94m'\n pink='\\033[95m'\n lightcyan='\\033[96m'\n class bg: \n black='\\033[40m'\n red='\\033[41m'\n green='\\033[42m'\n yellow='\\033[103m'\n orange='\\033[43m'\n blue='\\033[44m'\n purple='\\033[45m'\n cyan='\\033[46m'\n lightgrey='\\033[47m'\n",
"execution_count": 4,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Normalization routines"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def current_normalization(ref, interval, state):\n ref_start, ref_stop = interval\n ref_allele = ref[ref_start:ref_stop]\n trim_data = trim(ref_allele, state)\n start = ref_start + len(trim_data.common_prefix)\n stop = ref_stop - len(trim_data.common_suffix)\n a1 = trim_data.trimmed_allele_1\n a2 = trim_data.trimmed_allele_2\n if a1 and a2:\n # Substitution\n pass\n elif a1 or a2:\n # Requires overprecision correction\n if a1:\n seq = a1\n else:\n seq = a2\n steps = find_ambiguity_interval(ref, start, stop, seq)\n prepend = ref[start + steps[0]:start]\n append = ref[stop:stop + steps[1]]\n a1 = prepend + a1 + append\n a2 = prepend + a2 + append\n start = start + steps[0]\n stop = stop + steps[1]\n else:\n # Reference match, not handled\n raise ValueError(\"Sequence state precisely matches reference.\")\n Resp = namedtuple('NormalizationStatus', ['interval', 'ref_seq', 'state'])\n return(Resp((start, stop), a1, a2))",
"execution_count": 26,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def new_normalization(ref, interval, state):\n ref_start, ref_stop = interval\n s = ref_stop - ref_start\n assert s >= 0 and s + len(state) > 0 # Do we want to allow 0-interval, empty seq?\n ref_allele = ref[ref_start:ref_stop]\n trim_data = trim(ref_allele, state)\n start = ref_start + len(trim_data.common_prefix)\n stop = ref_stop - len(trim_data.common_suffix)\n a1 = trim_data.trimmed_allele_1\n a2 = trim_data.trimmed_allele_2\n #### NEW CODE - Logic changes ####\n if a1 and a2: \n # No overprecision correction when 2 alleles after trim\n pass\n else: \n # Overprecision correction otherwise\n if a1:\n seq = a1\n elif a2:\n seq = a2\n else:\n # Fully concordant, restore reference trim\n seq = state\n start, stop = interval\n a1 = seq\n a2 = seq\n assert len(seq) == stop - start\n steps = find_ambiguity_interval(ref, start, stop, seq)\n prepend = ref[start + steps[0]:start]\n append = ref[stop:stop + steps[1]]\n a1 = prepend + a1 + append\n a2 = prepend + a2 + append\n start = start + steps[0]\n stop = stop + steps[1]\n \n #### NEW CODE - Restore trimmed for concordant prefix /suffix ####\n if ref_start < start:\n prepend = ref[ref_start:start]\n start = ref_start\n a1 = prepend + a1\n a2 = prepend + a2\n if ref_stop > stop:\n append = ref[stop:ref_stop]\n stop = ref_stop\n a1 = a1 + append\n a2 = a2 + append\n \n #### END NEW CODE ####\n \n Resp = namedtuple('NormalizationStatus', ['interval', 'ref_seq', 'state'])\n return(Resp((start, stop), a1, a2))",
"execution_count": 74,
"outputs": []
},
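{
"metadata": {},
"cell_type": "markdown",
"source": "A minimal check of both routines on a plain substitution (illustrative values): a state that is fully discordant with the reference interval is returned unchanged, as a `NormalizationStatus` of `(interval, ref_seq, state)`."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Plain substitution: neither routine should alter the interval or state.\nr = current_normalization('GATTACA', (2, 4), 'GG')\nassert r == ((2, 4), 'TT', 'GG')\nassert new_normalization('GATTACA', (2, 4), 'GG') == r",
"execution_count": null,
"outputs": []
},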
{
"metadata": {},
"cell_type": "markdown",
"source": "## Test cases\n\n### Test function definitions"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def compare_normalization(ref, interval, state, verbose=True):\n try:\n current = current_normalization(ref, interval, state)\n except ValueError:\n return None\n new = new_normalization(ref, interval, state)\n if verbose:\n fstring = \"Interval: ({interval[0]:2},{interval[1]:2}) ref: {ref_seq:15} state: {state:15}\"\n print(fstring.format(**current._asdict()))\n print(fstring.format(**new._asdict()))\n ci = current.interval\n ni = new.interval\n matched = ci == ni and current.state == new.state\n if ci[1]-ci[0] > interval[1]-interval[0]:\n current_interval_size = 1\n elif ci[1]-ci[0] < interval[1]-interval[0]:\n current_interval_size = -1\n else:\n current_interval_size = 0\n if ni[1]-ni[0] > interval[1]-interval[0]:\n new_interval_size = 1\n elif ni[1]-ni[0] < interval[1]-interval[0]:\n new_interval_size = -1\n else:\n new_interval_size = 0\n Resp = namedtuple('CompareResult', \n ['current_interval_size', 'new_interval_size', 'matched', 'c_result', 'n_result'])\n return Resp(current_interval_size, new_interval_size, matched, current, new)",
"execution_count": 75,
"outputs": []
},
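{
"metadata": {},
"cell_type": "markdown",
"source": "The `current_interval_size` and `new_interval_size` fields encode how each normalized interval compares with the input interval: -1 if it shrank, 0 if unchanged, +1 if it grew. A quick check with the VRS example values, where both routines grow the interval:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# VRS example: both routines expand the interval, so both size codes are +1.\nr = compare_normalization('TCAGCAGCT', (4, 6), 'CAGCA', verbose=False)\nassert r.matched\nassert r.current_interval_size == 1 and r.new_interval_size == 1",
"execution_count": null,
"outputs": []
},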
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def view_alignment(ref, interval, state):\n fstring = \"Params :: Interval: ({1:2},{2:2}) ref: {0:15} state: {3}\"\n fstring2 = \"current:: Interval: ({0:2},{1:2}) ref: \"\n fstring3 = \"new :: Interval: ({0:2},{1:2}) ref: \"\n \n e = (ref, interval, state)\n start, stop = interval\n \n current = current_normalization(*e)\n new = new_normalization(*e)\n ref_colored = ref[:start] + colors.fg.red +\\\n ref[start:stop] + colors.reset + ref[stop:]\n\n print(fstring.format(ref_colored, start, stop, state))\n\n s2 = fstring2.format(*(current.interval))\n s2 += ' ' * current.interval[0] + current.ref_seq + \\\n ' ' * (15 - current.interval[1]) + f' state: {current.state}'\n print(s2)\n s3 = fstring3.format(*(new.interval))\n s3 += ' ' * new.interval[0] + new.ref_seq + \\\n ' ' * (15 - new.interval[1]) + f' state: {new.state}'\n print(s3, \"\\n\")\n",
"execution_count": 89,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def view_examples(example_list):\n for e in example_list:\n view_alignment(*e)\n",
"execution_count": 98,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### VRS Example\nFirst, we evaluate the example from the [VRS normalization page](https://vr-spec.readthedocs.io/en/1.0/impl-guide/normalization.html#normalization), and see it is unchanged between approaches."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ref = 'TCAGCAGCT'\ninterval = (4, 6)\nstate = 'CAGCA'\n\nr = compare_normalization(ref, interval, state)\nprint(r.matched)",
"execution_count": 90,
"outputs": [
{
"output_type": "stream",
"text": "Interval: ( 1, 8) ref: CAGCAGC state: CAGCAGCAGC \nInterval: ( 1, 8) ref: CAGCAGC state: CAGCAGCAGC \nTrue\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Entirely concordant Alleles\n\nOur current strategy does not normalize Alleles that are entirely [concordant](https://github.com/ga4gh/vr-spec/issues/193#issuecomment-624347578) with the supplied reference."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "try:\n current_normalization(ref, interval, ref[interval[0]:interval[1]])\nexcept ValueError as e:\n print(f\"{type(e)}: {e}\")",
"execution_count": 91,
"outputs": [
{
"output_type": "stream",
"text": "<class 'ValueError'>: Sequence state precisely matches reference.\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The new approach will allow handle this in the same way it handles other (non-overprecise) Alleles:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "new_normalization(ref, interval, ref[interval[0]:interval[1]])",
"execution_count": 92,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 92,
"data": {
"text/plain": "NormalizationStatus(interval=(4, 6), ref_seq='CA', state='CA')"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Prefix / suffix concordant Alleles\n\nOur current strategy normalizes Alleles by trimming off concordant suffixes, and then concordant prefixes. The new strategy does this too, but restores the minimum Allele bounds to the untrimmed sequence before returning.\n\nThe consequence of these divergent strategies can be seen when two Alleles with different length concordant prefix (or suffix) normalize to the same allele in our current implementation, but different Alleles in the new implementation."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "examples = list()\n\nref = 'GATTACA'\n\nexamples.append((ref, (1, 4), 'ATTC'))\nexamples.append((ref, (0, 4), 'GATTC'))\n\nview_examples(examples)",
"execution_count": 103,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: ( 1, 4) ref: G\u001b[31mATT\u001b[0mACA state: ATTC\ncurrent:: Interval: ( 4, 4) ref: state: C\nnew :: Interval: ( 1, 4) ref: ATT state: ATTC \n\nParams :: Interval: ( 0, 4) ref: \u001b[31mGATT\u001b[0mACA state: GATTC\ncurrent:: Interval: ( 4, 4) ref: state: C\nnew :: Interval: ( 0, 4) ref: GATT state: GATTC \n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Effect on overprecision correction\n\nThis has no effect on our ability to correct overprecision. Here are some examples to illustrate this:\n\nFlanking reference in overprecise insertion event:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "examples = list()\n\nref = 'GGATCATC'\n\nexamples.append((ref, (1,2), 'GATC'))\nexamples.append((ref, (0,2), 'GGATC'))\n\nview_examples(examples)",
"execution_count": 105,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: ( 1, 2) ref: G\u001b[31mG\u001b[0mATCATC state: GATC\ncurrent:: Interval: ( 2, 8) ref: ATCATC state: ATCATCATC\nnew :: Interval: ( 1, 8) ref: GATCATC state: GATCATCATC \n\nParams :: Interval: ( 0, 2) ref: \u001b[31mGG\u001b[0mATCATC state: GGATC\ncurrent:: Interval: ( 2, 8) ref: ATCATC state: ATCATCATC\nnew :: Interval: ( 0, 8) ref: GGATCATC state: GGATCATCATC \n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Evaluating differences from random inputs\n\n### Constrained to sequences with discordant start and stop residues\n\nAlleles where the sequence state is the same length as the reference interval, bounded by alternate alleles, will be unchanged."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from random import choices, randint\n\nk=15\nalt={'A': 'C', 'C': 'G', 'G': 'T', 'T': 'A'}\n\ncomparisons = 0\nexpansions = 0\nfor i in range(1000):\n novel_seq = choices('ACTG', k=k)\n for j in range(20):\n interval = tuple(sorted((randint(0, k),randint(0, k))))\n state = novel_seq[interval[0]:interval[1]]\n if len(state) == 0:\n continue\n state[0] = alt[state[0]]\n state[-1] = alt[state[-1]]\n result = compare_normalization(''.join(novel_seq), \n interval, \n ''.join(state), \n verbose=False)\n assert result.matched is True\n comparisons += 1\n if result.new_interval_size == 1:\n expansions += 1\n\nprint(f'Algorithms produced identical results through {comparisons} comparisons, with {expansions} overprecision corrections.')",
"execution_count": 80,
"outputs": [
{
"output_type": "stream",
"text": "Algorithms produced identical results through 18754 comparisons, with 0 overprecision corrections.\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from random import choices, randint\nfrom collections import Counter, defaultdict\n\nk=15\nc=3\n\nunmatched_conditions = Counter()\nexamples = defaultdict(list)\n\ncomparisons = 0\nn = 50000\nfor i in range(n):\n novel_seq = choices('ACTG', k=k)\n novel_seq_str = ''.join(novel_seq)\n interval = tuple(sorted((randint(0, k),randint(0, k))))\n state = choices('ACTG', k=randint(0, c))\n state_str = ''.join(state)\n if len(state) + (interval[1] - interval[0]) == 0:\n continue\n result = compare_normalization(novel_seq_str, interval, \n state_str, verbose=False)\n if result is None:\n continue\n if result.matched is False:\n key = (result.current_interval_size, result.new_interval_size)\n if unmatched_conditions[key] < 5:\n examples[key].append((novel_seq_str,interval,state_str))\n unmatched_conditions[key] += 1\n comparisons += 1\n\nprint(f'Performed {comparisons} comparisons in {n} tests, with {sum(unmatched_conditions.values())} discrepancies.')",
"execution_count": 81,
"outputs": [
{
"output_type": "stream",
"text": "Performed 48745 comparisons in 50000 tests, with 13860 discrepancies.\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "unmatched_conditions",
"execution_count": 82,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 82,
"data": {
"text/plain": "Counter({(-1, 0): 12465, (1, 1): 289, (-1, 1): 206, (0, 1): 900})"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Trimmed reference\nThe obvious difference is for Alleles with trimmed concordant prefixes or suffixes and no overprecision correction."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def extract_and_view_examples(example_key):\n example_list = examples[example_key]\n view_examples(example_list)\n \nextract_and_view_examples((-1,0))",
"execution_count": 83,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: (10,14) ref: GTACTTTGAG\u001b[31mGTAC\u001b[0mC state: GGC\ncurrent:: Interval: (11,13) ref: TA state: G\nnew :: Interval: (10,14) ref: GTAC state: GGC \n\nParams :: Interval: ( 9,13) ref: CAGAGAGTT\u001b[31mTAAG\u001b[0mTC state: TC\ncurrent:: Interval: (10,13) ref: AAG state: C\nnew :: Interval: ( 9,13) ref: TAAG state: TC \n\nParams :: Interval: ( 7,12) ref: TATGAAC\u001b[31mCTGCC\u001b[0mCTA state: CT\ncurrent:: Interval: ( 9,12) ref: GCC state: \nnew :: Interval: ( 7,12) ref: CTGCC state: CT \n\nParams :: Interval: ( 2, 6) ref: AT\u001b[31mATAC\u001b[0mCTCGTAGCC state: C\ncurrent:: Interval: ( 2, 5) ref: ATA state: \nnew :: Interval: ( 2, 6) ref: ATAC state: C \n\nParams :: Interval: ( 0, 3) ref: \u001b[31mATT\u001b[0mCCACTACGTCAA state: A\ncurrent:: Interval: ( 1, 3) ref: TT state: \nnew :: Interval: ( 0, 3) ref: ATT state: A \n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "After that, cases where the new routine returns a greater where the old routine does not. These are cases where there was a 1 nucleotide ambiguity of where the delins took place, coupled to a 1 nucleotide shared prefix/suffix. "
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "extract_and_view_examples((0,1))",
"execution_count": 84,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: ( 2,11) ref: TG\u001b[31mACTATGAGT\u001b[0mATCA state: T\ncurrent:: Interval: ( 1,10) ref: GACTATGAG state: G\nnew :: Interval: ( 1,11) ref: GACTATGAGT state: GT \n\nParams :: Interval: ( 0, 1) ref: \u001b[31mG\u001b[0mATGAAGCTTTCAAT state: GA\ncurrent:: Interval: ( 1, 2) ref: A state: AA\nnew :: Interval: ( 0, 2) ref: GA state: GAA \n\nParams :: Interval: (10,12) ref: ACCCCGACAT\u001b[31mTA\u001b[0mCCT state: A\ncurrent:: Interval: ( 9,11) ref: TT state: T\nnew :: Interval: ( 9,12) ref: TTA state: TA \n\nParams :: Interval: ( 5, 9) ref: CGAGG\u001b[31mCAGG\u001b[0mCCAGCC state: G\ncurrent:: Interval: ( 4, 8) ref: GCAG state: G\nnew :: Interval: ( 4, 9) ref: GCAGG state: GG \n\nParams :: Interval: ( 8,12) ref: AAGCGCGC\u001b[31mCTGG\u001b[0mTAC state: C\ncurrent:: Interval: ( 9,13) ref: TGGT state: T\nnew :: Interval: ( 8,13) ref: CTGGT state: CT \n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "In an extension of this, we see some cases where there is greater ambiguity and increased overprecision correction, again coupled to a shared prefix / suffix"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "extract_and_view_examples((1,1))",
"execution_count": 85,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: ( 2,10) ref: GA\u001b[31mAGATAACG\u001b[0mGAGAT state: A\ncurrent:: Interval: ( 3,12) ref: GATAACGGA state: GA\nnew :: Interval: ( 2,12) ref: AGATAACGGA state: AGA \n\nParams :: Interval: (11,13) ref: TCAAGCCACAA\u001b[31mAT\u001b[0mTT state: T\ncurrent:: Interval: ( 9,12) ref: AAA state: AA\nnew :: Interval: ( 9,13) ref: AAAT state: AAT \n\nParams :: Interval: ( 3, 7) ref: CCT\u001b[31mTCTA\u001b[0mCAAAGGTC state: A\ncurrent:: Interval: ( 1, 6) ref: CTTCT state: CT\nnew :: Interval: ( 1, 7) ref: CTTCTA state: CTA \n\nParams :: Interval: ( 8,14) ref: AGCTACCG\u001b[31mCTTCGA\u001b[0mT state: A\ncurrent:: Interval: ( 6,13) ref: CGCTTCG state: CG\nnew :: Interval: ( 6,14) ref: CGCTTCGA state: CGA \n\nParams :: Interval: ( 4,10) ref: GGCG\u001b[31mTCACGC\u001b[0mCAATT state: T\ncurrent:: Interval: ( 5,12) ref: CACGCCA state: CA\nnew :: Interval: ( 4,12) ref: TCACGCCA state: TCA \n\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Finally, there are cases where we see a trim-based reduction in the current routine, but an overall increase in the new routine due to overprecision correction."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "extract_and_view_examples((-1,1))",
"execution_count": 86,
"outputs": [
{
"output_type": "stream",
"text": "Params :: Interval: ( 3, 8) ref: CAA\u001b[31mATATG\u001b[0mCGACGTC state: TG\ncurrent:: Interval: ( 2, 6) ref: AATA state: A\nnew :: Interval: ( 2, 8) ref: AATATG state: ATG \n\nParams :: Interval: ( 5, 7) ref: CCTTG\u001b[31mAA\u001b[0mCGGTAGGT state: GAA\ncurrent:: Interval: ( 4, 5) ref: G state: GG\nnew :: Interval: ( 4, 7) ref: GAA state: GGAA \n\nParams :: Interval: (12,14) ref: GAAACATGCTAT\u001b[31mGT\u001b[0mC state: GTC\ncurrent:: Interval: (14,15) ref: C state: CC\nnew :: Interval: (12,15) ref: GTC state: GTCC \n\nParams :: Interval: ( 1, 8) ref: T\u001b[31mTGTGTTC\u001b[0mTAGTGGA state: TG\ncurrent:: Interval: ( 3, 9) ref: TGTTCT state: T\nnew :: Interval: ( 1, 9) ref: TGTGTTCT state: TGT \n\nParams :: Interval: (13,15) ref: GTGAAGCCTCATA\u001b[31mTG\u001b[0m state: ATG\ncurrent:: Interval: (12,13) ref: A state: AA\nnew :: Interval: (12,15) ref: ATG state: AATG \n\n",
"name": "stdout"
}
]
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/364fd1ebcd10133937bb2adc18d4779a"
},
"gist": {
"id": "364fd1ebcd10133937bb2adc18d4779a",
"data": {
"description": "Workbook for testing different normalization strategies",
"public": true
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.8.2",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 4
}