shadiakiki1986/try-seqacademy.ipynb

## try-seqacademy.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "try seqacademy.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true,
      "authorship_tag": "ABX9TyOjb1/A6k6mo17GqKVTiKF4",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/shadiakiki1986/3713187107a5c5322e507f0d38e6ff0b/try-seqacademy.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SGyGLv5G0Bbm",
        "colab_type": "text"
      },
      "source": [
        "Based on https://github.com/NCBI-Hackathons/seqacademy/"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UUXbH1y2nisC",
        "colab_type": "text"
      },
      "source": [
        "## Install hisat2"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "egBGvqsxy9SA",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 289
        },
        "outputId": "c210f714-7949-4870-9b81-1ee2c767ac08"
      },
      "source": [
        "%%shell\n",
        "# download hisat2 https://ccb.jhu.edu/software/hisat2/manual.shtml\n",
        "wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip\n",
        "unzip hisat2-2.1.0-Linux_x86_64.zip > /dev/null\n",
        "ls\n",
        "#chmod +x hisat2-2.1.0/\n",
        "mv hisat2-2.1.0/ /usr/local/bin/"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2020-08-05 07:59:55--  ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip\n",
            "           => ‘hisat2-2.1.0-Linux_x86_64.zip’\n",
            "Resolving ftp.ccb.jhu.edu (ftp.ccb.jhu.edu)... 128.220.174.63\n",
            "Connecting to ftp.ccb.jhu.edu (ftp.ccb.jhu.edu)|128.220.174.63|:21... connected.\n",
            "Logging in as anonymous ... Logged in!\n",
            "==> SYST ... done.    ==> PWD ... done.\n",
            "==> TYPE I ... done.  ==> CWD (1) /pub/infphilo/hisat2/downloads ... done.\n",
            "==> SIZE hisat2-2.1.0-Linux_x86_64.zip ... 27104162\n",
            "==> PASV ... done.    ==> RETR hisat2-2.1.0-Linux_x86_64.zip ... done.\n",
            "Length: 27104162 (26M) (unauthoritative)\n",
            "\n",
            "\r          hisat2-2.   0%[                    ]       0  --.-KB/s               \r         hisat2-2.1  56%[==========>         ]  14.51M  69.2MB/s               \rhisat2-2.1.0-Linux_ 100%[===================>]  25.85M  93.7MB/s    in 0.3s    \n",
            "\n",
            "2020-08-05 07:59:55 (93.7 MB/s) - ‘hisat2-2.1.0-Linux_x86_64.zip’ saved [27104162]\n",
            "\n",
            "hisat2-2.1.0  hisat2-2.1.0-Linux_x86_64.zip  sample_data\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 1
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VEF3wG472uS_",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        },
        "outputId": "14e09e22-a9e6-4a64-897d-3136c17e019c"
      },
      "source": [
        "#%%shell\n",
        "\n",
        "# using PATH=$PATH:... in shell doesn't persist the changes to other cells,\n",
        "# so use os.environ instead\n",
        "#PATH=$PATH:/usr/local/bin/hisat2-2.1.0\n",
        "import os\n",
        "os.environ['PATH'] += \":/usr/local/bin/hisat2-2.1.0\"\n",
        "!echo $PATH"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin:/usr/local/bin/hisat2-2.1.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JL-LDdRq25Xh",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        },
        "outputId": "3be69587-9628-420e-fdeb-785ee6d02857"
      },
      "source": [
        "# check that PATH edit is persistent\n",
        "!echo $PATH"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin:/usr/local/bin/hisat2-2.1.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "L1hZtmnQ1oKE",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 136
        },
        "outputId": "fbfa6b62-913e-4487-8cdc-9661adb0c663"
      },
      "source": [
        "!hisat2-build --version"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "hisat2-build version 2.1.0\n",
            "64-bit\n",
            "Built on login-node03\n",
            "Wed Jun  7 15:52:43 EDT 2017\n",
            "Compiler: gcc version 4.8.2 (GCC) \n",
            "Options: -O3 -m64 -msse2 -funroll-loops -g3 -DPOPCNT_CAPABILITY\n",
            "Sizeof {int, long, long long, void*, size_t, off_t}: {4, 8, 8, 8, 8, 8}\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9obcFKcAoPEW",
        "colab_type": "text"
      },
      "source": [
        "## prep"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7Rm-XEBW6G4l",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 289
        },
        "outputId": "3577c2e3-b88a-49f5-ac5d-3312fdf328f0"
      },
      "source": [
        "%%shell\n",
        "# download seqacademy source\n",
        "wget https://github.com/NCBI-Hackathons/seqacademy/archive/master.zip\n",
        "unzip master.zip > /dev/null\n",
        "mv seqacademy-master seqacademy"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2020-08-05 08:00:07--  https://github.com/NCBI-Hackathons/seqacademy/archive/master.zip\n",
            "Resolving github.com (github.com)... 140.82.114.4\n",
            "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://codeload.github.com/NCBI-Hackathons/seqacademy/zip/master [following]\n",
            "--2020-08-05 08:00:07--  https://codeload.github.com/NCBI-Hackathons/seqacademy/zip/master\n",
            "Resolving codeload.github.com (codeload.github.com)... 140.82.114.9\n",
            "Connecting to codeload.github.com (codeload.github.com)|140.82.114.9|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: unspecified [application/zip]\n",
            "Saving to: ‘master.zip’\n",
            "\n",
            "\rmaster.zip              [<=>                 ]       0  --.-KB/s               \rmaster.zip              [ <=>                ] 109.64K   537KB/s               \rmaster.zip              [  <=>               ]   2.40M  8.04MB/s    in 0.3s    \n",
            "\n",
            "2020-08-05 08:00:08 (8.04 MB/s) - ‘master.zip’ saved [2518828]\n",
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u-PMZUar7BCX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!mkdir test # required for next step"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "S6hulUPRoVwl",
        "colab_type": "text"
      },
      "source": [
        "## hisat2-build (build index)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7IPeQjH1vn_P",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 272
        },
        "outputId": "3e7685ef-71c5-431d-bd5c-69e3bdececda"
      },
      "source": [
        "%%shell\n",
        "# from scripts/chipseq/index.sh\n",
        "\n",
        "mkdir yeast_index\n",
        "cd yeast_index\n",
        "\n",
        "#\n",
        "# Downloads sequences for the latest Yeast release from Ensembl.\n",
        "#\n",
        "# By default, this script builds an index for just the base files,\n",
        "# since alignments to those sequences are the most useful.  To change\n",
        "# which categories are built by this script, edit the CHRS_TO_INDEX\n",
        "# variable below.\n",
        "#\n",
        "\n",
        "ENSEMBL_RELEASE=84\n",
        "ENSEMBL_YEAST_BASE=ftp://ftp.ensembl.org/pub/release-${ENSEMBL_RELEASE}/fasta/saccharomyces_cerevisiae/dna/\n",
        "\n",
        "F=Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa\n",
        "if [ ! -f $F ] ; then\n",
        "\twget ${ENSEMBL_YEAST_BASE}/$F.gz || (echo \"Error getting $F\" && exit 1)\n",
        "\tgunzip $F.gz || (echo \"Error unzipping $F\" && exit 1)\n",
        "\tmv $F genome.fa\n",
        "fi"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2020-08-05 08:00:13--  ftp://ftp.ensembl.org/pub/release-84/fasta/saccharomyces_cerevisiae/dna//Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz\n",
            "           => ‘Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz’\n",
            "Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.139\n",
            "Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.139|:21... connected.\n",
            "Logging in as anonymous ... Logged in!\n",
            "==> SYST ... done.    ==> PWD ... done.\n",
            "==> TYPE I ... done.  ==> CWD (1) /pub/release-84/fasta/saccharomyces_cerevisiae/dna/ ... done.\n",
            "==> SIZE Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz ... 3786708\n",
            "==> PASV ... done.    ==> RETR Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz ... done.\n",
            "Length: 3786708 (3.6M) (unauthoritative)\n",
            "\n",
            "Saccharomyces_cerev 100%[===================>]   3.61M  4.69MB/s    in 0.8s    \n",
            "\n",
            "2020-08-05 08:00:15 (4.69 MB/s) - ‘Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz’ saved [3786708]\n",
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "U9AjyCx7yYCb",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "outputId": "a755e7b4-b4cc-44ad-b60c-29491fd042e7"
      },
      "source": [
        "%%shell\n",
        "# from scripts/chipseq/index.sh\n",
        "\n",
        "hisat2-build yeast_index/genome.fa yeast_index/genome\n",
        "\n",
        "#rm genome.fa"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Settings:\n",
            "  Output files: \"yeast_index/genome.*.ht2\"\n",
            "  Line rate: 6 (line is 64 bytes)\n",
            "  Lines per side: 1 (side is 64 bytes)\n",
            "  Offset rate: 4 (one in 16)\n",
            "  FTable chars: 10\n",
            "  Strings: unpacked\n",
            "  Local offset rate: 3 (one in 8)\n",
            "  Local fTable chars: 6\n",
            "  Local sequence length: 57344\n",
            "  Local sequence overlap between two consecutive indexes: 1024\n",
            "  Endianness: little\n",
            "  Actual local endianness: little\n",
            "  Sanity checking: disabled\n",
            "  Assertions: disabled\n",
            "  Random seed: 0\n",
            "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
            "Input files DNA, FASTA:\n",
            "  yeast_index/genome.fa\n",
            "Reading reference sizes\n",
            "  Time reading reference sizes: 00:00:01\n",
            "Calculating joined length\n",
            "Writing header\n",
            "Reserving space for joined string\n",
            "Joining reference sequences\n",
            "  Time to join reference sequences: 00:00:00\n",
            "  Time to read SNPs and splice sites: 00:00:00\n",
            "Using parameters --bmax 2279457 --dcv 1024\n",
            "  Doing ahead-of-time memory usage test\n",
            "  Passed!  Constructing with these parameters: --bmax 2279457 --dcv 1024\n",
            "Constructing suffix-array element generator\n",
            "Building DifferenceCoverSample\n",
            "  Building sPrime\n",
            "  Building sPrimeOrder\n",
            "  V-Sorting samples\n",
            "  V-Sorting samples time: 00:00:00\n",
            "  Allocating rank array\n",
            "  Ranking v-sort output\n",
            "  Ranking v-sort output time: 00:00:00\n",
            "  Invoking Larsson-Sadakane on ranks\n",
            "  Invoking Larsson-Sadakane on ranks time: 00:00:00\n",
            "  Sanity-checking and returning\n",
            "Building samples\n",
            "Reserving space for 12 sample suffixes\n",
            "Generating random suffixes\n",
            "QSorting 12 sample offsets, eliminating duplicates\n",
            "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
            "Multikey QSorting 12 samples\n",
            "  (Using difference cover)\n",
            "  Multikey QSorting samples time: 00:00:00\n",
            "Calculating bucket sizes\n",
            "Splitting and merging\n",
            "  Splitting and merging time: 00:00:00\n",
            "Split 1, merged 5; iterating...\n",
            "Splitting and merging\n",
            "  Splitting and merging time: 00:00:00\n",
            "Avg bucket size: 1.51964e+06 (target: 2279456)\n",
            "Converting suffix-array elements to index image\n",
            "Allocating ftab, absorbFtab\n",
            "Entering GFM loop\n",
            "Getting block 1 of 8\n",
            "  Reserving size (2279457) for bucket 1\n",
            "  Calculating Z arrays for bucket 1\n",
            "  Entering block accumulator loop for bucket 1:\n",
            "  bucket 1: 10%\n",
            "  bucket 1: 20%\n",
            "  bucket 1: 30%\n",
            "  bucket 1: 40%\n",
            "  bucket 1: 50%\n",
            "  bucket 1: 60%\n",
            "  bucket 1: 70%\n",
            "  bucket 1: 80%\n",
            "  bucket 1: 90%\n",
            "  bucket 1: 100%\n",
            "  Sorting block of length 2000591 for bucket 1\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:01\n",
            "Returning block of 2000592 for bucket 1\n",
            "Getting block 2 of 8\n",
            "  Reserving size (2279457) for bucket 2\n",
            "  Calculating Z arrays for bucket 2\n",
            "  Entering block accumulator loop for bucket 2:\n",
            "  bucket 2: 10%\n",
            "  bucket 2: 20%\n",
            "  bucket 2: 30%\n",
            "  bucket 2: 40%\n",
            "  bucket 2: 50%\n",
            "  bucket 2: 60%\n",
            "  bucket 2: 70%\n",
            "  bucket 2: 80%\n",
            "  bucket 2: 90%\n",
            "  bucket 2: 100%\n",
            "  Sorting block of length 1791111 for bucket 2\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:01\n",
            "Returning block of 1791112 for bucket 2\n",
            "Getting block 3 of 8\n",
            "  Reserving size (2279457) for bucket 3\n",
            "  Calculating Z arrays for bucket 3\n",
            "  Entering block accumulator loop for bucket 3:\n",
            "  bucket 3: 10%\n",
            "  bucket 3: 20%\n",
            "  bucket 3: 30%\n",
            "  bucket 3: 40%\n",
            "  bucket 3: 50%\n",
            "  bucket 3: 60%\n",
            "  bucket 3: 70%\n",
            "  bucket 3: 80%\n",
            "  bucket 3: 90%\n",
            "  bucket 3: 100%\n",
            "  Sorting block of length 1386934 for bucket 3\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:00\n",
            "Returning block of 1386935 for bucket 3\n",
            "Getting block 4 of 8\n",
            "  Reserving size (2279457) for bucket 4\n",
            "  Calculating Z arrays for bucket 4\n",
            "  Entering block accumulator loop for bucket 4:\n",
            "  bucket 4: 10%\n",
            "  bucket 4: 20%\n",
            "  bucket 4: 30%\n",
            "  bucket 4: 40%\n",
            "  bucket 4: 50%\n",
            "  bucket 4: 60%\n",
            "  bucket 4: 70%\n",
            "  bucket 4: 80%\n",
            "  bucket 4: 90%\n",
            "  bucket 4: 100%\n",
            "  Sorting block of length 1309547 for bucket 4\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:00\n",
            "Returning block of 1309548 for bucket 4\n",
            "Getting block 5 of 8\n",
            "  Reserving size (2279457) for bucket 5\n",
            "  Calculating Z arrays for bucket 5\n",
            "  Entering block accumulator loop for bucket 5:\n",
            "  bucket 5: 10%\n",
            "  bucket 5: 20%\n",
            "  bucket 5: 30%\n",
            "  bucket 5: 40%\n",
            "  bucket 5: 50%\n",
            "  bucket 5: 60%\n",
            "  bucket 5: 70%\n",
            "  bucket 5: 80%\n",
            "  bucket 5: 90%\n",
            "  bucket 5: 100%\n",
            "  Sorting block of length 1717347 for bucket 5\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:00\n",
            "Returning block of 1717348 for bucket 5\n",
            "Getting block 6 of 8\n",
            "  Reserving size (2279457) for bucket 6\n",
            "  Calculating Z arrays for bucket 6\n",
            "  Entering block accumulator loop for bucket 6:\n",
            "  bucket 6: 10%\n",
            "  bucket 6: 20%\n",
            "  bucket 6: 30%\n",
            "  bucket 6: 40%\n",
            "  bucket 6: 50%\n",
            "  bucket 6: 60%\n",
            "  bucket 6: 70%\n",
            "  bucket 6: 80%\n",
            "  bucket 6: 90%\n",
            "  bucket 6: 100%\n",
            "  Sorting block of length 1538951 for bucket 6\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:01\n",
            "Returning block of 1538952 for bucket 6\n",
            "Getting block 7 of 8\n",
            "  Reserving size (2279457) for bucket 7\n",
            "  Calculating Z arrays for bucket 7\n",
            "  Entering block accumulator loop for bucket 7:\n",
            "  bucket 7: 10%\n",
            "  bucket 7: 20%\n",
            "  bucket 7: 30%\n",
            "  bucket 7: 40%\n",
            "  bucket 7: 50%\n",
            "  bucket 7: 60%\n",
            "  bucket 7: 70%\n",
            "  bucket 7: 80%\n",
            "  bucket 7: 90%\n",
            "  bucket 7: 100%\n",
            "  Sorting block of length 1421401 for bucket 7\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:01\n",
            "Returning block of 1421402 for bucket 7\n",
            "Getting block 8 of 8\n",
            "  Reserving size (2279457) for bucket 8\n",
            "  Calculating Z arrays for bucket 8\n",
            "  Entering block accumulator loop for bucket 8:\n",
            "  bucket 8: 10%\n",
            "  bucket 8: 20%\n",
            "  bucket 8: 30%\n",
            "  bucket 8: 40%\n",
            "  bucket 8: 50%\n",
            "  bucket 8: 60%\n",
            "  bucket 8: 70%\n",
            "  bucket 8: 80%\n",
            "  bucket 8: 90%\n",
            "  bucket 8: 100%\n",
            "  Sorting block of length 991216 for bucket 8\n",
            "  (Using difference cover)\n",
            "  Sorting block time: 00:00:00\n",
            "Returning block of 991217 for bucket 8\n",
            "Exited GFM loop\n",
            "fchr[A]: 0\n",
            "fchr[C]: 3766349\n",
            "fchr[G]: 6086925\n",
            "fchr[T]: 8404025\n",
            "fchr[$]: 12157105\n",
            "Exiting GFM::buildToDisk()\n",
            "Returning from initFromVector\n",
            "Wrote 8248111 bytes to primary GFM file: yeast_index/genome.1.ht2\n",
            "Wrote 3039284 bytes to secondary GFM file: yeast_index/genome.2.ht2\n",
            "Re-opening _in1 and _in2 as input streams\n",
            "Returning from GFM constructor\n",
            "Returning from initFromVector\n",
            "Wrote 5399069 bytes to primary GFM file: yeast_index/genome.5.ht2\n",
            "Wrote 3092708 bytes to secondary GFM file: yeast_index/genome.6.ht2\n",
            "Re-opening _in5 and _in5 as input streams\n",
            "Returning from HierEbwt constructor\n",
            "Headers:\n",
            "    len: 12157105\n",
            "    gbwtLen: 12157106\n",
            "    nodes: 12157106\n",
            "    sz: 3039277\n",
            "    gbwtSz: 3039277\n",
            "    lineRate: 6\n",
            "    offRate: 4\n",
            "    offMask: 0xfffffff0\n",
            "    ftabChars: 10\n",
            "    eftabLen: 0\n",
            "    eftabSz: 0\n",
            "    ftabLen: 1048577\n",
            "    ftabSz: 4194308\n",
            "    offsLen: 759820\n",
            "    offsSz: 3039280\n",
            "    lineSz: 64\n",
            "    sideSz: 64\n",
            "    sideGbwtSz: 48\n",
            "    sideGbwtLen: 192\n",
            "    numSides: 63319\n",
            "    numLines: 63319\n",
            "    gbwtTotLen: 4052416\n",
            "    gbwtTotSz: 4052416\n",
            "    reverse: 0\n",
            "    linearFM: Yes\n",
            "Total time for call to driver() for forward index: 00:00:10\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZtaZTxNCncoo",
        "colab_type": "text"
      },
      "source": [
        "## chipseq"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WSik-38b56Mv",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 105
        },
        "outputId": "7614c30f-ecfe-4187-97f3-c0eaa3cd8972"
      },
      "source": [
        "# from scripts/chipseq/runchipseq.py\n",
        "\n",
        "import subprocess\n",
        "import os\n",
        "from pandas import read_csv\n",
        "\n",
        "\"\"\"\n",
        "Run the ChIP-Seq aligner using HISAT\n",
        "\"\"\"\n",
        "\n",
        "ChIPSeqSRARunTableFile='seqacademy/data/ChIPSeqSRA.tsv'\n",
        "ChIPSeqSRATable = read_csv(ChIPSeqSRARunTableFile, delimiter='\\t')\n",
        "ChIPSeqoutrun = (ChIPSeqSRATable[\"Run\"])\n",
        "ChIPSeqoutputSam = \"test/\" + ChIPSeqoutrun + \".sam\"\n",
        "ChIPSeqoutputAlignmentSummary = \"test/\" + ChIPSeqoutrun + \".txt\"\n",
        "ChIPSeqoutputMetrics = \"test/\" + ChIPSeqoutrun + \".metrics\"\n",
        "ChIPSeqoutputSortBam = \"test/\" + ChIPSeqoutrun + \".sorted.bam\"\n",
        "\n",
        "for index, individual in enumerate(ChIPSeqoutrun):\n",
        "    run = ChIPSeqoutrun[index]\n",
        "    summary = ChIPSeqoutputAlignmentSummary[index] \n",
        "    metrics = ChIPSeqoutputMetrics[index]\n",
        "    sam = ChIPSeqoutputSam[index]\n",
        "    bam = ChIPSeqoutputSortBam[index]\n",
        "    index = \"yeast_index/genome\"\n",
        "    cmduse = \"hisat2 -u 10000 -p 2 -x {0} --sra-acc {1} --new-summary --summary-file {2} --met-file {3} -S {4}\".format(index, run, summary, metrics, sam)\n",
        "    print(cmduse)\n",
        "    #os.system(cmduse) # just printing the command rather than running it, for the sake of running it manually in the cell below"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703656 --new-summary --summary-file test/SRR6703656.txt --met-file test/SRR6703656.metrics -S test/SRR6703656.sam\n",
            "hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703661 --new-summary --summary-file test/SRR6703661.txt --met-file test/SRR6703661.metrics -S test/SRR6703661.sam\n",
            "hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703662 --new-summary --summary-file test/SRR6703662.txt --met-file test/SRR6703662.metrics -S test/SRR6703662.sam\n",
            "hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703663 --new-summary --summary-file test/SRR6703663.txt --met-file test/SRR6703663.metrics -S test/SRR6703663.sam\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "M6vE-stH7KA4",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        },
        "outputId": "8a50d539-5640-4da7-f21d-5722a01ae996"
      },
      "source": [
        "# run one of the commands from above\n",
        "!hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703656 --new-summary --summary-file test/SRR6703656.txt --met-file test/SRR6703656.metrics -S test/SRR6703656.sam"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "HISAT2 summary stats:\n",
            "\tTotal pairs: 10000\n",
            "\t\tAligned concordantly or discordantly 0 time: 1567 (15.67%)\n",
            "\t\tAligned concordantly 1 time: 7092 (70.92%)\n",
            "\t\tAligned concordantly >1 times: 1280 (12.80%)\n",
            "\t\tAligned discordantly 1 time: 61 (0.61%)\n",
            "\tTotal unpaired reads: 3134\n",
            "\t\tAligned 0 time: 2027 (64.68%)\n",
            "\t\tAligned 1 time: 918 (29.29%)\n",
            "\t\tAligned >1 times: 189 (6.03%)\n",
            "\tOverall alignment rate: 89.86%\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "X5vKLSa5sRkU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L6c_VubVpdcV",
        "colab_type": "text"
      },
      "source": [
        "## install htseq"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "b2Hfsn9ApexI",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 190
        },
        "outputId": "5f424381-2c52-4e47-db2c-676ea6bfb71d"
      },
      "source": [
        "!pip install HTSeq"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting HTSeq\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/53/ca/287519f54993e1ed1d7fcd773baac8a05d710b90b7ca5970c4664c686810/HTSeq-0.12.4-cp36-cp36m-manylinux2010_x86_64.whl (1.4MB)\n",
            "\u001b[K     |████████████████████████████████| 1.4MB 2.8MB/s \n",
            "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from HTSeq) (1.18.5)\n",
            "Collecting pysam\n",
            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/87/a1/73e80a7a873f3fb0e52d368a4343eb9882b737c932b95020d82251f1087e/pysam-0.16.0.1-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)\n",
            "\u001b[K     |████████████████████████████████| 10.0MB 16.6MB/s \n",
            "\u001b[?25hInstalling collected packages: pysam, HTSeq\n",
            "Successfully installed HTSeq-0.12.4 pysam-0.16.0.1\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0Mhri2nPrTRS",
        "colab_type": "text"
      },
      "source": [
        "## install samtools"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vPeZMsctrUgP",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "%%shell\n",
        "# compile from source\n",
        "#wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2\n",
        "#bunzip2 samtools-1.10.tar.bz2\n",
        "#tar -xf samtools-1.10.tar\n",
        "#ls samtools-1.10\n",
        "\n",
        "# install from ubuntu ppa\n",
        "apt-get install -q samtools\n",
        "samtools --version"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MaKyHQoxr5MG",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 68
        },
        "outputId": "22caa892-bd04-4bc4-9734-ef2e4ce4b7c0"
      },
      "source": [
        "!samtools --version"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "samtools 1.7\n",
            "Using htslib 1.7-2\n",
            "Copyright (C) 2018 Genome Research Ltd.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1ZEHv_eOnthK",
        "colab_type": "text"
      },
      "source": [
        "## rnaseq\n",
        "\n",
        "https://github.com/NCBI-Hackathons/seqacademy/blob/master/rnaseq.md"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IM0l5GqNnu49",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        },
        "outputId": "67c0ba58-ae78-419e-fb10-524f328250e3"
      },
      "source": [
        "# scripts/rnaseq/runrnaseq.py\n",
        "\n",
        "from pandas import read_csv\n",
        "import os\n",
        "\n",
        "RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n",
        "RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n",
        "RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n",
        "RNASeqoutputSam = \"test/\" + RNASeqoutrun + \".sam\"\n",
        "RNASeqoutputAlignmentSummary = \"test/\" + RNASeqoutrun + \".txt\"\n",
        "RNASeqoutputMetrics = \"test/\" + RNASeqoutrun + \".metrics\"\n",
        "RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n",
        "\n",
        "for index, individual in enumerate(RNASeqoutrun):\n",
        "    run = RNASeqoutrun[index]\n",
        "    summary = RNASeqoutputAlignmentSummary[index] \n",
        "    metrics = RNASeqoutputMetrics[index]\n",
        "    sam = RNASeqoutputSam[index]\n",
        "    bam = RNASeqoutputSortBam[index]\n",
        "    cmduse = \"hisat2 -u 2500 -x yeast_index/genome --sra-acc {0} --new-summary --summary-file {1} --met-file {2} -S {3}\".format(run, summary, metrics, sam)\n",
        "    print(cmduse)\n",
        "    #os.system(cmduse)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494627 --new-summary --summary-file test/SRR5494627.txt --met-file test/SRR5494627.metrics -S test/SRR5494627.sam\n",
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494628 --new-summary --summary-file test/SRR5494628.txt --met-file test/SRR5494628.metrics -S test/SRR5494628.sam\n",
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494629 --new-summary --summary-file test/SRR5494629.txt --met-file test/SRR5494629.metrics -S test/SRR5494629.sam\n",
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494630 --new-summary --summary-file test/SRR5494630.txt --met-file test/SRR5494630.metrics -S test/SRR5494630.sam\n",
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494631 --new-summary --summary-file test/SRR5494631.txt --met-file test/SRR5494631.metrics -S test/SRR5494631.sam\n",
            "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494632 --new-summary --summary-file test/SRR5494632.txt --met-file test/SRR5494632.metrics -S test/SRR5494632.sam\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kycvWwXpozjo",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        },
        "outputId": "eb6325f0-90f7-4940-8fe8-5c7a24ca40ae"
      },
      "source": [
        "%%shell\n",
        "# run one command from above\n",
        "hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494627 --new-summary --summary-file test/SRR5494627.txt --met-file test/SRR5494627.metrics -S test/SRR5494627.sam"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "HISAT2 summary stats:\n",
            "\tTotal reads: 2500\n",
            "\t\tAligned 0 time: 129 (5.16%)\n",
            "\t\tAligned 1 time: 2164 (86.56%)\n",
            "\t\tAligned >1 times: 207 (8.28%)\n",
            "\tOverall alignment rate: 94.84%\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KDGrzmyipfpw",
        "colab_type": "text"
      },
      "source": [
        "## htseq-count"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rdRlohY6r-BY",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        },
        "outputId": "adeb43a5-e8e6-4e09-9f38-3cbddd375948"
      },
      "source": [
        "# from scripts/rnaseq/samtools.py\n",
        "import os\n",
        "from pandas import read_csv\n",
        "\n",
        "RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n",
        "RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n",
        "RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n",
        "RNASeqoutputSam = \"test/\" + RNASeqoutrun + \".sam\"\n",
        "RNASeqoutputAlignmentSummary = \"test/\" + RNASeqoutrun + \".txt\"\n",
        "RNASeqoutputMetrics = \"test/\" + RNASeqoutrun + \".metrics\"\n",
        "RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n",
        "\n",
        "for index, individual in enumerate(RNASeqoutrun):\n",
        "    run = RNASeqoutrun[index]\n",
        "    summary = RNASeqoutputAlignmentSummary[index] \n",
        "    metrics = RNASeqoutputMetrics[index]\n",
        "    sam = RNASeqoutputSam[index]\n",
        "    bam = RNASeqoutputSortBam[index]\n",
        "    cmduse = \"samtools view -bSF4 {0} | samtools sort -o {1}\".format(sam, bam)\n",
        "    print(cmduse)\n",
        "    #os.system(cmduse)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "samtools view -bSF4 test/SRR5494627.sam | samtools sort -o test/SRR5494627.sorted.bam\n",
            "samtools view -bSF4 test/SRR5494628.sam | samtools sort -o test/SRR5494628.sorted.bam\n",
            "samtools view -bSF4 test/SRR5494629.sam | samtools sort -o test/SRR5494629.sorted.bam\n",
            "samtools view -bSF4 test/SRR5494630.sam | samtools sort -o test/SRR5494630.sorted.bam\n",
            "samtools view -bSF4 test/SRR5494631.sam | samtools sort -o test/SRR5494631.sorted.bam\n",
            "samtools view -bSF4 test/SRR5494632.sam | samtools sort -o test/SRR5494632.sorted.bam\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-A2e-8vasEKl",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# run one command\n",
        "!samtools view -bSF4 test/SRR5494627.sam | samtools sort -o test/SRR5494627.sorted.bam"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Xas48RrGs9hd",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "ff8f48bc-7e82-44c7-ee27-c7d447ef50c8"
      },
      "source": [
        "!du -sh test/SRR5494627.sorted.bam"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "96K\ttest/SRR5494627.sorted.bam\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m_AQg3nooFsQ",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        },
        "outputId": "f295de9c-8a01-47ee-8c42-26957e4c0f9e"
      },
      "source": [
        "gtf = \"data/Saccharomyces_cerevisiae.R64-1-1.84.gtf\"\n",
        "\n",
        "RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n",
        "RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n",
        "RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n",
        "RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n",
        "\n",
        "for index, individual in enumerate(RNASeqoutputSortBam):\n",
        "    input = individual\n",
        "    output = individual.replace(\".sorted.bam\", \"\") + \".genecount.txt\"\n",
        "    cmduse = \"htseq-count -m intersection-nonempty -s no -f bam %s %s > %s\" % (input, gtf, output)\n",
        "    print(cmduse)\n",
        "    #os.system(cmduse)"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494627.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494627.genecount.txt\n",
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494628.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494628.genecount.txt\n",
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494629.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494629.genecount.txt\n",
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494630.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494630.genecount.txt\n",
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494631.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494631.genecount.txt\n",
            "htseq-count -m intersection-nonempty -s no -f bam test/SRR5494632.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494632.genecount.txt\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-Ro3_TDBsgMD",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "13587569-9cbb-4dee-f995-cc48703a8c29"
      },
      "source": [
        "!du -sh test/SRR5494627.sorted.bam"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "96K\ttest/SRR5494627.sorted.bam\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Jyr2tC0Wo8G6",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 85
        },
        "outputId": "1471acea-3806-4f90-e456-c01054c23be4"
      },
      "source": [
        "%%shell\n",
        "# try one command\n",
        "htseq-count -m intersection-nonempty -s no test/SRR5494627.sorted.bam seqacademy/data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494627.genecount.txt"
      ],
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[E::idx_find_and_load] Could not retrieve index file for 'test/SRR5494627.sorted.bam'\n",
            "42071 GFF lines processed.\n",
            "[E::idx_find_and_load] Could not retrieve index file for 'test/SRR5494627.sorted.bam'\n",
            "2701 alignments  processed.\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              ""
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 39
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aJFgh2fKsvCx",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "72e08a5a-c647-4682-a210-87fd7cb65c63"
      },
      "source": [
        "!du -sh test/SRR5494627.genecount.txt"
      ],
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "72K\ttest/SRR5494627.genecount.txt\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oYXODwUexqWG",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 187
        },
        "outputId": "bc2cce62-93c3-4bd1-b2e2-3f1fd4563d07"
      },
      "source": [
        "!head test/SRR5494627.genecount.txt"
      ],
      "execution_count": 43,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "15S_rRNA\t0\n",
            "21S_rRNA\t0\n",
            "HRA1\t0\n",
            "ICR1\t0\n",
            "LSR1\t0\n",
            "NME1\t0\n",
            "PWR1\t0\n",
            "Q0010\t0\n",
            "Q0017\t0\n",
            "Q0032\t0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "btjqISzyxsfV",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 187
        },
        "outputId": "326e865f-d809-485f-9242-94f580b3267d"
      },
      "source": [
        "# check if all 0's or not\n",
        "!cat test/SRR5494627.genecount.txt|awk '{print $2}'|sort|uniq|head"
      ],
      "execution_count": 46,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "0\n",
            "1\n",
            "10\n",
            "11\n",
            "12\n",
            "13\n",
            "147\n",
            "16\n",
            "19\n",
            "2\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}