Last active
November 6, 2023 13:59
-
-
Save shadiakiki1986/3713187107a5c5322e507f0d38e6ff0b to your computer and use it in GitHub Desktop.
try seqacademy.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "try seqacademy.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"authorship_tag": "ABX9TyOjb1/A6k6mo17GqKVTiKF4", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/shadiakiki1986/3713187107a5c5322e507f0d38e6ff0b/try-seqacademy.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "SGyGLv5G0Bbm", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Based on https://github.com/NCBI-Hackathons/seqacademy/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "UUXbH1y2nisC", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## Install hisat2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "egBGvqsxy9SA", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 289 | |
}, | |
"outputId": "c210f714-7949-4870-9b81-1ee2c767ac08" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# download hisat2 https://ccb.jhu.edu/software/hisat2/manual.shtml\n", | |
"wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip\n", | |
"unzip hisat2-2.1.0-Linux_x86_64.zip > /dev/null\n", | |
"ls\n", | |
"#chmod +x hisat2-2.1.0/\n", | |
"mv hisat2-2.1.0/ /usr/local/bin/" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-08-05 07:59:55-- ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip\n", | |
" => ‘hisat2-2.1.0-Linux_x86_64.zip’\n", | |
"Resolving ftp.ccb.jhu.edu (ftp.ccb.jhu.edu)... 128.220.174.63\n", | |
"Connecting to ftp.ccb.jhu.edu (ftp.ccb.jhu.edu)|128.220.174.63|:21... connected.\n", | |
"Logging in as anonymous ... Logged in!\n", | |
"==> SYST ... done. ==> PWD ... done.\n", | |
"==> TYPE I ... done. ==> CWD (1) /pub/infphilo/hisat2/downloads ... done.\n", | |
"==> SIZE hisat2-2.1.0-Linux_x86_64.zip ... 27104162\n", | |
"==> PASV ... done. ==> RETR hisat2-2.1.0-Linux_x86_64.zip ... done.\n", | |
"Length: 27104162 (26M) (unauthoritative)\n", | |
"\n", | |
"\r hisat2-2. 0%[ ] 0 --.-KB/s \r hisat2-2.1 56%[==========> ] 14.51M 69.2MB/s \rhisat2-2.1.0-Linux_ 100%[===================>] 25.85M 93.7MB/s in 0.3s \n", | |
"\n", | |
"2020-08-05 07:59:55 (93.7 MB/s) - ‘hisat2-2.1.0-Linux_x86_64.zip’ saved [27104162]\n", | |
"\n", | |
"hisat2-2.1.0 hisat2-2.1.0-Linux_x86_64.zip sample_data\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 1 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VEF3wG472uS_", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
}, | |
"outputId": "14e09e22-a9e6-4a64-897d-3136c17e019c" | |
}, | |
"source": [ | |
"#%%shell\n", | |
"\n", | |
"# using PATH=$PATH:... in shell doesn't persist the changes to other cells,\n", | |
"# so use os.environ instead\n", | |
"#PATH=$PATH:/usr/local/bin/hisat2-2.1.0\n", | |
"import os\n", | |
"os.environ['PATH'] += \":/usr/local/bin/hisat2-2.1.0\"\n", | |
"!echo $PATH" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin:/usr/local/bin/hisat2-2.1.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "JL-LDdRq25Xh", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
}, | |
"outputId": "3be69587-9628-420e-fdeb-785ee6d02857" | |
}, | |
"source": [ | |
"# check that PATH edit is persistent\n", | |
"!echo $PATH" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin:/usr/local/bin/hisat2-2.1.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "L1hZtmnQ1oKE", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 136 | |
}, | |
"outputId": "fbfa6b62-913e-4487-8cdc-9661adb0c663" | |
}, | |
"source": [ | |
"!hisat2-build --version" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"hisat2-build version 2.1.0\n", | |
"64-bit\n", | |
"Built on login-node03\n", | |
"Wed Jun 7 15:52:43 EDT 2017\n", | |
"Compiler: gcc version 4.8.2 (GCC) \n", | |
"Options: -O3 -m64 -msse2 -funroll-loops -g3 -DPOPCNT_CAPABILITY\n", | |
"Sizeof {int, long, long long, void*, size_t, off_t}: {4, 8, 8, 8, 8, 8}\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9obcFKcAoPEW", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## prep" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7Rm-XEBW6G4l", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 289 | |
}, | |
"outputId": "3577c2e3-b88a-49f5-ac5d-3312fdf328f0" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# download seqacademy source\n", | |
"wget https://github.com/NCBI-Hackathons/seqacademy/archive/master.zip\n", | |
"unzip master.zip > /dev/null\n", | |
"mv seqacademy-master seqacademy" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-08-05 08:00:07-- https://github.com/NCBI-Hackathons/seqacademy/archive/master.zip\n", | |
"Resolving github.com (github.com)... 140.82.114.4\n", | |
"Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n", | |
"HTTP request sent, awaiting response... 302 Found\n", | |
"Location: https://codeload.github.com/NCBI-Hackathons/seqacademy/zip/master [following]\n", | |
"--2020-08-05 08:00:07-- https://codeload.github.com/NCBI-Hackathons/seqacademy/zip/master\n", | |
"Resolving codeload.github.com (codeload.github.com)... 140.82.114.9\n", | |
"Connecting to codeload.github.com (codeload.github.com)|140.82.114.9|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: unspecified [application/zip]\n", | |
"Saving to: ‘master.zip’\n", | |
"\n", | |
"\rmaster.zip [<=> ] 0 --.-KB/s \rmaster.zip [ <=> ] 109.64K 537KB/s \rmaster.zip [ <=> ] 2.40M 8.04MB/s in 0.3s \n", | |
"\n", | |
"2020-08-05 08:00:08 (8.04 MB/s) - ‘master.zip’ saved [2518828]\n", | |
"\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "u-PMZUar7BCX", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!mkdir test # required for next step" | |
], | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "S6hulUPRoVwl", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## hisat2-build (build index)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7IPeQjH1vn_P", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 272 | |
}, | |
"outputId": "3e7685ef-71c5-431d-bd5c-69e3bdececda" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# from scripts/chipseq/index.sh\n", | |
"\n", | |
"mkdir yeast_index\n", | |
"cd yeast_index\n", | |
"\n", | |
"#\n", | |
"# Downloads sequences for the latest Yeast release from Ensembl.\n", | |
"#\n", | |
"# By default, this script builds an index for just the base files,\n", | |
"# since alignments to those sequences are the most useful. To change\n", | |
"# which categories are built by this script, edit the CHRS_TO_INDEX\n", | |
"# variable below.\n", | |
"#\n", | |
"\n", | |
"ENSEMBL_RELEASE=84\n", | |
"ENSEMBL_YEAST_BASE=ftp://ftp.ensembl.org/pub/release-${ENSEMBL_RELEASE}/fasta/saccharomyces_cerevisiae/dna/\n", | |
"\n", | |
"F=Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa\n", | |
"if [ ! -f $F ] ; then\n", | |
"\twget ${ENSEMBL_YEAST_BASE}/$F.gz || (echo \"Error getting $F\" && exit 1)\n", | |
"\tgunzip $F.gz || (echo \"Error unzipping $F\" && exit 1)\n", | |
"\tmv $F genome.fa\n", | |
"fi" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-08-05 08:00:13-- ftp://ftp.ensembl.org/pub/release-84/fasta/saccharomyces_cerevisiae/dna//Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz\n", | |
" => ‘Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz’\n", | |
"Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.139\n", | |
"Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.139|:21... connected.\n", | |
"Logging in as anonymous ... Logged in!\n", | |
"==> SYST ... done. ==> PWD ... done.\n", | |
"==> TYPE I ... done. ==> CWD (1) /pub/release-84/fasta/saccharomyces_cerevisiae/dna/ ... done.\n", | |
"==> SIZE Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz ... 3786708\n", | |
"==> PASV ... done. ==> RETR Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz ... done.\n", | |
"Length: 3786708 (3.6M) (unauthoritative)\n", | |
"\n", | |
"Saccharomyces_cerev 100%[===================>] 3.61M 4.69MB/s in 0.8s \n", | |
"\n", | |
"2020-08-05 08:00:15 (4.69 MB/s) - ‘Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz’ saved [3786708]\n", | |
"\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "U9AjyCx7yYCb", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
}, | |
"outputId": "a755e7b4-b4cc-44ad-b60c-29491fd042e7" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# from scripts/chipseq/index.sh\n", | |
"\n", | |
"hisat2-build yeast_index/genome.fa yeast_index/genome\n", | |
"\n", | |
"#rm genome.fa" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Settings:\n", | |
" Output files: \"yeast_index/genome.*.ht2\"\n", | |
" Line rate: 6 (line is 64 bytes)\n", | |
" Lines per side: 1 (side is 64 bytes)\n", | |
" Offset rate: 4 (one in 16)\n", | |
" FTable chars: 10\n", | |
" Strings: unpacked\n", | |
" Local offset rate: 3 (one in 8)\n", | |
" Local fTable chars: 6\n", | |
" Local sequence length: 57344\n", | |
" Local sequence overlap between two consecutive indexes: 1024\n", | |
" Endianness: little\n", | |
" Actual local endianness: little\n", | |
" Sanity checking: disabled\n", | |
" Assertions: disabled\n", | |
" Random seed: 0\n", | |
" Sizeofs: void*:8, int:4, long:8, size_t:8\n", | |
"Input files DNA, FASTA:\n", | |
" yeast_index/genome.fa\n", | |
"Reading reference sizes\n", | |
" Time reading reference sizes: 00:00:01\n", | |
"Calculating joined length\n", | |
"Writing header\n", | |
"Reserving space for joined string\n", | |
"Joining reference sequences\n", | |
" Time to join reference sequences: 00:00:00\n", | |
" Time to read SNPs and splice sites: 00:00:00\n", | |
"Using parameters --bmax 2279457 --dcv 1024\n", | |
" Doing ahead-of-time memory usage test\n", | |
" Passed! Constructing with these parameters: --bmax 2279457 --dcv 1024\n", | |
"Constructing suffix-array element generator\n", | |
"Building DifferenceCoverSample\n", | |
" Building sPrime\n", | |
" Building sPrimeOrder\n", | |
" V-Sorting samples\n", | |
" V-Sorting samples time: 00:00:00\n", | |
" Allocating rank array\n", | |
" Ranking v-sort output\n", | |
" Ranking v-sort output time: 00:00:00\n", | |
" Invoking Larsson-Sadakane on ranks\n", | |
" Invoking Larsson-Sadakane on ranks time: 00:00:00\n", | |
" Sanity-checking and returning\n", | |
"Building samples\n", | |
"Reserving space for 12 sample suffixes\n", | |
"Generating random suffixes\n", | |
"QSorting 12 sample offsets, eliminating duplicates\n", | |
"QSorting sample offsets, eliminating duplicates time: 00:00:00\n", | |
"Multikey QSorting 12 samples\n", | |
" (Using difference cover)\n", | |
" Multikey QSorting samples time: 00:00:00\n", | |
"Calculating bucket sizes\n", | |
"Splitting and merging\n", | |
" Splitting and merging time: 00:00:00\n", | |
"Split 1, merged 5; iterating...\n", | |
"Splitting and merging\n", | |
" Splitting and merging time: 00:00:00\n", | |
"Avg bucket size: 1.51964e+06 (target: 2279456)\n", | |
"Converting suffix-array elements to index image\n", | |
"Allocating ftab, absorbFtab\n", | |
"Entering GFM loop\n", | |
"Getting block 1 of 8\n", | |
" Reserving size (2279457) for bucket 1\n", | |
" Calculating Z arrays for bucket 1\n", | |
" Entering block accumulator loop for bucket 1:\n", | |
" bucket 1: 10%\n", | |
" bucket 1: 20%\n", | |
" bucket 1: 30%\n", | |
" bucket 1: 40%\n", | |
" bucket 1: 50%\n", | |
" bucket 1: 60%\n", | |
" bucket 1: 70%\n", | |
" bucket 1: 80%\n", | |
" bucket 1: 90%\n", | |
" bucket 1: 100%\n", | |
" Sorting block of length 2000591 for bucket 1\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:01\n", | |
"Returning block of 2000592 for bucket 1\n", | |
"Getting block 2 of 8\n", | |
" Reserving size (2279457) for bucket 2\n", | |
" Calculating Z arrays for bucket 2\n", | |
" Entering block accumulator loop for bucket 2:\n", | |
" bucket 2: 10%\n", | |
" bucket 2: 20%\n", | |
" bucket 2: 30%\n", | |
" bucket 2: 40%\n", | |
" bucket 2: 50%\n", | |
" bucket 2: 60%\n", | |
" bucket 2: 70%\n", | |
" bucket 2: 80%\n", | |
" bucket 2: 90%\n", | |
" bucket 2: 100%\n", | |
" Sorting block of length 1791111 for bucket 2\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:01\n", | |
"Returning block of 1791112 for bucket 2\n", | |
"Getting block 3 of 8\n", | |
" Reserving size (2279457) for bucket 3\n", | |
" Calculating Z arrays for bucket 3\n", | |
" Entering block accumulator loop for bucket 3:\n", | |
" bucket 3: 10%\n", | |
" bucket 3: 20%\n", | |
" bucket 3: 30%\n", | |
" bucket 3: 40%\n", | |
" bucket 3: 50%\n", | |
" bucket 3: 60%\n", | |
" bucket 3: 70%\n", | |
" bucket 3: 80%\n", | |
" bucket 3: 90%\n", | |
" bucket 3: 100%\n", | |
" Sorting block of length 1386934 for bucket 3\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:00\n", | |
"Returning block of 1386935 for bucket 3\n", | |
"Getting block 4 of 8\n", | |
" Reserving size (2279457) for bucket 4\n", | |
" Calculating Z arrays for bucket 4\n", | |
" Entering block accumulator loop for bucket 4:\n", | |
" bucket 4: 10%\n", | |
" bucket 4: 20%\n", | |
" bucket 4: 30%\n", | |
" bucket 4: 40%\n", | |
" bucket 4: 50%\n", | |
" bucket 4: 60%\n", | |
" bucket 4: 70%\n", | |
" bucket 4: 80%\n", | |
" bucket 4: 90%\n", | |
" bucket 4: 100%\n", | |
" Sorting block of length 1309547 for bucket 4\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:00\n", | |
"Returning block of 1309548 for bucket 4\n", | |
"Getting block 5 of 8\n", | |
" Reserving size (2279457) for bucket 5\n", | |
" Calculating Z arrays for bucket 5\n", | |
" Entering block accumulator loop for bucket 5:\n", | |
" bucket 5: 10%\n", | |
" bucket 5: 20%\n", | |
" bucket 5: 30%\n", | |
" bucket 5: 40%\n", | |
" bucket 5: 50%\n", | |
" bucket 5: 60%\n", | |
" bucket 5: 70%\n", | |
" bucket 5: 80%\n", | |
" bucket 5: 90%\n", | |
" bucket 5: 100%\n", | |
" Sorting block of length 1717347 for bucket 5\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:00\n", | |
"Returning block of 1717348 for bucket 5\n", | |
"Getting block 6 of 8\n", | |
" Reserving size (2279457) for bucket 6\n", | |
" Calculating Z arrays for bucket 6\n", | |
" Entering block accumulator loop for bucket 6:\n", | |
" bucket 6: 10%\n", | |
" bucket 6: 20%\n", | |
" bucket 6: 30%\n", | |
" bucket 6: 40%\n", | |
" bucket 6: 50%\n", | |
" bucket 6: 60%\n", | |
" bucket 6: 70%\n", | |
" bucket 6: 80%\n", | |
" bucket 6: 90%\n", | |
" bucket 6: 100%\n", | |
" Sorting block of length 1538951 for bucket 6\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:01\n", | |
"Returning block of 1538952 for bucket 6\n", | |
"Getting block 7 of 8\n", | |
" Reserving size (2279457) for bucket 7\n", | |
" Calculating Z arrays for bucket 7\n", | |
" Entering block accumulator loop for bucket 7:\n", | |
" bucket 7: 10%\n", | |
" bucket 7: 20%\n", | |
" bucket 7: 30%\n", | |
" bucket 7: 40%\n", | |
" bucket 7: 50%\n", | |
" bucket 7: 60%\n", | |
" bucket 7: 70%\n", | |
" bucket 7: 80%\n", | |
" bucket 7: 90%\n", | |
" bucket 7: 100%\n", | |
" Sorting block of length 1421401 for bucket 7\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:01\n", | |
"Returning block of 1421402 for bucket 7\n", | |
"Getting block 8 of 8\n", | |
" Reserving size (2279457) for bucket 8\n", | |
" Calculating Z arrays for bucket 8\n", | |
" Entering block accumulator loop for bucket 8:\n", | |
" bucket 8: 10%\n", | |
" bucket 8: 20%\n", | |
" bucket 8: 30%\n", | |
" bucket 8: 40%\n", | |
" bucket 8: 50%\n", | |
" bucket 8: 60%\n", | |
" bucket 8: 70%\n", | |
" bucket 8: 80%\n", | |
" bucket 8: 90%\n", | |
" bucket 8: 100%\n", | |
" Sorting block of length 991216 for bucket 8\n", | |
" (Using difference cover)\n", | |
" Sorting block time: 00:00:00\n", | |
"Returning block of 991217 for bucket 8\n", | |
"Exited GFM loop\n", | |
"fchr[A]: 0\n", | |
"fchr[C]: 3766349\n", | |
"fchr[G]: 6086925\n", | |
"fchr[T]: 8404025\n", | |
"fchr[$]: 12157105\n", | |
"Exiting GFM::buildToDisk()\n", | |
"Returning from initFromVector\n", | |
"Wrote 8248111 bytes to primary GFM file: yeast_index/genome.1.ht2\n", | |
"Wrote 3039284 bytes to secondary GFM file: yeast_index/genome.2.ht2\n", | |
"Re-opening _in1 and _in2 as input streams\n", | |
"Returning from GFM constructor\n", | |
"Returning from initFromVector\n", | |
"Wrote 5399069 bytes to primary GFM file: yeast_index/genome.5.ht2\n", | |
"Wrote 3092708 bytes to secondary GFM file: yeast_index/genome.6.ht2\n", | |
"Re-opening _in5 and _in5 as input streams\n", | |
"Returning from HierEbwt constructor\n", | |
"Headers:\n", | |
" len: 12157105\n", | |
" gbwtLen: 12157106\n", | |
" nodes: 12157106\n", | |
" sz: 3039277\n", | |
" gbwtSz: 3039277\n", | |
" lineRate: 6\n", | |
" offRate: 4\n", | |
" offMask: 0xfffffff0\n", | |
" ftabChars: 10\n", | |
" eftabLen: 0\n", | |
" eftabSz: 0\n", | |
" ftabLen: 1048577\n", | |
" ftabSz: 4194308\n", | |
" offsLen: 759820\n", | |
" offsSz: 3039280\n", | |
" lineSz: 64\n", | |
" sideSz: 64\n", | |
" sideGbwtSz: 48\n", | |
" sideGbwtLen: 192\n", | |
" numSides: 63319\n", | |
" numLines: 63319\n", | |
" gbwtTotLen: 4052416\n", | |
" gbwtTotSz: 4052416\n", | |
" reverse: 0\n", | |
" linearFM: Yes\n", | |
"Total time for call to driver() for forward index: 00:00:10\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZtaZTxNCncoo", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## chipseq" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WSik-38b56Mv", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 105 | |
}, | |
"outputId": "7614c30f-ecfe-4187-97f3-c0eaa3cd8972" | |
}, | |
"source": [ | |
"# from scripts/chipseq/runchipseq.py\n", | |
"\n", | |
"import subprocess\n", | |
"import os\n", | |
"from pandas import read_csv\n", | |
"\n", | |
"\"\"\"\n", | |
"Run the ChIP-Seq aligner using HISAT\n", | |
"\"\"\"\n", | |
"\n", | |
"ChIPSeqSRARunTableFile='seqacademy/data/ChIPSeqSRA.tsv'\n", | |
"ChIPSeqSRATable = read_csv(ChIPSeqSRARunTableFile, delimiter='\\t')\n", | |
"ChIPSeqoutrun = (ChIPSeqSRATable[\"Run\"])\n", | |
"ChIPSeqoutputSam = \"test/\" + ChIPSeqoutrun + \".sam\"\n", | |
"ChIPSeqoutputAlignmentSummary = \"test/\" + ChIPSeqoutrun + \".txt\"\n", | |
"ChIPSeqoutputMetrics = \"test/\" + ChIPSeqoutrun + \".metrics\"\n", | |
"ChIPSeqoutputSortBam = \"test/\" + ChIPSeqoutrun + \".sorted.bam\"\n", | |
"\n", | |
"for index, individual in enumerate(ChIPSeqoutrun):\n", | |
" run = ChIPSeqoutrun[index]\n", | |
" summary = ChIPSeqoutputAlignmentSummary[index] \n", | |
" metrics = ChIPSeqoutputMetrics[index]\n", | |
" sam = ChIPSeqoutputSam[index]\n", | |
" bam = ChIPSeqoutputSortBam[index]\n", | |
" index = \"yeast_index/genome\"\n", | |
" cmduse = \"hisat2 -u 10000 -p 2 -x {0} --sra-acc {1} --new-summary --summary-file {2} --met-file {3} -S {4}\".format(index, run, summary, metrics, sam)\n", | |
" print(cmduse)\n", | |
" #os.system(cmduse) # just printing the command rather than running it, for the sake of running it manually in the cell below" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703656 --new-summary --summary-file test/SRR6703656.txt --met-file test/SRR6703656.metrics -S test/SRR6703656.sam\n", | |
"hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703661 --new-summary --summary-file test/SRR6703661.txt --met-file test/SRR6703661.metrics -S test/SRR6703661.sam\n", | |
"hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703662 --new-summary --summary-file test/SRR6703662.txt --met-file test/SRR6703662.metrics -S test/SRR6703662.sam\n", | |
"hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703663 --new-summary --summary-file test/SRR6703663.txt --met-file test/SRR6703663.metrics -S test/SRR6703663.sam\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "M6vE-stH7KA4", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
}, | |
"outputId": "8a50d539-5640-4da7-f21d-5722a01ae996" | |
}, | |
"source": [ | |
"# run one of the commands from above\n", | |
"!hisat2 -u 10000 -p 2 -x yeast_index/genome --sra-acc SRR6703656 --new-summary --summary-file test/SRR6703656.txt --met-file test/SRR6703656.metrics -S test/SRR6703656.sam" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"HISAT2 summary stats:\n", | |
"\tTotal pairs: 10000\n", | |
"\t\tAligned concordantly or discordantly 0 time: 1567 (15.67%)\n", | |
"\t\tAligned concordantly 1 time: 7092 (70.92%)\n", | |
"\t\tAligned concordantly >1 times: 1280 (12.80%)\n", | |
"\t\tAligned discordantly 1 time: 61 (0.61%)\n", | |
"\tTotal unpaired reads: 3134\n", | |
"\t\tAligned 0 time: 2027 (64.68%)\n", | |
"\t\tAligned 1 time: 918 (29.29%)\n", | |
"\t\tAligned >1 times: 189 (6.03%)\n", | |
"\tOverall alignment rate: 89.86%\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "X5vKLSa5sRkU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "L6c_VubVpdcV", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## install htseq" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "b2Hfsn9ApexI", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 190 | |
}, | |
"outputId": "5f424381-2c52-4e47-db2c-676ea6bfb71d" | |
}, | |
"source": [ | |
"!pip install HTSeq" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting HTSeq\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/53/ca/287519f54993e1ed1d7fcd773baac8a05d710b90b7ca5970c4664c686810/HTSeq-0.12.4-cp36-cp36m-manylinux2010_x86_64.whl (1.4MB)\n", | |
"\u001b[K |████████████████████████████████| 1.4MB 2.8MB/s \n", | |
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from HTSeq) (1.18.5)\n", | |
"Collecting pysam\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/87/a1/73e80a7a873f3fb0e52d368a4343eb9882b737c932b95020d82251f1087e/pysam-0.16.0.1-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)\n", | |
"\u001b[K |████████████████████████████████| 10.0MB 16.6MB/s \n", | |
"\u001b[?25hInstalling collected packages: pysam, HTSeq\n", | |
"Successfully installed HTSeq-0.12.4 pysam-0.16.0.1\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0Mhri2nPrTRS", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## install samtools" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vPeZMsctrUgP", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"%%shell\n", | |
"# compile from source\n", | |
"#wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2\n", | |
"#bunzip2 samtools-1.10.tar.bz2\n", | |
"#tar -xf samtools-1.10.tar\n", | |
"#ls samtools-1.10\n", | |
"\n", | |
"# install from ubuntu ppa\n", | |
"apt-get install -q samtools\n", | |
"samtools --version" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MaKyHQoxr5MG", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 68 | |
}, | |
"outputId": "22caa892-bd04-4bc4-9734-ef2e4ce4b7c0" | |
}, | |
"source": [ | |
"!samtools --version" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"samtools 1.7\n", | |
"Using htslib 1.7-2\n", | |
"Copyright (C) 2018 Genome Research Ltd.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "1ZEHv_eOnthK", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## rnaseq\n", | |
"\n", | |
"https://github.com/NCBI-Hackathons/seqacademy/blob/master/rnaseq.md" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IM0l5GqNnu49", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 119 | |
}, | |
"outputId": "67c0ba58-ae78-419e-fb10-524f328250e3" | |
}, | |
"source": [ | |
"# scripts/rnaseq/runrnaseq.py\n", | |
"\n", | |
"from pandas import read_csv\n", | |
"import os\n", | |
"\n", | |
"RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n", | |
"RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n", | |
"RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n", | |
"RNASeqoutputSam = \"test/\" + RNASeqoutrun + \".sam\"\n", | |
"RNASeqoutputAlignmentSummary = \"test/\" + RNASeqoutrun + \".txt\"\n", | |
"RNASeqoutputMetrics = \"test/\" + RNASeqoutrun + \".metrics\"\n", | |
"RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n", | |
"\n", | |
"for index, individual in enumerate(RNASeqoutrun):\n", | |
" run = RNASeqoutrun[index]\n", | |
" summary = RNASeqoutputAlignmentSummary[index] \n", | |
" metrics = RNASeqoutputMetrics[index]\n", | |
" sam = RNASeqoutputSam[index]\n", | |
" bam = RNASeqoutputSortBam[index]\n", | |
" cmduse = \"hisat2 -u 2500 -x yeast_index/genome --sra-acc {0} --new-summary --summary-file {1} --met-file {2} -S {3}\".format(run, summary, metrics, sam)\n", | |
" print(cmduse)\n", | |
" #os.system(cmduse)" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494627 --new-summary --summary-file test/SRR5494627.txt --met-file test/SRR5494627.metrics -S test/SRR5494627.sam\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494628 --new-summary --summary-file test/SRR5494628.txt --met-file test/SRR5494628.metrics -S test/SRR5494628.sam\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494629 --new-summary --summary-file test/SRR5494629.txt --met-file test/SRR5494629.metrics -S test/SRR5494629.sam\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494630 --new-summary --summary-file test/SRR5494630.txt --met-file test/SRR5494630.metrics -S test/SRR5494630.sam\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494631 --new-summary --summary-file test/SRR5494631.txt --met-file test/SRR5494631.metrics -S test/SRR5494631.sam\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494632 --new-summary --summary-file test/SRR5494632.txt --met-file test/SRR5494632.metrics -S test/SRR5494632.sam\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "kycvWwXpozjo", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 119 | |
}, | |
"outputId": "eb6325f0-90f7-4940-8fe8-5c7a24ca40ae" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# run one command from above\n", | |
"hisat2 -u 2500 -x yeast_index/genome --sra-acc SRR5494627 --new-summary --summary-file test/SRR5494627.txt --met-file test/SRR5494627.metrics -S test/SRR5494627.sam" | |
], | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"HISAT2 summary stats:\n", | |
"\tTotal reads: 2500\n", | |
"\t\tAligned 0 time: 129 (5.16%)\n", | |
"\t\tAligned 1 time: 2164 (86.56%)\n", | |
"\t\tAligned >1 times: 207 (8.28%)\n", | |
"\tOverall alignment rate: 94.84%\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "KDGrzmyipfpw", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## htseq-count" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "rdRlohY6r-BY", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 119 | |
}, | |
"outputId": "adeb43a5-e8e6-4e09-9f38-3cbddd375948" | |
}, | |
"source": [ | |
"# from scripts/rnaseq/samtools.py\n", | |
"import os\n", | |
"from pandas import read_csv\n", | |
"\n", | |
"RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n", | |
"RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n", | |
"RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n", | |
"RNASeqoutputSam = \"test/\" + RNASeqoutrun + \".sam\"\n", | |
"RNASeqoutputAlignmentSummary = \"test/\" + RNASeqoutrun + \".txt\"\n", | |
"RNASeqoutputMetrics = \"test/\" + RNASeqoutrun + \".metrics\"\n", | |
"RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n", | |
"\n", | |
"for index, individual in enumerate(RNASeqoutrun):\n", | |
" run = RNASeqoutrun[index]\n", | |
" summary = RNASeqoutputAlignmentSummary[index] \n", | |
" metrics = RNASeqoutputMetrics[index]\n", | |
" sam = RNASeqoutputSam[index]\n", | |
" bam = RNASeqoutputSortBam[index]\n", | |
" cmduse = \"samtools view -bSF4 {0} | samtools sort -o {1}\".format(sam, bam)\n", | |
" print(cmduse)\n", | |
" #os.system(cmduse)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"samtools view -bSF4 test/SRR5494627.sam | samtools sort -o test/SRR5494627.sorted.bam\n", | |
"samtools view -bSF4 test/SRR5494628.sam | samtools sort -o test/SRR5494628.sorted.bam\n", | |
"samtools view -bSF4 test/SRR5494629.sam | samtools sort -o test/SRR5494629.sorted.bam\n", | |
"samtools view -bSF4 test/SRR5494630.sam | samtools sort -o test/SRR5494630.sorted.bam\n", | |
"samtools view -bSF4 test/SRR5494631.sam | samtools sort -o test/SRR5494631.sorted.bam\n", | |
"samtools view -bSF4 test/SRR5494632.sam | samtools sort -o test/SRR5494632.sorted.bam\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-A2e-8vasEKl", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# run one command\n", | |
"!samtools view -bSF4 test/SRR5494627.sam | samtools sort -o test/SRR5494627.sorted.bam" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Xas48RrGs9hd", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "ff8f48bc-7e82-44c7-ee27-c7d447ef50c8" | |
}, | |
"source": [ | |
"!du -sh test/SRR5494627.sorted.bam" | |
], | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"96K\ttest/SRR5494627.sorted.bam\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "m_AQg3nooFsQ", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 119 | |
}, | |
"outputId": "f295de9c-8a01-47ee-8c42-26957e4c0f9e" | |
}, | |
"source": [ | |
"gtf = \"data/Saccharomyces_cerevisiae.R64-1-1.84.gtf\"\n", | |
"\n", | |
"RNASeqSRARunTableFile='seqacademy/data/RNASeqSRA.tsv'\n", | |
"RNASeqSRATable = read_csv(RNASeqSRARunTableFile, delimiter='\\t')\n", | |
"RNASeqoutrun = (RNASeqSRATable[\"Run\"])\n", | |
"RNASeqoutputSortBam = \"test/\" + RNASeqoutrun + \".sorted.bam\"\n", | |
"\n", | |
"for index, individual in enumerate(RNASeqoutputSortBam):\n", | |
" input = individual\n", | |
" output = individual.replace(\".sorted.bam\", \"\") + \".genecount.txt\"\n", | |
" cmduse = \"htseq-count -m intersection-nonempty -s no -f bam %s %s > %s\" % (input, gtf, output)\n", | |
" print(cmduse)\n", | |
" #os.system(cmduse)" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494627.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494627.genecount.txt\n", | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494628.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494628.genecount.txt\n", | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494629.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494629.genecount.txt\n", | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494630.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494630.genecount.txt\n", | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494631.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494631.genecount.txt\n", | |
"htseq-count -m intersection-nonempty -s no -f bam test/SRR5494632.sorted.bam data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494632.genecount.txt\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-Ro3_TDBsgMD", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "13587569-9cbb-4dee-f995-cc48703a8c29" | |
}, | |
"source": [ | |
"!du -sh test/SRR5494627.sorted.bam" | |
], | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"96K\ttest/SRR5494627.sorted.bam\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Jyr2tC0Wo8G6", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 85 | |
}, | |
"outputId": "1471acea-3806-4f90-e456-c01054c23be4" | |
}, | |
"source": [ | |
"%%shell\n", | |
"# try one command\n", | |
"htseq-count -m intersection-nonempty -s no test/SRR5494627.sorted.bam seqacademy/data/Saccharomyces_cerevisiae.R64-1-1.84.gtf > test/SRR5494627.genecount.txt" | |
], | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[E::idx_find_and_load] Could not retrieve index file for 'test/SRR5494627.sorted.bam'\n", | |
"42071 GFF lines processed.\n", | |
"[E::idx_find_and_load] Could not retrieve index file for 'test/SRR5494627.sorted.bam'\n", | |
"2701 alignments processed.\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 39 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aJFgh2fKsvCx", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "72e08a5a-c647-4682-a210-87fd7cb65c63" | |
}, | |
"source": [ | |
"!du -sh test/SRR5494627.genecount.txt" | |
], | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"72K\ttest/SRR5494627.genecount.txt\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oYXODwUexqWG", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 187 | |
}, | |
"outputId": "bc2cce62-93c3-4bd1-b2e2-3f1fd4563d07" | |
}, | |
"source": [ | |
"!head test/SRR5494627.genecount.txt" | |
], | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"15S_rRNA\t0\n", | |
"21S_rRNA\t0\n", | |
"HRA1\t0\n", | |
"ICR1\t0\n", | |
"LSR1\t0\n", | |
"NME1\t0\n", | |
"PWR1\t0\n", | |
"Q0010\t0\n", | |
"Q0017\t0\n", | |
"Q0032\t0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "btjqISzyxsfV", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 187 | |
}, | |
"outputId": "326e865f-d809-485f-9242-94f580b3267d" | |
}, | |
"source": [ | |
"# check if all 0's or not\n", | |
"!cat test/SRR5494627.genecount.txt|awk '{print $2}'|sort|uniq|head" | |
], | |
"execution_count": 46, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"0\n", | |
"1\n", | |
"10\n", | |
"11\n", | |
"12\n", | |
"13\n", | |
"147\n", | |
"16\n", | |
"19\n", | |
"2\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment