Last active
July 27, 2020 02:11
-
-
Save shadiakiki1986/336606ab067052203607309a3ee2d7a4 to your computer and use it in GitHub Desktop.
read plink files in python with libplinkio.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "read plink files in python with libplinkio.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyNfWbvok69nqMpfqRCljxnw", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/shadiakiki1986/336606ab067052203607309a3ee2d7a4/read-plink-files-in-python-with-libplinkio.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "U05vMCe-ARj8", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Utility function `read_bedbimfam` to wrap libplinkio python code [1] to read plink bed,bim,fam files [2]\n", | |
"\n", | |
"1. https://github.com/mfranberg/libplinkio\n", | |
"2. https://www.cog-genomics.org/plink/1.9/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2ucjTKow6DRb", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 255 | |
}, | |
"outputId": "27a0a54c-979c-40cf-8d8f-d8e86e8b388f" | |
}, | |
"source": [ | |
"# install libplinkio python library\n", | |
"!pip install plinkio pandas numpy" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting plinkio\n", | |
" Downloading https://files.pythonhosted.org/packages/4b/6e/e4cc41556d7e21c1e4ab8560b3eff8b14634570f6203c30ed46b66eecd5a/plinkio-0.9.7.tar.gz\n", | |
"Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (1.0.5)\n", | |
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (1.18.5)\n", | |
"Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n", | |
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n", | |
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.15.0)\n", | |
"Building wheels for collected packages: plinkio\n", | |
" Building wheel for plinkio (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for plinkio: filename=plinkio-0.9.7-cp36-cp36m-linux_x86_64.whl size=74471 sha256=e53c70a37fcb6a57fe37a695f2edbc5bb393a56a2879bd4a2fa11849042bc2fc\n", | |
" Stored in directory: /root/.cache/pip/wheels/7e/d7/79/5c8ec563c0c2df324ed39b9283dfae235cebef1d8709c59f7d\n", | |
"Successfully built plinkio\n", | |
"Installing collected packages: plinkio\n", | |
"Successfully installed plinkio-0.9.7\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G7wHCg5O7aR6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# define utility function\n", | |
"def read_bedbimfam(filenames_prefix):\n", | |
" \"\"\"\n", | |
" Returns\n", | |
" - phenotypes pandas dataframe (n samples rows)\n", | |
" - genotypes numpy matrix (n samples rows, n snps columns)\n", | |
" - sample IDs for genotype matrix rows\n", | |
" - snp IDs for genotype matrix columns\n", | |
"\n", | |
" Example usage when files are at example/wgas1.{bed,bim,fam}:\n", | |
" ex_pheno, ex_geno, ex_samples, ex_snps = read_bedbimfam(\"example/wgas1\")\n", | |
" \"\"\"\n", | |
" factory = PlinkFactory()\n", | |
" factory.open_files(filenames_prefix)\n", | |
" factory.read_phenotypes()\n", | |
" factory.read_genotypes()\n", | |
" return (factory.dfsim_pheno, factory.dfsim_geno1, factory.dfsim_samples, factory.dfsim_snps)\n", | |
"\n", | |
"\n", | |
"class PlinkFactory:\n", | |
" \"\"\"\n", | |
" Support class for read_bedbimfam\n", | |
" \"\"\"\n", | |
" def open_files(self, filenames_prefix):\n", | |
" from plinkio import plinkfile\n", | |
" # read plink files in python\n", | |
" # From example https://github.com/mfranberg/libplinkio#using-in-python\n", | |
" plink_file = plinkfile.open(filenames_prefix)\n", | |
" if not plink_file.one_locus_per_row( ):\n", | |
" raise Exception( \"This script requires that snps are rows and samples columns.\" )\n", | |
" self.plink_file = plink_file\n", | |
"\n", | |
" def read_phenotypes(self):\n", | |
" # Get phenotypes\n", | |
" self.sample_list = self.plink_file.get_samples()\n", | |
" import pandas as pd\n", | |
" self.dfsim_pheno = pd.DataFrame([x.__dict__ for x in self.sample_list])\n", | |
"\n", | |
" def read_genotypes(self):\n", | |
" sample_n = self.dfsim_pheno.shape[0]\n", | |
"\n", | |
" locus_list = self.plink_file.get_loci( )\n", | |
" locus_n = len(locus_list)\n", | |
"\n", | |
" import datetime as dt\n", | |
" import numpy as np\n", | |
"\n", | |
" # Get genotypes\n", | |
" # ETA ~ on google colab as of 2020-07-27: 11 seconds per 100k snps => ~1.5 minutes for ~900k snps on Affymetrix 6\n", | |
"\n", | |
" dfsim_geno1 = np.zeros((sample_n, locus_n), dtype=int)\n", | |
" dfsim_snps = [\"\"]*locus_n\n", | |
" dfsim_samples = [\"\"]*sample_n\n", | |
" print(\"Reading plink file genotypes: start\")\n", | |
" for i, (locus, row) in enumerate(zip( locus_list, self.plink_file )):\n", | |
" if i%100e3==0: print(\"%s: %i/%i\"%(dt.datetime.now(), i, locus_n))\n", | |
" for j, (sample, genotype) in enumerate(zip( self.sample_list, row )):\n", | |
" #print( \"Individual {0} has genotype {1} for snp {2}.\".format( sample.iid, genotype, locus.name ) )\n", | |
" #\n", | |
" # From the docs: https://github.com/mfranberg/libplinkio#genotype-coding\n", | |
" # The genotypes are coded 0, 1, 2, and 3. The numbers 0-2 represent the number of A2 alleles as specified in the .bim file. The number 3 represents a missing genotype.\n", | |
" #\n", | |
" #break\n", | |
" #dfsim_geno1.append({\"iid\": sample.iid, \"geno\": genotype, \"snp\": locus.name})\n", | |
" dfsim_geno1[j,i] = genotype\n", | |
" dfsim_samples[j] = sample.fid\n", | |
" dfsim_snps[i] = locus.name\n", | |
"\n", | |
" print(\"Reading plink file genotypes: end\")\n", | |
" self.dfsim_geno1 = dfsim_geno1\n", | |
" self.dfsim_samples = dfsim_samples\n", | |
" self.dfsim_snps = dfsim_snps" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "wpJZalzyAc9d", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Illustrate the utility function by reading example files from plink" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "O-zaZK2z5wfL", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 391 | |
}, | |
"outputId": "2862e1f8-ef17-48f8-debb-1da0227402cb" | |
}, | |
"source": [ | |
"# download example plink files from\n", | |
"# https://www.cog-genomics.org/plink/1.9/resources#teach\n", | |
"\n", | |
"!wget https://www.cog-genomics.org/static/bin/plink/example.zip\n", | |
"!unzip example.zip" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-07-27 02:10:37-- https://www.cog-genomics.org/static/bin/plink/example.zip\n", | |
"Resolving www.cog-genomics.org (www.cog-genomics.org)... 52.1.175.150, 34.197.140.245, 54.164.74.108, ...\n", | |
"Connecting to www.cog-genomics.org (www.cog-genomics.org)|52.1.175.150|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 23107503 (22M) [application/zip]\n", | |
"Saving to: ‘example.zip’\n", | |
"\n", | |
"example.zip 100%[===================>] 22.04M 15.7MB/s in 1.4s \n", | |
"\n", | |
"2020-07-27 02:10:39 (15.7 MB/s) - ‘example.zip’ saved [23107503/23107503]\n", | |
"\n", | |
"Archive: example.zip\n", | |
" creating: example/\n", | |
" inflating: example/wgas1.map \n", | |
" inflating: example/plink.exe \n", | |
" inflating: example/extra.map \n", | |
" inflating: example/pop.cov \n", | |
" inflating: example/Haploview.jar \n", | |
" inflating: example/wgas1.ped \n", | |
" inflating: example/extra.ped \n", | |
" inflating: example/gPLINK.jar \n", | |
" inflating: example/command-list.txt \n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0edqocRt-ZA_", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 306 | |
}, | |
"outputId": "5b0dbaaa-a6ce-4e23-faa8-3a2be1036ce2" | |
}, | |
"source": [ | |
"# install plink to convert ped file (plink text format) to bed, bim, fam (plink binary format)\n", | |
"!wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200616.zip\n", | |
"!unzip plink_linux_x86_64_20200616.zip" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-07-27 02:10:44-- http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20200616.zip\n", | |
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.229.19\n", | |
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.229.19|:80... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 8916236 (8.5M) [application/zip]\n", | |
"Saving to: ‘plink_linux_x86_64_20200616.zip’\n", | |
"\n", | |
"\r plink_lin 0%[ ] 0 --.-KB/s \r plink_linu 1%[ ] 102.09K 481KB/s \r plink_linux 5%[> ] 461.48K 990KB/s \r plink_linux_ 38%[======> ] 3.28M 4.93MB/s \rplink_linux_x86_64_ 100%[===================>] 8.50M 9.98MB/s in 0.9s \n", | |
"\n", | |
"2020-07-27 02:10:45 (9.98 MB/s) - ‘plink_linux_x86_64_20200616.zip’ saved [8916236/8916236]\n", | |
"\n", | |
"Archive: plink_linux_x86_64_20200616.zip\n", | |
" inflating: plink \n", | |
" inflating: LICENSE \n", | |
" inflating: toy.ped \n", | |
" inflating: toy.map \n", | |
" inflating: prettify \n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "V5q3Ep_g-tyY", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 51 | |
}, | |
"outputId": "1e539573-7309-4d5d-de3a-80554844f572" | |
}, | |
"source": [ | |
"!echo $PATH\n", | |
"!chmod +x plink # not needed if zip contained +x already on plink binary\n", | |
"!mv plink /usr/local/bin/\n", | |
"!which plink" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/opt/bin\n", | |
"/usr/local/bin/plink\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-TQO4pbh-2wB", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "495562ef-e899-4959-e272-4affbc2438a5" | |
}, | |
"source": [ | |
"!plink --version" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"PLINK v1.90b6.18 64-bit (16 Jun 2020)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "S90lALZS-o3t", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 476 | |
}, | |
"outputId": "2d2817b0-7a66-4cc4-873e-127adcf71b18" | |
}, | |
"source": [ | |
"# https://www.cog-genomics.org/plink/1.9/data#make_bed\n", | |
"# Note in stdout below: \"67158 variants removed due to minor allele threshold(s)\"\n", | |
"!plink --file example/wgas1 --maf 0.05 --make-bed --out example/wgas1" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"PLINK v1.90b6.18 64-bit (16 Jun 2020) www.cog-genomics.org/plink/1.9/\n", | |
"(C) 2005-2020 Shaun Purcell, Christopher Chang GNU General Public License v3\n", | |
"Logging to example/wgas1.log.\n", | |
"Options in effect:\n", | |
" --file example/wgas1\n", | |
" --maf 0.05\n", | |
" --make-bed\n", | |
" --out example/wgas1\n", | |
"\n", | |
"13021 MB RAM detected; reserving 6510 MB for main workspace.\n", | |
"Scanning .ped file... 0%\b\b1%\b\b2%\b\b3%\b\b4%\b\b5%\b\b6%\b\b7%\b\b8%\b\b10%\b\b\b11%\b\b\b12%\b\b\b13%\b\b\b14%\b\b\b15%\b\b\b16%\b\b\b17%\b\b\b18%\b\b\b20%\b\b\b21%\b\b\b22%\b\b\b23%\b\b\b24%\b\b\b25%\b\b\b26%\b\b\b27%\b\b\b28%\b\b\b30%\b\b\b31%\b\b\b32%\b\b\b33%\b\b\b34%\b\b\b35%\b\b\b36%\b\b\b37%\b\b\b38%\b\b\b40%\b\b\b41%\b\b\b42%\b\b\b43%\b\b\b44%\b\b\b45%\b\b\b46%\b\b\b47%\b\b\b48%\b\b\b50%\b\b\b51%\b\b\b52%\b\b\b53%\b\b\b54%\b\b\b55%\b\b\b56%\b\b\b57%\b\b\b58%\b\b\b60%\b\b\b61%\b\b\b62%\b\b\b63%\b\b\b64%\b\b\b65%\b\b\b66%\b\b\b67%\b\b\b68%\b\b\b70%\b\b\b71%\b\b\b72%\b\b\b73%\b\b\b74%\b\b\b75%\b\b\b76%\b\b\b77%\b\b\b78%\b\b\b80%\b\b\b81%\b\b\b82%\b\b\b83%\b\b\b84%\b\b\b85%\b\b\b86%\b\b\b87%\b\b\b88%\b\b\b90%\b\b\b91%\b\b\b92%\b\b\b93%\b\b\b94%\b\b\b95%\b\b\b96%\b\b\b97%\b\b\b98%\b\b\b100%\r.ped scan complete (for binary autoconversion).\n", | |
"Performing single-pass .bed write (228694 variants, 90 people).\n", | |
"0%\b\b1%\b\b2%\b\b3%\b\b4%\b\b5%\b\b6%\b\b7%\b\b8%\b\b9%\b\b10%\b\b\b11%\b\b\b12%\b\b\b13%\b\b\b14%\b\b\b15%\b\b\b16%\b\b\b17%\b\b\b18%\b\b\b19%\b\b\b20%\b\b\b21%\b\b\b22%\b\b\b23%\b\b\b24%\b\b\b25%\b\b\b26%\b\b\b27%\b\b\b28%\b\b\b29%\b\b\b30%\b\b\b31%\b\b\b32%\b\b\b33%\b\b\b34%\b\b\b35%\b\b\b36%\b\b\b37%\b\b\b38%\b\b\b39%\b\b\b40%\b\b\b41%\b\b\b42%\b\b\b43%\b\b\b44%\b\b\b45%\b\b\b46%\b\b\b47%\b\b\b48%\b\b\b49%\b\b\b50%\b\b\b51%\b\b\b52%\b\b\b53%\b\b\b54%\b\b\b55%\b\b\b56%\b\b\b57%\b\b\b58%\b\b\b59%\b\b\b60%\b\b\b61%\b\b\b62%\b\b\b63%\b\b\b64%\b\b\b65%\b\b\b66%\b\b\b67%\b\b\b68%\b\b\b69%\b\b\b70%\b\b\b71%\b\b\b72%\b\b\b73%\b\b\b74%\b\b\b75%\b\b\b76%\b\b\b77%\b\b\b78%\b\b\b79%\b\b\b80%\b\b\b81%\b\b\b82%\b\b\b83%\b\b\b84%\b\b\b85%\b\b\b86%\b\b\b87%\b\b\b88%\b\b\b89%\b\b\b90%\b\b\b91%\b\b\b92%\b\b\b93%\b\b\b94%\r--file: example/wgas1-temporary.bed + example/wgas1-temporary.bim +\n", | |
"example/wgas1-temporary.fam written.\n", | |
"228694 variants loaded from .bim file.\n", | |
"90 people (45 males, 45 females) loaded from .fam.\n", | |
"90 phenotype values loaded from .fam.\n", | |
"Using 1 thread (no multithreaded calculations invoked).\n", | |
"Before main variant filters, 90 founders and 0 nonfounders present.\n", | |
"Calculating allele frequencies... 0%\b\b1%\b\b2%\b\b3%\b\b4%\b\b5%\b\b6%\b\b7%\b\b8%\b\b9%\b\b10%\b\b\b11%\b\b\b12%\b\b\b13%\b\b\b14%\b\b\b15%\b\b\b16%\b\b\b17%\b\b\b18%\b\b\b19%\b\b\b20%\b\b\b21%\b\b\b22%\b\b\b23%\b\b\b24%\b\b\b25%\b\b\b26%\b\b\b27%\b\b\b28%\b\b\b29%\b\b\b30%\b\b\b31%\b\b\b32%\b\b\b33%\b\b\b34%\b\b\b35%\b\b\b36%\b\b\b37%\b\b\b38%\b\b\b39%\b\b\b40%\b\b\b41%\b\b\b42%\b\b\b43%\b\b\b44%\b\b\b45%\b\b\b46%\b\b\b47%\b\b\b48%\b\b\b49%\b\b\b50%\b\b\b51%\b\b\b52%\b\b\b53%\b\b\b54%\b\b\b55%\b\b\b56%\b\b\b57%\b\b\b58%\b\b\b59%\b\b\b60%\b\b\b61%\b\b\b62%\b\b\b63%\b\b\b64%\b\b\b65%\b\b\b66%\b\b\b67%\b\b\b68%\b\b\b69%\b\b\b70%\b\b\b71%\b\b\b72%\b\b\b73%\b\b\b74%\b\b\b75%\b\b\b76%\b\b\b77%\b\b\b78%\b\b\b79%\b\b\b80%\b\b\b81%\b\b\b82%\b\b\b83%\b\b\b84%\b\b\b85%\b\b\b86%\b\b\b87%\b\b\b88%\b\b\b89%\b\b\b90%\b\b\b91%\b\b\b92%\b\b\b93%\b\b\b94%\b\b\b95%\b\b\b96%\b\b\b97%\b\b\b98%\b\b\b99%\b\b\b\b done.\n", | |
"Total genotyping rate is 0.993346.\n", | |
"67158 variants removed due to minor allele threshold(s)\n", | |
"(--maf/--max-maf/--mac/--max-mac).\n", | |
"161536 variants and 90 people pass filters and QC.\n", | |
"Among remaining phenotypes, 49 are cases and 41 are controls.\n", | |
"--make-bed to example/wgas1.bed + example/wgas1.bim + example/wgas1.fam ...\n", | |
"0%\b\b1%\b\b2%\b\b3%\b\b4%\b\b5%\b\b6%\b\b7%\b\b8%\b\b9%\b\b10%\b\b\b11%\b\b\b12%\b\b\b13%\b\b\b14%\b\b\b15%\b\b\b16%\b\b\b17%\b\b\b18%\b\b\b19%\b\b\b20%\b\b\b21%\b\b\b22%\b\b\b23%\b\b\b24%\b\b\b25%\b\b\b26%\b\b\b27%\b\b\b28%\b\b\b29%\b\b\b30%\b\b\b31%\b\b\b32%\b\b\b33%\b\b\b34%\b\b\b35%\b\b\b36%\b\b\b37%\b\b\b38%\b\b\b39%\b\b\b40%\b\b\b41%\b\b\b42%\b\b\b43%\b\b\b44%\b\b\b45%\b\b\b46%\b\b\b47%\b\b\b48%\b\b\b49%\b\b\b50%\b\b\b51%\b\b\b52%\b\b\b53%\b\b\b54%\b\b\b55%\b\b\b56%\b\b\b57%\b\b\b58%\b\b\b59%\b\b\b60%\b\b\b61%\b\b\b62%\b\b\b63%\b\b\b64%\b\b\b65%\b\b\b66%\b\b\b67%\b\b\b68%\b\b\b69%\b\b\b70%\b\b\b71%\b\b\b72%\b\b\b73%\b\b\b74%\b\b\b75%\b\b\b76%\b\b\b77%\b\b\b78%\b\b\b79%\b\b\b80%\b\b\b81%\b\b\b82%\b\b\b83%\b\b\b84%\b\b\b85%\b\b\b86%\b\b\b87%\b\b\b88%\b\b\b89%\b\b\b90%\b\b\b91%\b\b\b92%\b\b\b93%\b\b\b94%\b\b\b95%\b\b\b96%\b\b\b97%\b\b\b98%\b\b\b99%\b\b\bdone.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Mk6cIC2--K9_", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 85 | |
}, | |
"outputId": "657c4fcd-04fa-47bc-9e8e-0ba8ce612304" | |
}, | |
"source": [ | |
"# read plink example files with the utility function\n", | |
"ex_pheno, ex_geno, ex_samples, ex_snps = read_bedbimfam(\"example/wgas1\")" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Reading plink file genotypes: start\n", | |
"2020-07-27 02:11:05.432656: 0/161536\n", | |
"2020-07-27 02:11:08.476102: 100000/161536\n", | |
"Reading plink file genotypes: end\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jnVBO6JZA5Zd", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Check returned variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vd0kS8TV_b8E", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
}, | |
"outputId": "9c17a552-71b7-43e1-a6e0-03a03535e88b" | |
}, | |
"source": [ | |
"ex_pheno.head() # pandas dataframe" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fid</th>\n", | |
" <th>iid</th>\n", | |
" <th>father_iid</th>\n", | |
" <th>mother_iid</th>\n", | |
" <th>sex</th>\n", | |
" <th>affection</th>\n", | |
" <th>phenotype</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>CH18526</td>\n", | |
" <td>NA18526</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>CH18524</td>\n", | |
" <td>NA18524</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CH18529</td>\n", | |
" <td>NA18529</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>CH18558</td>\n", | |
" <td>NA18558</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CH18532</td>\n", | |
" <td>NA18532</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" fid iid father_iid mother_iid sex affection phenotype\n", | |
"0 CH18526 NA18526 0 0 0 0 0.0\n", | |
"1 CH18524 NA18524 0 0 1 0 0.0\n", | |
"2 CH18529 NA18529 0 0 0 0 0.0\n", | |
"3 CH18558 NA18558 0 0 1 0 0.0\n", | |
"4 CH18532 NA18532 0 0 0 0 0.0" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UDEO69jd_dNh", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "939ecc52-f419-427d-f453-1ebebb15a52d" | |
}, | |
"source": [ | |
"ex_geno.shape # numpy matrix" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(90, 161536)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oz9xLZGW_nCy", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "0e671a14-9a28-4302-f13b-d77b8f15fa9e" | |
}, | |
"source": [ | |
"len(ex_samples) # list of strings" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"90" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zbtUia6g_oNk", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "5c945e3c-6b27-4255-c563-23e0ec6d722e" | |
}, | |
"source": [ | |
"len(ex_snps) # list of strings" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"161536" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4c6qd9Su_fLm", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 102 | |
}, | |
"outputId": "162f8fc8-2284-4606-d71d-6cfeec79dde2" | |
}, | |
"source": [ | |
"ex_geno[:5,:5]" | |
], | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"array([[2, 2, 2, 1, 2],\n", | |
" [2, 2, 2, 1, 1],\n", | |
" [1, 1, 2, 1, 2],\n", | |
" [2, 2, 2, 1, 2],\n", | |
" [2, 2, 1, 0, 1]])" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "sY6_J45e_frt", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "1882e14c-baf4-4c57-e682-ac4495418b5a" | |
}, | |
"source": [ | |
"ex_samples[:5]" | |
], | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['CH18526', 'CH18524', 'CH18529', 'CH18558', 'CH18532']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 14 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7cVCLy8J_i0g", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "e52371d5-62cc-4081-b42e-5148450c4e01" | |
}, | |
"source": [ | |
"ex_snps[:5]" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['rs3094315', 'rs4040617', 'rs4075116', 'rs9442385', 'rs6685064']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 15 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment