Skip to content

Instantly share code, notes, and snippets.

@rvernica
Created March 25, 2017 04:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvernica/04fc7ed530366787c17c3b294bf95848 to your computer and use it in GitHub Desktop.
Save rvernica/04fc7ed530366787c17c3b294bf95848 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Import [dbNSFP 2.9](https://github.com/Paradigm4/variant_warehouse/tree/master/load_dbnsfpv2.9) into SciDB\n",
"\n",
"Once data is imported we can run:\n",
"\n",
" iquery --afl --query \"\n",
" project(\n",
" cross_join(\n",
" between(DBNSFP_V2p9_VARIANT, 6, 538204, null, 6, 538206, null) as variant, \n",
" DBNSFP_V2p9_CHROMOSOME as chrom, \n",
" variant.chromosome_id, \n",
" chrom.chromosome_id), \n",
" chromosome, ref, alt, aaalt, SIFT_score, SIFT_converted_rankscore)\"\n",
" {chromosome_id,pos,variant_no} chromosome,ref,alt,aaalt,SIFT_score,SIFT_converted_rankscore\n",
" {6,538204,0} '7','C','A','S','0.0',0.90636\n",
" {6,538204,1} '7','C','G','S','0.0',0.90636\n",
" {6,538205,0} '7','C','A','M','0.0',0.90636\n",
" {6,538205,1} '7','C','G','T','0.0',0.90636\n",
" {6,538205,2} '7','C','T','K','0.0',0.90636\n",
" {6,538206,0} '7','T','A','W','0.0',0.90636\n",
" {6,538206,1} '7','T','C','G','0.0',0.90636"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import scidbpy\n",
"\n",
"db = scidbpy.connect()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th>chromosome</th>\n",
" <th>ref</th>\n",
" <th>alt</th>\n",
" <th>aaalt</th>\n",
" <th>SIFT_score</th>\n",
" <th>SIFT_converted_rankscore</th>\n",
" </tr>\n",
" <tr>\n",
" <th>chromosome_id</th>\n",
" <th>pos</th>\n",
" <th>variant_no</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"7\" valign=\"top\">6</th>\n",
" <th rowspan=\"2\" valign=\"top\">538204</th>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>A</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>G</td>\n",
" <td>S</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">538205</th>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>A</td>\n",
" <td>M</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>G</td>\n",
" <td>T</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>T</td>\n",
" <td>K</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">538206</th>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>T</td>\n",
" <td>A</td>\n",
" <td>W</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7</td>\n",
" <td>T</td>\n",
" <td>C</td>\n",
" <td>G</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chromosome ref alt aaalt SIFT_score \\\n",
"chromosome_id pos variant_no \n",
"6 538204 0 7 C A S 0.0 \n",
" 1 7 C G S 0.0 \n",
" 538205 0 7 C A M 0.0 \n",
" 1 7 C G T 0.0 \n",
" 2 7 C T K 0.0 \n",
" 538206 0 7 T A W 0.0 \n",
" 1 7 T C G 0.0 \n",
"\n",
" SIFT_converted_rankscore \n",
"chromosome_id pos variant_no \n",
"6 538204 0 0.90636 \n",
" 1 0.90636 \n",
" 538205 0 0.90636 \n",
" 1 0.90636 \n",
" 2 0.90636 \n",
" 538206 0 0.90636 \n",
" 1 0.90636 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## See how to create DBNSFP_V2p9_VARIANT_l at the end of the notebook\n",
"df = db.cross_join(\n",
" db.wrap_array('DBNSFP_V2p9_VARIANT_l').between(\n",
" 6, 538204, 'null',\n",
" 6, 538206, 'null'),\n",
" db.wrap_array('DBNSFP_V2p9_CHROMOSOME'),\n",
" (0, 0)).project(\n",
" 'chromosome', 'ref', 'alt', 'aaalt', 'SIFT_score', 'SIFT_converted_rankscore').todataframe()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th colspan=\"3\" halign=\"left\">chromosome</th>\n",
" <th colspan=\"3\" halign=\"left\">ref</th>\n",
" <th colspan=\"3\" halign=\"left\">alt</th>\n",
" <th colspan=\"3\" halign=\"left\">aaalt</th>\n",
" <th colspan=\"3\" halign=\"left\">SIFT_score</th>\n",
" <th colspan=\"3\" halign=\"left\">SIFT_converted_rankscore</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>variant_no</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>chromosome_id</th>\n",
" <th>pos</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"3\" valign=\"top\">6</th>\n",
" <th>538204</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>None</td>\n",
" <td>C</td>\n",
" <td>C</td>\n",
" <td>None</td>\n",
" <td>A</td>\n",
" <td>G</td>\n",
" <td>None</td>\n",
" <td>S</td>\n",
" <td>S</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>None</td>\n",
" <td>0.90636</td>\n",
" <td>0.90636</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>538205</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>C</td>\n",
" <td>C</td>\n",
" <td>C</td>\n",
" <td>A</td>\n",
" <td>G</td>\n",
" <td>T</td>\n",
" <td>M</td>\n",
" <td>T</td>\n",
" <td>K</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.90636</td>\n",
" <td>0.90636</td>\n",
" <td>0.90636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>538206</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>None</td>\n",
" <td>T</td>\n",
" <td>T</td>\n",
" <td>None</td>\n",
" <td>A</td>\n",
" <td>C</td>\n",
" <td>None</td>\n",
" <td>W</td>\n",
" <td>G</td>\n",
" <td>None</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>None</td>\n",
" <td>0.90636</td>\n",
" <td>0.90636</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" chromosome ref alt aaalt \\\n",
"variant_no 0 1 2 0 1 2 0 1 2 0 1 \n",
"chromosome_id pos \n",
"6 538204 7 7 None C C None A G None S S \n",
" 538205 7 7 7 C C C A G T M T \n",
" 538206 7 7 None T T None A C None W G \n",
"\n",
" SIFT_score SIFT_converted_rankscore \\\n",
"variant_no 2 0 1 2 0 \n",
"chromosome_id pos \n",
"6 538204 None 0.0 0.0 None 0.90636 \n",
" 538205 K 0.0 0.0 0.0 0.90636 \n",
" 538206 None 0.0 0.0 None 0.90636 \n",
"\n",
" \n",
"variant_no 1 2 \n",
"chromosome_id pos \n",
"6 538204 0.90636 NaN \n",
" 538205 0.90636 0.90636 \n",
" 538206 0.90636 NaN "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.unstack('variant_no')"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Create a new array with bounded dimensions and copy the data into it to avoid [#13](https://github.com/Paradigm4/SciDB/issues/13)\n",
"\n",
" create array DBNSFP_V2p9_VARIANT_l<\n",
" ref:string,\n",
" alt:string,\n",
" aaref:string,\n",
" aaalt:string,\n",
" rs_dbSNP141:string,\n",
" hg18_pos:int64,\n",
" hg38_chr:string,\n",
" hg38_pos:int64,\n",
" genename:string,\n",
" Uniprot_acc:string,\n",
" Uniprot_id:string,\n",
" Uniprot_aapos:string,\n",
" Interpro_domain:string,\n",
" cds_strand:string,\n",
" refcodon:string,\n",
" SLR_test_statistic:double,\n",
" codonpos:uint8,\n",
" fold_degenerate:uint8,\n",
" Ancestral_allele:string,\n",
" Ensembl_geneid:string,\n",
" Ensembl_transcriptid:string,\n",
" aapos:string,\n",
" aapos_SIFT:string,\n",
" aapos_FATHMM:string,\n",
" SIFT_score:string,\n",
" SIFT_converted_rankscore:double,\n",
" SIFT_pred:string,\n",
" Polyphen2_HDIV_score:string,\n",
" Polyphen2_HDIV_rankscore:double,\n",
" Polyphen2_HDIV_pred:string,\n",
" Polyphen2_HVAR_score:string,\n",
" Polyphen2_HVAR_rankscore:double,\n",
" Polyphen2_HVAR_pred:string,\n",
" LRT_score:double,\n",
" LRT_converted_rankscore:double,\n",
" LRT_pred:string,\n",
" MutationTaster_score:string,\n",
" MutationTaster_converted_rankscore:double,\n",
" MutationTaster_pred:string,\n",
" MutationAssessor_score:double,\n",
" MutationAssessor_rankscore:double,\n",
" MutationAssessor_pred:string,\n",
" FATHMM_score:double,\n",
" FATHMM_rankscore:double,\n",
" FATHMM_pred:string,\n",
" MetaSVM_score:double,\n",
" MetaSVM_rankscore:double,\n",
" MetaSVM_pred:string,\n",
" MetaLR_score:double,\n",
" MetaLR_rankscore:double,\n",
" MetaLR_pred:string,\n",
" Reliability_index:uint8,\n",
" VEST3_score:double,\n",
" VEST3_rankscore:double,\n",
" PROVEAN_score:string,\n",
" PROVEAN_converted_rankscore:double,\n",
" PROVEAN_pred:string,\n",
" CADD_raw:double,\n",
" CADD_raw_rankscore:double,\n",
" CADD_phred:double,\n",
" GERPPP_NR:double,\n",
" GERPPP_RS:double,\n",
" GERPPP_RS_rankscore:double,\n",
" phyloP46way_primate:double,\n",
" pyloP46way_primate_rankscore:double,\n",
" phyloP46way_placental:double,\n",
" phyloP46way_placental_rankscore:double,\n",
" phyloP100way_vertebrate:double,\n",
" phyloP100way_vertebrate_rankscore:double,\n",
" phastCons46way_primate:double,\n",
" phastCons46way_primate_rankscore:double,\n",
" phastCons46way_placental:double,\n",
" phastCons46way_placental_rankscore:double,\n",
" phastCons100way_vertebrate:double,\n",
" phastCons100way_vertebrate_rankscore:double,\n",
" SiPhy_29way_pi:string,\n",
" SiPhy_29way_logOdds:double,\n",
" SiPhy_29way_logOdds_rankscore:double,\n",
" LRT_Omega:double,\n",
" UniSNP_ids:string,\n",
" KGp1_AC:int64,\n",
" KGp1_AF:double,\n",
" KGp1_AFR_AC:int64,\n",
" KGp1_AFR_AF:double,\n",
" KGp1_EUR_AC:int64,\n",
" KGp1_EUR_AF:double,\n",
" KGp1_AMR_AC:int64,\n",
" KGp1_AMR_AF:double,\n",
" KGp1_ASN_AC:int64,\n",
" KGp1_ASN_AF:double,\n",
" ESP6500_AA_AF:double,\n",
" ESP6500_EA_AF:double,\n",
" ARIC5606_AA_AC:int64,\n",
" ARIC5606_AA_AF:double,\n",
" ARIC5606_EA_AC:int64,\n",
" ARIC5606_EA_AF:double,\n",
" ExAC_AC:int64,\n",
" ExAC_AF:double,\n",
" ExAC_Adj_AC:int64,\n",
" ExAC_Adj_AF:double,\n",
" ExAC_AFR_AC:int64,\n",
" ExAC_AFR_AF:double,\n",
" ExAC_AMR_AC:int64,\n",
" ExAC_AMR_AF:double,\n",
" ExAC_EAS_AC:int64,\n",
" ExAC_EAS_AF:double,\n",
" ExAC_FIN_AC:int64,\n",
" ExAC_FIN_AF:double,\n",
" ExAC_NFE_AC:int64,\n",
" ExAC_NFE_AF:double,\n",
" null_SAS_AC:int64,\n",
" ExAC_SAS_AF:double,\n",
" clinvar_rs:string,\n",
" clinvar_clnsig:int8,\n",
" clinvar_trait:string,\n",
" COSMIC_ID:string,\n",
" COSMIC_CNT:int64>\n",
" [chromosome_id=0:6:0:1; pos=193200:810160:0:10000000; variant_no=0:19:0:20];\n",
"\n",
" store(\n",
" redimension(DBNSFP_V2p9_VARIANT, DBNSFP_V2p9_VARIANT_l),\n",
" DBNSFP_V2p9_VARIANT_l);"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment