Created
March 25, 2017 04:52
-
-
Save rvernica/04fc7ed530366787c17c3b294bf95848 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"First, import [dbNSFP 2.9](https://github.com/Paradigm4/variant_warehouse/tree/master/load_dbnsfpv2.9) into SciDB.\n", | |
"\n", | |
"Once the data is imported, we can run the query below from the command line (the rows that follow the query are its output):\n", | |
"\n", | |
" iquery --afl --query \"\n", | |
" project(\n", | |
" cross_join(\n", | |
" between(DBNSFP_V2p9_VARIANT, 6, 538204, null, 6, 538206, null) as variant, \n", | |
" DBNSFP_V2p9_CHROMOSOME as chrom, \n", | |
" variant.chromosome_id, \n", | |
" chrom.chromosome_id), \n", | |
" chromosome, ref, alt, aaalt, SIFT_score, SIFT_converted_rankscore)\"\n", | |
" {chromosome_id,pos,variant_no} chromosome,ref,alt,aaalt,SIFT_score,SIFT_converted_rankscore\n", | |
" {6,538204,0} '7','C','A','S','0.0',0.90636\n", | |
" {6,538204,1} '7','C','G','S','0.0',0.90636\n", | |
" {6,538205,0} '7','C','A','M','0.0',0.90636\n", | |
" {6,538205,1} '7','C','G','T','0.0',0.90636\n", | |
" {6,538205,2} '7','C','T','K','0.0',0.90636\n", | |
" {6,538206,0} '7','T','A','W','0.0',0.90636\n", | |
" {6,538206,1} '7','T','C','G','0.0',0.90636" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import scidbpy\n", | |
"\n", | |
"db = scidbpy.connect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>chromosome</th>\n", | |
" <th>ref</th>\n", | |
" <th>alt</th>\n", | |
" <th>aaalt</th>\n", | |
" <th>SIFT_score</th>\n", | |
" <th>SIFT_converted_rankscore</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>chromosome_id</th>\n", | |
" <th>pos</th>\n", | |
" <th>variant_no</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"7\" valign=\"top\">6</th>\n", | |
" <th rowspan=\"2\" valign=\"top\">538204</th>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>A</td>\n", | |
" <td>S</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>G</td>\n", | |
" <td>S</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">538205</th>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>A</td>\n", | |
" <td>M</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>G</td>\n", | |
" <td>T</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>T</td>\n", | |
" <td>K</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">538206</th>\n", | |
" <th>0</th>\n", | |
" <td>7</td>\n", | |
" <td>T</td>\n", | |
" <td>A</td>\n", | |
" <td>W</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>7</td>\n", | |
" <td>T</td>\n", | |
" <td>C</td>\n", | |
" <td>G</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" chromosome ref alt aaalt SIFT_score \\\n", | |
"chromosome_id pos variant_no \n", | |
"6 538204 0 7 C A S 0.0 \n", | |
" 1 7 C G S 0.0 \n", | |
" 538205 0 7 C A M 0.0 \n", | |
" 1 7 C G T 0.0 \n", | |
" 2 7 C T K 0.0 \n", | |
" 538206 0 7 T A W 0.0 \n", | |
" 1 7 T C G 0.0 \n", | |
"\n", | |
" SIFT_converted_rankscore \n", | |
"chromosome_id pos variant_no \n", | |
"6 538204 0 0.90636 \n", | |
" 1 0.90636 \n", | |
" 538205 0 0.90636 \n", | |
" 1 0.90636 \n", | |
" 2 0.90636 \n", | |
" 538206 0 0.90636 \n", | |
" 1 0.90636 " | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"## See how to create DBNSFP_V2p9_VARIANT_l at the end of the notebook\n", | |
"df = db.cross_join(\n", | |
" db.wrap_array('DBNSFP_V2p9_VARIANT_l').between(\n", | |
" 6, 538204, 'null',\n", | |
" 6, 538206, 'null'),\n", | |
" db.wrap_array('DBNSFP_V2p9_CHROMOSOME'),\n", | |
" (0, 0)).project(\n", | |
" 'chromosome', 'ref', 'alt', 'aaalt', 'SIFT_score', 'SIFT_converted_rankscore').todataframe()\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false, | |
"deletable": true, | |
"editable": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th colspan=\"3\" halign=\"left\">chromosome</th>\n", | |
" <th colspan=\"3\" halign=\"left\">ref</th>\n", | |
" <th colspan=\"3\" halign=\"left\">alt</th>\n", | |
" <th colspan=\"3\" halign=\"left\">aaalt</th>\n", | |
" <th colspan=\"3\" halign=\"left\">SIFT_score</th>\n", | |
" <th colspan=\"3\" halign=\"left\">SIFT_converted_rankscore</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>variant_no</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>chromosome_id</th>\n", | |
" <th>pos</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">6</th>\n", | |
" <th>538204</th>\n", | |
" <td>7</td>\n", | |
" <td>7</td>\n", | |
" <td>None</td>\n", | |
" <td>C</td>\n", | |
" <td>C</td>\n", | |
" <td>None</td>\n", | |
" <td>A</td>\n", | |
" <td>G</td>\n", | |
" <td>None</td>\n", | |
" <td>S</td>\n", | |
" <td>S</td>\n", | |
" <td>None</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>None</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>538205</th>\n", | |
" <td>7</td>\n", | |
" <td>7</td>\n", | |
" <td>7</td>\n", | |
" <td>C</td>\n", | |
" <td>C</td>\n", | |
" <td>C</td>\n", | |
" <td>A</td>\n", | |
" <td>G</td>\n", | |
" <td>T</td>\n", | |
" <td>M</td>\n", | |
" <td>T</td>\n", | |
" <td>K</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>0.90636</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>538206</th>\n", | |
" <td>7</td>\n", | |
" <td>7</td>\n", | |
" <td>None</td>\n", | |
" <td>T</td>\n", | |
" <td>T</td>\n", | |
" <td>None</td>\n", | |
" <td>A</td>\n", | |
" <td>C</td>\n", | |
" <td>None</td>\n", | |
" <td>W</td>\n", | |
" <td>G</td>\n", | |
" <td>None</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>None</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>0.90636</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" chromosome ref alt aaalt \\\n", | |
"variant_no 0 1 2 0 1 2 0 1 2 0 1 \n", | |
"chromosome_id pos \n", | |
"6 538204 7 7 None C C None A G None S S \n", | |
" 538205 7 7 7 C C C A G T M T \n", | |
" 538206 7 7 None T T None A C None W G \n", | |
"\n", | |
" SIFT_score SIFT_converted_rankscore \\\n", | |
"variant_no 2 0 1 2 0 \n", | |
"chromosome_id pos \n", | |
"6 538204 None 0.0 0.0 None 0.90636 \n", | |
" 538205 K 0.0 0.0 0.0 0.90636 \n", | |
" 538206 None 0.0 0.0 None 0.90636 \n", | |
"\n", | |
" \n", | |
"variant_no 1 2 \n", | |
"chromosome_id pos \n", | |
"6 538204 0.90636 NaN \n", | |
" 538205 0.90636 0.90636 \n", | |
" 538206 0.90636 NaN " | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.unstack('variant_no')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"deletable": true, | |
"editable": true | |
}, | |
"source": [ | |
"To avoid issue [#13](https://github.com/Paradigm4/SciDB/issues/13), create a new array with bounded dimensions (`DBNSFP_V2p9_VARIANT_l`, the array queried from Python above) and copy the data into it:\n", | |
"\n", | |
" create array DBNSFP_V2p9_VARIANT_l<\n", | |
" ref:string,\n", | |
" alt:string,\n", | |
" aaref:string,\n", | |
" aaalt:string,\n", | |
" rs_dbSNP141:string,\n", | |
" hg18_pos:int64,\n", | |
" hg38_chr:string,\n", | |
" hg38_pos:int64,\n", | |
" genename:string,\n", | |
" Uniprot_acc:string,\n", | |
" Uniprot_id:string,\n", | |
" Uniprot_aapos:string,\n", | |
" Interpro_domain:string,\n", | |
" cds_strand:string,\n", | |
" refcodon:string,\n", | |
" SLR_test_statistic:double,\n", | |
" codonpos:uint8,\n", | |
" fold_degenerate:uint8,\n", | |
" Ancestral_allele:string,\n", | |
" Ensembl_geneid:string,\n", | |
" Ensembl_transcriptid:string,\n", | |
" aapos:string,\n", | |
" aapos_SIFT:string,\n", | |
" aapos_FATHMM:string,\n", | |
" SIFT_score:string,\n", | |
" SIFT_converted_rankscore:double,\n", | |
" SIFT_pred:string,\n", | |
" Polyphen2_HDIV_score:string,\n", | |
" Polyphen2_HDIV_rankscore:double,\n", | |
" Polyphen2_HDIV_pred:string,\n", | |
" Polyphen2_HVAR_score:string,\n", | |
" Polyphen2_HVAR_rankscore:double,\n", | |
" Polyphen2_HVAR_pred:string,\n", | |
" LRT_score:double,\n", | |
" LRT_converted_rankscore:double,\n", | |
" LRT_pred:string,\n", | |
" MutationTaster_score:string,\n", | |
" MutationTaster_converted_rankscore:double,\n", | |
" MutationTaster_pred:string,\n", | |
" MutationAssessor_score:double,\n", | |
" MutationAssessor_rankscore:double,\n", | |
" MutationAssessor_pred:string,\n", | |
" FATHMM_score:double,\n", | |
" FATHMM_rankscore:double,\n", | |
" FATHMM_pred:string,\n", | |
" MetaSVM_score:double,\n", | |
" MetaSVM_rankscore:double,\n", | |
" MetaSVM_pred:string,\n", | |
" MetaLR_score:double,\n", | |
" MetaLR_rankscore:double,\n", | |
" MetaLR_pred:string,\n", | |
" Reliability_index:uint8,\n", | |
" VEST3_score:double,\n", | |
" VEST3_rankscore:double,\n", | |
" PROVEAN_score:string,\n", | |
" PROVEAN_converted_rankscore:double,\n", | |
" PROVEAN_pred:string,\n", | |
" CADD_raw:double,\n", | |
" CADD_raw_rankscore:double,\n", | |
" CADD_phred:double,\n", | |
" GERPPP_NR:double,\n", | |
" GERPPP_RS:double,\n", | |
" GERPPP_RS_rankscore:double,\n", | |
" phyloP46way_primate:double,\n", | |
" pyloP46way_primate_rankscore:double,\n", | |
" phyloP46way_placental:double,\n", | |
" phyloP46way_placental_rankscore:double,\n", | |
" phyloP100way_vertebrate:double,\n", | |
" phyloP100way_vertebrate_rankscore:double,\n", | |
" phastCons46way_primate:double,\n", | |
" phastCons46way_primate_rankscore:double,\n", | |
" phastCons46way_placental:double,\n", | |
" phastCons46way_placental_rankscore:double,\n", | |
" phastCons100way_vertebrate:double,\n", | |
" phastCons100way_vertebrate_rankscore:double,\n", | |
" SiPhy_29way_pi:string,\n", | |
" SiPhy_29way_logOdds:double,\n", | |
" SiPhy_29way_logOdds_rankscore:double,\n", | |
" LRT_Omega:double,\n", | |
" UniSNP_ids:string,\n", | |
" KGp1_AC:int64,\n", | |
" KGp1_AF:double,\n", | |
" KGp1_AFR_AC:int64,\n", | |
" KGp1_AFR_AF:double,\n", | |
" KGp1_EUR_AC:int64,\n", | |
" KGp1_EUR_AF:double,\n", | |
" KGp1_AMR_AC:int64,\n", | |
" KGp1_AMR_AF:double,\n", | |
" KGp1_ASN_AC:int64,\n", | |
" KGp1_ASN_AF:double,\n", | |
" ESP6500_AA_AF:double,\n", | |
" ESP6500_EA_AF:double,\n", | |
" ARIC5606_AA_AC:int64,\n", | |
" ARIC5606_AA_AF:double,\n", | |
" ARIC5606_EA_AC:int64,\n", | |
" ARIC5606_EA_AF:double,\n", | |
" ExAC_AC:int64,\n", | |
" ExAC_AF:double,\n", | |
" ExAC_Adj_AC:int64,\n", | |
" ExAC_Adj_AF:double,\n", | |
" ExAC_AFR_AC:int64,\n", | |
" ExAC_AFR_AF:double,\n", | |
" ExAC_AMR_AC:int64,\n", | |
" ExAC_AMR_AF:double,\n", | |
" ExAC_EAS_AC:int64,\n", | |
" ExAC_EAS_AF:double,\n", | |
" ExAC_FIN_AC:int64,\n", | |
" ExAC_FIN_AF:double,\n", | |
" ExAC_NFE_AC:int64,\n", | |
" ExAC_NFE_AF:double,\n", | |
" null_SAS_AC:int64,\n", | |
" ExAC_SAS_AF:double,\n", | |
" clinvar_rs:string,\n", | |
" clinvar_clnsig:int8,\n", | |
" clinvar_trait:string,\n", | |
" COSMIC_ID:string,\n", | |
" COSMIC_CNT:int64>\n", | |
" [chromosome_id=0:6:0:1; pos=193200:810160:0:10000000; variant_no=0:19:0:20];\n", | |
"\n", | |
" store(\n", | |
" redimension(DBNSFP_V2p9_VARIANT, DBNSFP_V2p9_VARIANT_l),\n", | |
" DBNSFP_V2p9_VARIANT_l);" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment