Instantly share code, notes, and snippets.
Forked from macieksk/R_pubmed_download_authors_affiliations.ipynb
Created
January 11, 2021 22:41
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save sandeshregmi/2d701b15ce876be8ddc887fa350ea327 to your computer and use it in GitHub Desktop.
Download authors and their affiliations from Pubmed articles using R packages: RISmed, rentrez
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Download authors and their affiliations from Pubmed articles\n", | |
"#\n", | |
"#Author: Maciek Sykulski (macieksk@gmail.com), partially based on a solution by Tim Read\n", | |
"#\n", | |
"#Uses IRKernel in Jupyter Notebook https://github.com/IRkernel/IRkernel\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"8" | |
], | |
"text/latex": [ | |
"8" | |
], | |
"text/markdown": [ | |
"8" | |
], | |
"text/plain": [ | |
"[1] 8" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"RISmed\",repos='http://cran.us.r-project.org')\n", | |
"library(\"RISmed\")\n", | |
"\n", | |
"res <- EUtilsSummary('Sykulski M[author]', type='esearch', db='pubmed', mindate='2011', maxdate='2016')\n", | |
"\n", | |
"QueryCount(res)\n", | |
"#fetch <- EUtilsGet(res)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 234, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#attributes(res)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol class=list-inline>\n", | |
"\t<li>'XMLInternalDocument'</li>\n", | |
"\t<li>'XMLAbstractDocument'</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate*}\n", | |
"\\item 'XMLInternalDocument'\n", | |
"\\item 'XMLAbstractDocument'\n", | |
"\\end{enumerate*}\n" | |
], | |
"text/markdown": [ | |
"1. 'XMLInternalDocument'\n", | |
"2. 'XMLAbstractDocument'\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[1] \"XMLInternalDocument\" \"XMLAbstractDocument\"" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"rentrez\",repos='http://cran.us.r-project.org')\n", | |
"library(rentrez)\n", | |
"\n", | |
"your.ids <- attributes(res)$PMID #c(\"26386083\",\"26273372\",\"26066373\",\"25837167\",\"25466451\",\"25013473\")\n", | |
"# rentrez function to get the data from pubmed db\n", | |
"fetch.pubmed <- entrez_fetch(db = \"pubmed\", id = your.ids,\n", | |
" rettype = \"xml\", parsed = TRUE)\n", | |
"class(fetch.pubmed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol>\n", | |
"\t<li>'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'</li>\n", | |
"\t<li>'Spaced seeds improve k-mer-based metagenomic classification.'</li>\n", | |
"\t<li>'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'</li>\n", | |
"\t<li>'Multiple samples aCGH analysis for rare CNVs detection.'</li>\n", | |
"\t<li>'Functional performance of aCGH design for clinical cytogenetics.'</li>\n", | |
"\t<li>'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'</li>\n", | |
"\t<li>'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'</li>\n", | |
"\t<li>'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate}\n", | |
"\\item 'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'\n", | |
"\\item 'Spaced seeds improve k-mer-based metagenomic classification.'\n", | |
"\\item 'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'\n", | |
"\\item 'Multiple samples aCGH analysis for rare CNVs detection.'\n", | |
"\\item 'Functional performance of aCGH design for clinical cytogenetics.'\n", | |
"\\item 'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'\n", | |
"\\item 'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'\n", | |
"\\item 'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'\n", | |
"\\end{enumerate}\n" | |
], | |
"text/markdown": [ | |
"1. 'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'\n", | |
"2. 'Spaced seeds improve k-mer-based metagenomic classification.'\n", | |
"3. 'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'\n", | |
"4. 'Multiple samples aCGH analysis for rare CNVs detection.'\n", | |
"5. 'Functional performance of aCGH design for clinical cytogenetics.'\n", | |
"6. 'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'\n", | |
"7. 'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'\n", | |
"8. 'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[[1]]\n", | |
"[1] \"Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.\"\n", | |
"\n", | |
"[[2]]\n", | |
"[1] \"Spaced seeds improve k-mer-based metagenomic classification.\"\n", | |
"\n", | |
"[[3]]\n", | |
"[1] \"Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.\"\n", | |
"\n", | |
"[[4]]\n", | |
"[1] \"Multiple samples aCGH analysis for rare CNVs detection.\"\n", | |
"\n", | |
"[[5]]\n", | |
"[1] \"Functional performance of aCGH design for clinical cytogenetics.\"\n", | |
"\n", | |
"[[6]]\n", | |
"[1] \"Assessment of the role of copy-number variants in 150 patients with congenital heart defects.\"\n", | |
"\n", | |
"[[7]]\n", | |
"[1] \"Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.\"\n", | |
"\n", | |
"[[8]]\n", | |
"[1] \"Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.\"\n" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"XML\",repos='http://cran.us.r-project.org')\n", | |
"library(XML)\n", | |
"xpathApply(fetch.pubmed, '//PubmedArticle//MedlineCitation//Article//ArticleTitle',xmlValue)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Extract each author's name and affiliation from the fetched articles.\n", | |
"affil = xpathApply(fetch.pubmed, '//PubmedArticle//MedlineCitation//Article//AuthorList//Author',\n", | |
" function(x)xmlChildren(x))\n", | |
" \n", | |
"getAuthorAffil<-function(x)cbind(paste(xmlValue(x$ForeName),\n", | |
" xmlValue(x$LastName)),\n", | |
" ifelse(is.null(x[[\"AffiliationInfo\"]]),NA,\n", | |
" xmlValue(xmlChildren(x[[\"AffiliationInfo\"]])$Affiliation))\n", | |
" ) \n", | |
"affil<-do.call(rbind,lapply(affil,function(x)as.data.frame(getAuthorAffil(x),stringsAsFactors=FALSE)))\n", | |
"colnames(affil)<-c(\"name\",\"affil\") " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol class=list-inline>\n", | |
"\t<li>25</li>\n", | |
"\t<li>2</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate*}\n", | |
"\\item 25\n", | |
"\\item 2\n", | |
"\\end{enumerate*}\n" | |
], | |
"text/markdown": [ | |
"1. 25\n", | |
"2. 2\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[1] 25 2" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th></th><th scope=col>name</th><th scope=col>affil</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><th scope=row>1</th><td>Anna Gambin </td><td>1. Institute of Informatics, University of Warsaw, Warsaw, Poland.</td></tr>\n", | |
"\t<tr><th scope=row>2</th><td>Arlene Siefker-Radtke </td><td>1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"\t<tr><th scope=row>3</th><td>Ashish Madhav Kamat </td><td>1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"\t<tr><th scope=row>4</th><td>Barbara Wiśniowiecka-Kowalnik </td><td>1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.</td></tr>\n", | |
"\t<tr><th scope=row>5</th><td>Bogdan Czerniak </td><td>1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: bczernia@mdanderson.org.</td></tr>\n", | |
"\t<tr><th scope=row>6</th><td>Charles Chuanhai Guo </td><td>1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|ll}\n", | |
" & name & affil\\\\\n", | |
"\\hline\n", | |
"\t1 & Anna Gambin & 1. Institute of Informatics, University of Warsaw, Warsaw, Poland.\\\\\n", | |
"\t2 & Arlene Siefker-Radtke & 1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\t3 & Ashish Madhav Kamat & 1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\t4 & Barbara Wiśniowiecka-Kowalnik & 1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.\\\\\n", | |
"\t5 & Bogdan Czerniak & 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: bczernia@mdanderson.org.\\\\\n", | |
"\t6 & Charles Chuanhai Guo & 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
" name\n", | |
"1 Anna Gambin\n", | |
"2 Arlene Siefker-Radtke\n", | |
"3 Ashish Madhav Kamat\n", | |
"4 Barbara Wiśniowiecka-Kowalnik\n", | |
"5 Bogdan Czerniak\n", | |
"6 Charles Chuanhai Guo\n", | |
" affil\n", | |
"1 1. Institute of Informatics, University of Warsaw, Warsaw, Poland.\n", | |
"2 1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\n", | |
"3 1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\n", | |
"4 1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.\n", | |
"5 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: bczernia@mdanderson.org.\n", | |
"6 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA." | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"affil.agg<-aggregate(affil~name,data=affil,\n", | |
" FUN=function(x)paste(paste(1:length(unique(x)),unique(x),sep=\". \"),collapse=\" | \")) \n", | |
"dim(affil.agg)\n", | |
"head(affil.agg) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"Download: <a href=affil_out/coauthors_with_affil.tab>affil_out/coauthors_with_affil.tab</a>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"system(\"mkdir -p affil_out\")\n", | |
"fname<-\"affil_out/coauthors_with_affil.tab\"\n", | |
"write.table(affil.agg,file=fname,col.names=TRUE,row.names=TRUE,quote=FALSE,sep=\"\\t\")\n", | |
"library(IRdisplay)\n", | |
"IRdisplay::display_html(paste(\"Download: <a href=\",fname,\">\",fname,\"</a>\",sep=\"\"))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "R", | |
"language": "R", | |
"name": "ir" | |
}, | |
"language_info": { | |
"codemirror_mode": "r", | |
"file_extension": ".r", | |
"mimetype": "text/x-r-source", | |
"name": "R", | |
"pygments_lexer": "r", | |
"version": "3.2.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment