Get all the words in the titles of publications returned by a particular PubMed search, to make a word cloud.
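A minimal usage sketch (the script file name is hypothetical; the search term is taken as the first command-line argument):

    python pubmed_title_words.py "stem cells"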
# NB: written for Python 2 (urllib2); on Python 3 the equivalent calls live in urllib.request and urllib.parse
import sys
import json
import urllib  # for URL-encoding the search term
import urllib2
import re
from collections import Counter
def get_ids(term, ids=None, retstart=0, retmax=1000):
    """
    Return all PubMed IDs of articles matching a term,
    fetched recursively in batches of `retmax` records.
    """
    if ids is None:  # avoid a mutable default argument
        ids = list()
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("term=%s" % urllib.quote_plus(term))  # URL-encode the term
    options.append("retstart=%i" % retstart)  # resume query where it left off
    options.append("retmax=%i" % retmax)
    options.append("retmode=json")
    # perform query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    # if there are hits, add them to the ids list
    if len(data["esearchresult"]["idlist"]) > 0:
        ids += data["esearchresult"]["idlist"]
        # if there are more records than fetched so far, recurse into the next batch
        if int(data["esearchresult"]["count"]) > retstart + retmax:
            return get_ids(term, ids, retstart + retmax)
    # all records collected (or no hits): return the ids list
    return ids
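# For reference, the esearch JSON that get_ids parses has this shape
# (a sketch; the field values shown here are purely illustrative):
# {"esearchresult": {"count": "2530", "retmax": "1000", "retstart": "0",
#                    "idlist": ["26527123", "26516123", ...]}}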
def get_titles(ids, titles=None, retmax=100):
    """
    Return the titles of publications given their PubMed IDs,
    fetched recursively in batches of `retmax` ids per request.
    """
    if titles is None:  # avoid a mutable default argument
        titles = list()
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("id=%s" % ",".join(ids[:retmax]))  # query one batch of ids at a time
    options.append("retmode=json")
    # perform query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    # if there are results, collect the title of each record
    if "result" in data:
        for pmid in data["result"].keys():
            if pmid != "uids":  # skip the record listing the ids themselves
                titles.append(data["result"][pmid]["title"])
    # if there are ids beyond this batch, recurse on the remainder
    if len(ids) > retmax:
        return get_titles(ids[retmax:], titles)
    else:
        return titles
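# For reference, the esummary JSON that get_titles parses has this shape
# (a sketch; values are illustrative): the "result" dict maps each pmid to its
# summary record, plus a "uids" key listing the ids, which is skipped above:
# {"result": {"uids": ["26527123", ...],
#             "26527123": {"title": "Some article title.", ...}, ...}}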
term = sys.argv[1]
# get pubmed ids of publications with term
ids = get_ids(term)
# get titles of publications
titles = get_titles(ids)
# get all words in a single list, stripping periods and lowercasing
words = [re.sub(r"\.", "", word).lower() for title in titles for word in title.split()]
# count word frequency
count = Counter(words)
# print the top 20 words (Counter.most_common sorts by descending frequency)
print(count.most_common(20))
# remove common stopwords (mainly articles and prepositions), plus single digits
common_words = ["of", "in", "and", "the", "as", "i", "not", "to", "use", "by", "a", "with", "an", "for", "during", "is", "on", "from"]
common_words += [str(x) for x in range(10)]
words_filtered = [word for word in words if word not in common_words]
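# A fuller stopword list could be swapped in here; a hedged sketch, assuming
# NLTK and its "stopwords" corpus are installed (not part of the original gist):
# from nltk.corpus import stopwords
# words_filtered = [word for word in words if word not in stopwords.words("english")]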
# looks good, let's save the word list
with open("%s_pubmed_title_wordlist.txt" % term, "w") as handle:
    for word in words_filtered:
        try:
            handle.write(word + "\n")
        except UnicodeEncodeError:  # skip words with non-ASCII characters
            continue
# Let's save the frequency of each word
with open("%s_pubmed_title_wordlist.frequency.txt" % term, "w") as handle:
    for word, frequency in Counter(words_filtered).most_common():
        if frequency > 10:  # keep only reasonably frequent words
            try:
                handle.write("\t".join([str(frequency), word]) + "\n")
            except UnicodeEncodeError:
                continue
# Let's have each word represented int(log10(frequency)) times
import numpy as np
with open("%s_pubmed_title_wordlist.log10.txt" % term, "w") as handle:
    for word, frequency in Counter(words_filtered).most_common():
        if frequency > 10:
            for _ in range(int(np.log10(frequency))):
                try:
                    handle.write(word + "\n")
                except UnicodeEncodeError:
                    continue
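# To render the actual word cloud, one option (not part of the original gist)
# is the third-party `wordcloud` package; a minimal sketch, assuming it is
# installed (`pip install wordcloud`):
# from wordcloud import WordCloud
# cloud = WordCloud().generate(" ".join(words_filtered))
# cloud.to_file("%s_pubmed_title_wordcloud.png" % term)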