Last active
November 8, 2015 10:23
-
-
Save afrendeiro/6ec23ce2d0317a160e8f to your computer and use it in GitHub Desktop.
Get all the words in the titles of publications from a particular PubMed search, to make a wordcloud.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import math
import re
import sys
import urllib2

from collections import Counter
def get_ids(term, ids=None, retstart=0, retmax=1000):
    """
    Return all PubMed ids of articles containing a term, recursively
    paging through the esearch results `retmax` records at a time.

    :param term: search term passed to the NCBI esearch service.
    :param ids: accumulator list (used internally by the recursion).
    :param retstart: index of the first record to fetch on this page.
    :param retmax: number of records fetched per request.
    :returns: list of PubMed id strings.
    """
    # Avoid a shared mutable default argument: fresh list per top-level call.
    if ids is None:
        ids = list()
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("term=%s" % term)
    options.append("retstart=%i" % retstart)  # resume where the last page ended
    options.append("retmax=%s" % retmax)
    options.append("retmode=json")
    # get query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    idlist = data["esearchresult"]["idlist"]
    # no hits on this page: everything has been collected
    if not idlist:
        return ids
    ids += idlist
    # More records remain past this page: fetch the next one.
    # (The previous version compared `count` against `retmax` alone, and fell
    # off the end of the function -- returning None -- whenever the query had
    # a non-empty result with count <= retmax.)
    if int(data["esearchresult"]["count"]) > retstart + retmax:
        return get_ids(term, ids, retstart + retmax, retmax)
    return ids
def get_titles(ids, titles=None, retstart=0, retmax=100):
    """
    Return the titles of publications given a list of PubMed ids.

    Fetches at most `retmax` summary records per request from the NCBI
    esummary service and recurses over the remainder of `ids` until all
    titles have been collected.

    :param ids: list of PubMed id strings.
    :param titles: accumulator list (used internally by the recursion).
    :param retstart: offset passed to the esummary service.
    :param retmax: number of records fetched per request.
    :returns: list of title strings.
    """
    # Avoid a shared mutable default argument: fresh list per top-level call.
    if titles is None:
        titles = list()
    # nothing left to fetch
    if not ids:
        return titles
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("id=%s" % ",".join(ids[:retmax]))
    options.append("rettype=title")
    options.append("retstart=%i" % retstart)
    options.append("retmax=%s" % retmax)
    options.append("retmode=json")
    # get query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    # if there are hits, add their titles to the accumulator
    if "result" in data:
        for pmid in data["result"]:
            if pmid != "uids":  # "uids" is the index entry, not a record
                titles.append(data["result"][pmid]["title"])
    # Recurse on the remaining ids, if any.
    # NOTE: `ids` is sliced each call, so `retstart` is NOT advanced here --
    # the previous version both sliced AND advanced retstart, double-skipping
    # records, and its `> retmax` condition dropped any final remainder of
    # 1..retmax ids without ever fetching their titles.
    if ids[retmax:]:
        return get_titles(ids[retmax:], titles, retstart, retmax)
    return titles
# ---------------------------------------------------------------------------
# Script entry point.
# Usage: python <script> <search term>
# Builds three word-list files from the titles of all PubMed hits for the
# term, suitable for feeding into a wordcloud generator.
# ---------------------------------------------------------------------------
term = sys.argv[1]
# get pubmed ids of publications with term
ids = get_ids(term)
# get titles of publications
titles = get_titles(ids)
# get all words in a single list, remove punctuation
# (raw string fixes the invalid "\." escape in the original pattern)
words = [re.sub(r"\.", "", word).lower() for title in titles for word in title.split(" ")]
# remove common words (articles/prepositions mainly) and single digits
common_words = ["of", "in", "and", "the", "as", "i", "not", "to", "use", "by", "a", "with", "an", "for", "during", "is", "on", "from"]
common_words += [str(x) for x in range(10)]
words_filtered = [word for word in words if word not in common_words]
# save the word list; skip words the default codec cannot encode
with open("%s_pubmed_title_wordlist.txt" % term, "w") as handle:
    for word in words_filtered:
        try:
            handle.write(word + "\n")
        except UnicodeEncodeError:
            continue
# word frequencies, most frequent first -- computed once and reused below
# (previously recomputed for each output file, with an extra discarded
# `sorted(...)` expression)
frequencies = sorted(Counter(words_filtered).items(), key=lambda x: x[1], reverse=True)
# save the frequency of each word occurring more than 10 times
with open("%s_pubmed_title_wordlist.frequency.txt" % term, "w") as handle:
    for word, frequency in frequencies:
        if frequency > 10:
            try:
                handle.write("\t".join([str(frequency), word]) + "\n")
            except UnicodeEncodeError:
                continue
# save each word repeated int(log10(frequency)) times (wordcloud weighting);
# math.log10 replaces the previous mid-file numpy import -- identical result
# for the positive integer frequencies used here
with open("%s_pubmed_title_wordlist.log10.txt" % term, "w") as handle:
    for word, frequency in frequencies:
        if frequency > 10:
            for _ in range(int(math.log10(frequency))):
                try:
                    handle.write(word + "\n")
                except UnicodeEncodeError:
                    continue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment