Get all the words in the titles of publications returned by a particular PubMed search, to make a word cloud.
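A minimal usage sketch (the script file name is hypothetical; the search term is taken as the first command-line argument):

    python pubmed_title_words.py "stem cells"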
# NB: written for Python 2 (urllib2); on Python 3 the equivalent calls live in urllib.request and urllib.parse
import sys
import json
import urllib  # for URL-encoding the search term
import urllib2
import re
from collections import Counter
def get_ids(term, ids=None, retstart=0, retmax=1000):
    """
    Return all PubMed IDs of articles matching a term,
    fetched recursively in batches of `retmax` records.
    """
    if ids is None:  # avoid a mutable default argument
        ids = list()
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("term=%s" % urllib.quote_plus(term))  # URL-encode the term
    options.append("retstart=%i" % retstart)  # resume query where it left off
    options.append("retmax=%i" % retmax)
    options.append("retmode=json")
    # perform query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    # if there are hits, add them to the ids list
    if len(data["esearchresult"]["idlist"]) > 0:
        ids += data["esearchresult"]["idlist"]
        # if there are more records than fetched so far, recurse into the next batch
        if int(data["esearchresult"]["count"]) > retstart + retmax:
            return get_ids(term, ids, retstart + retmax)
    # all records collected (or no hits): return the ids list
    return ids
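# For reference, the esearch JSON that get_ids parses has this shape
# (a sketch; the field values shown here are purely illustrative):
# {"esearchresult": {"count": "2530", "retmax": "1000", "retstart": "0",
#                    "idlist": ["26527123", "26516123", ...]}}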
def get_titles(ids, titles=None, retmax=100):
    """
    Return the titles of publications given their PubMed IDs,
    fetched recursively in batches of `retmax` ids per request.
    """
    if titles is None:  # avoid a mutable default argument
        titles = list()
    base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&"
    # build query based on options
    options = list()
    options.append("id=%s" % ",".join(ids[:retmax]))  # query one batch of ids at a time
    options.append("retmode=json")
    # perform query
    url = base_url + "&".join(options)
    data = json.load(urllib2.urlopen(url))
    # if there are results, collect the title of each record
    if "result" in data:
        for pmid in data["result"].keys():
            if pmid != "uids":  # skip the record listing the ids themselves
                titles.append(data["result"][pmid]["title"])
    # if there are ids beyond this batch, recurse on the remainder
    if len(ids) > retmax:
        return get_titles(ids[retmax:], titles)
    else:
        return titles
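# For reference, the esummary JSON that get_titles parses has this shape
# (a sketch; values are illustrative): the "result" dict maps each pmid to its
# summary record, plus a "uids" key listing the ids, which is skipped above:
# {"result": {"uids": ["26527123", ...],
#             "26527123": {"title": "Some article title.", ...}, ...}}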
term = sys.argv[1]
# get pubmed ids of publications with term
ids = get_ids(term)
# get titles of publications
titles = get_titles(ids)
# get all words in a single list, stripping periods and lowercasing
words = [re.sub(r"\.", "", word).lower() for title in titles for word in title.split()]
# count word frequency
count = Counter(words)
# print the top 20 words (Counter.most_common sorts by descending frequency)
print(count.most_common(20))
# remove common stopwords (mainly articles and prepositions), plus single digits
common_words = ["of", "in", "and", "the", "as", "i", "not", "to", "use", "by", "a", "with", "an", "for", "during", "is", "on", "from"]
common_words += [str(x) for x in range(10)]
words_filtered = [word for word in words if word not in common_words]
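# A fuller stopword list could be swapped in here; a hedged sketch, assuming
# NLTK and its "stopwords" corpus are installed (not part of the original gist):
# from nltk.corpus import stopwords
# words_filtered = [word for word in words if word not in stopwords.words("english")]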
# looks good, let's save the word list
with open("%s_pubmed_title_wordlist.txt" % term, "w") as handle:
    for word in words_filtered:
        try:
            handle.write(word + "\n")
        except UnicodeEncodeError:  # skip words with non-ASCII characters
            continue
# Let's save the frequency of each word
with open("%s_pubmed_title_wordlist.frequency.txt" % term, "w") as handle:
    for word, frequency in Counter(words_filtered).most_common():
        if frequency > 10:  # keep only reasonably frequent words
            try:
                handle.write("\t".join([str(frequency), word]) + "\n")
            except UnicodeEncodeError:
                continue
# Let's have each word represented int(log10(frequency)) times
import numpy as np
with open("%s_pubmed_title_wordlist.log10.txt" % term, "w") as handle:
    for word, frequency in Counter(words_filtered).most_common():
        if frequency > 10:
            for _ in range(int(np.log10(frequency))):
                try:
                    handle.write(word + "\n")
                except UnicodeEncodeError:
                    continue
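# To render the actual word cloud, one option (not part of the original gist)
# is the third-party `wordcloud` package; a minimal sketch, assuming it is
# installed (`pip install wordcloud`):
# from wordcloud import WordCloud
# cloud = WordCloud().generate(" ".join(words_filtered))
# cloud.to_file("%s_pubmed_title_wordcloud.png" % term)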