# danielecook/format_pubs.py

## format_pubs.py
"""
Daniel E. Cook 2013
(danielecook.com)

This script takes a CSV file containing authors and the PubMed identifiers (PMIDs) of their publications, and writes one file of formatted HTML per author (pubs_formatted/<author>.txt).
The first row of the CSV should contain the authors, and each row below it their publications (as PMIDs). Anything you enter that is not a PMID is simply output as-is,
so you can add publications that are not in PubMed or that you want to display in a certain way.

This script might be useful for individuals who maintain publication lists for researchers at a university, for instance.

Requires BioPython:
pip install biopython

The way the publications are displayed can be customized using CSS. This CSS can be used if desired:

/* pubs */
.pub_title {
  font-weight: bold;
	font-size: 13px;
	margin: 0px;
	}
.pub_authors {
	color: #929292;
	font-size: 11px;
	margin: 0px;
}
.pub_info {
	font-size: 11px;
}
.pub_info a {
	padding-left: 3px;
	padding-right: 3px;
}

"""


from Bio import Entrez
from Bio import Medline
import csv
import os


# Set your email here.
# fetch_pub() assigns this value to Entrez.email before it queries PubMed.
email = "Danielecook@gmail.com"

def f7(seq):
	"""Return the items of seq as a list, keeping only the first occurrence of each.

	Order of first appearance is preserved. Items are tracked in a set, so
	they must be hashable. (Adapted from a Stack Overflow answer.)
	"""
	unique = []
	already_seen = set()
	for item in seq:
		if item in already_seen:
			continue
		already_seen.add(item)
		unique.append(item)
	return unique

def csv_dict_array(f):
	"""Read a CSV of authors and publications into a per-author dictionary.

	The header row supplies the author names; every non-empty cell below a
	header is collected, in file order, into that author's list. Duplicate
	entries within one author's column are dropped (first occurrence wins).

	Args:
		f: Path of the CSV file to read.

	Returns:
		dict mapping each author (header cell) to the list of unique,
		non-empty values found in that author's column. Authors whose
		column is entirely empty do not appear in the result.
	"""
	import sys  # local import: only used to choose the open() call below

	# The 'U' mode flag no longer exists on Python 3.11+, where the csv
	# module expects newline='' instead. Python 2's open() has no newline
	# argument and still needs 'rU' to get universal newlines.
	if sys.version_info[0] >= 3:
		handle = open(f, newline='')
	else:
		handle = open(f, 'rU')

	auth_dict = {}
	# The with-block guarantees the file is closed even if parsing fails.
	with handle:
		for row in csv.DictReader(handle, dialect='excel'):
			for auth, value in row.items():
				# DictReader pads short rows with None and files surplus
				# cells under the key None; neither is a publication.
				if auth is None or not value:
					continue
				auth_dict.setdefault(auth, []).append(value)

	# Remove duplicates while keeping the original order.
	for auth in auth_dict:
		auth_dict[auth] = f7(auth_dict[auth])
	return auth_dict

def fetch_pub(pmid):
	"""Fetch PubMed records and format each one as an HTML snippet.

	Args:
		pmid: Iterable of entries from one author's column. Despite the
			singular name this is the whole list; each entry is either a
			PMID or free text.

	Returns:
		list with one formatted "<div class='pub'>" block per record
		retrieved. Any entry that cannot be converted to an integer,
		fetched or formatted is appended unchanged, so hand-written
		entries pass straight through to the output.

	Note:
		Record fields are interpolated into the markup as-is, without HTML
		escaping.
	"""
	Entrez.email = email

	# Built once, outside the loop. Every line starts at column 0, so the
	# result does not depend on how this source file is indented.
	template = "\n".join([
		"<div class='pub'>",
		"<div class='pub_title'>%s</div>",
		"<div class='pub_authors'>%s</div>",
		"<div class='pub_date'>%s</div>",
		"<div class='pub_journal_pages'>%s</div>",
		"%s",
		"</div>",
	])

	recs = []
	for v in pmid:
		# Progress output; the call form prints the same on Python 2 and 3.
		print(v)
		handle = None
		try:
			# Fetch the MEDLINE-formatted record for this PMID. int() raises
			# for free-text entries, which sends them to the fallback below.
			handle = Entrez.efetch(db="pubmed",id=int(v),retmode="text",rettype="medline")
			for p in Medline.parse(handle):
				pubmed_link = "<a class='pub_link' href='http://www.ncbi.nlm.nih.gov/pubmed/%s'>%s</a>" % (p['PMID'],p['PMID'])
				if 'PMC' in p:
					pubmed_link += " ( <a class='pmc_link' href='http://www.ncbi.nlm.nih.gov/pmc/articles/%s/'>Full Text</a> )" % (p['PMC'])

				formatted = template % (p['TI'],', '.join(p['AU']),p['DP'],p['SO'],pubmed_link)

				# Tabs are stripped from the whole snippet, field values included.
				recs.append(formatted.replace('\t','').strip())
		except Exception:
			# Deliberate best-effort fallback: emit the raw entry. Catching
			# Exception rather than using a bare except lets Ctrl-C and
			# SystemExit still stop the script.
			recs.append(v)
		finally:
			# Release the connection whether or not the fetch succeeded.
			if handle is not None:
				handle.close()

	return recs


# Driver: read pubs.csv and write one file of formatted publications per
# author into the pubs_formatted directory.
pubs = csv_dict_array("pubs.csv")
# Create the output directory once, before any file is written.
if not os.path.exists("pubs_formatted"):
	os.makedirs("pubs_formatted")
for auth,pub_list in pubs.items():
	# 'w' replaces the original 'w!a', which is not a valid mode string
	# (Python 3 rejects it with ValueError). The with-block closes the file
	# even if fetching or writing fails.
	with open("pubs_formatted" + "/" + auth + ".txt",'w') as f:
		f.write('\n'.join(fetch_pub(pub_list)))
	"""
	Daniel E. Cook 2013
	(danielecook.com)

	This script takes a CSV file containing authors and the PubMed identifiers (PMIDs) of their publications, and writes one file of formatted HTML per author (pubs_formatted/<author>.txt).
	The first row of the CSV should contain the authors, and each row below it their publications (as PMIDs). Anything you enter that is not a PMID is simply output as-is,
	so you can add publications that are not in PubMed or that you want to display in a certain way.

	This script might be useful for individuals who maintain publication lists for researchers at a university, for instance.

	Requires BioPython:
	pip install biopython

	The way the publications are displayed can be customized using CSS. This CSS can be used if desired:

	/* pubs */
	.pub_title {
	font-weight: bold;
	font-size: 13px;
	margin: 0px;
	}
	.pub_authors {
	color: #929292;
	font-size: 11px;
	margin: 0px;
	}
	.pub_info {
	font-size: 11px;
	}
	.pub_info a {
	padding-left: 3px;
	padding-right: 3px;
	}

	"""


	from Bio import Entrez
	from Bio import Medline
	import csv
	import os


	# Set your email here.
	# fetch_pub() assigns this value to Entrez.email before it queries PubMed.
	email = "Danielecook@gmail.com"

	def f7(seq):
		"""Return the items of seq as a list, keeping only the first occurrence of each.

		Order of first appearance is preserved. Items are tracked in a set,
		so they must be hashable. (Adapted from a Stack Overflow answer.)
		"""
		# NOTE(review): this definition repeats the f7 defined earlier in
		# the file; its body had lost its indentation and is restored here.
		# Confirm whether the duplicate is intentional.
		seen = set()
		seen_add = seen.add
		# set.add returns None, so "not seen_add(x)" is always true and
		# simply records x the first time it is encountered.
		return [x for x in seq if x not in seen and not seen_add(x)]

	def csv_dict_array(f):
		"""Read a CSV of authors and publications into a per-author dictionary.

		The header row supplies the author names; every non-empty cell below
		a header is collected, in file order, into that author's list.
		Duplicate entries within one author's column are dropped (first
		occurrence wins).

		Args:
			f: Path of the CSV file to read.

		Returns:
			dict mapping each author (header cell) to the list of unique,
			non-empty values found in that author's column. Authors whose
			column is entirely empty do not appear in the result.
		"""
		# NOTE(review): this definition repeats the csv_dict_array defined
		# earlier in the file; its body had lost its indentation and is
		# restored here. Confirm whether the duplicate is intentional.
		import sys  # local import: only used to choose the open() call below

		# The 'U' mode flag no longer exists on Python 3.11+, where the csv
		# module expects newline='' instead. Python 2's open() has no
		# newline argument and still needs 'rU' to get universal newlines.
		if sys.version_info[0] >= 3:
			handle = open(f, newline='')
		else:
			handle = open(f, 'rU')

		auth_dict = {}
		# The with-block guarantees the file is closed even if parsing fails.
		with handle:
			for row in csv.DictReader(handle, dialect='excel'):
				for auth, value in row.items():
					# DictReader pads short rows with None and files surplus
					# cells under the key None; neither is a publication.
					if auth is None or not value:
						continue
					auth_dict.setdefault(auth, []).append(value)

		# Remove duplicates while keeping the original order.
		for auth in auth_dict:
			auth_dict[auth] = f7(auth_dict[auth])
		return auth_dict

	def fetch_pub(pmid):
		"""Fetch PubMed records and format each one as an HTML snippet.

		Args:
			pmid: Iterable of entries from one author's column. Despite the
				singular name this is the whole list; each entry is either
				a PMID or free text.

		Returns:
			list with one formatted "<div class='pub'>" block per record
			retrieved. Any entry that cannot be converted to an integer,
			fetched or formatted is appended unchanged, so hand-written
			entries pass straight through to the output.

		Note:
			Record fields are interpolated into the markup as-is, without
			HTML escaping.
		"""
		# NOTE(review): this definition repeats the fetch_pub defined
		# earlier in the file; its body had lost its indentation and is
		# restored here. Confirm whether the duplicate is intentional.
		Entrez.email = email

		# Built once, outside the loop. Every line starts at column 0, so
		# the result does not depend on how this source file is indented.
		template = "\n".join([
			"<div class='pub'>",
			"<div class='pub_title'>%s</div>",
			"<div class='pub_authors'>%s</div>",
			"<div class='pub_date'>%s</div>",
			"<div class='pub_journal_pages'>%s</div>",
			"%s",
			"</div>",
		])

		recs = []
		for v in pmid:
			# Progress output; the call form prints the same on Python 2 and 3.
			print(v)
			handle = None
			try:
				# Fetch the MEDLINE-formatted record for this PMID. int()
				# raises for free-text entries, which sends them to the
				# fallback below.
				handle = Entrez.efetch(db="pubmed",id=int(v),retmode="text",rettype="medline")
				for p in Medline.parse(handle):
					pubmed_link = "<a class='pub_link' href='http://www.ncbi.nlm.nih.gov/pubmed/%s'>%s</a>" % (p['PMID'],p['PMID'])
					if 'PMC' in p:
						pubmed_link += " ( <a class='pmc_link' href='http://www.ncbi.nlm.nih.gov/pmc/articles/%s/'>Full Text</a> )" % (p['PMC'])

					formatted = template % (p['TI'],', '.join(p['AU']),p['DP'],p['SO'],pubmed_link)

					# Tabs are stripped from the whole snippet, field values included.
					recs.append(formatted.replace('\t','').strip())
			except Exception:
				# Deliberate best-effort fallback: emit the raw entry.
				# Catching Exception rather than using a bare except lets
				# Ctrl-C and SystemExit still stop the script.
				recs.append(v)
			finally:
				# Release the connection whether or not the fetch succeeded.
				if handle is not None:
					handle.close()

		return recs


	# NOTE(review): this driver repeats the one earlier in the file and, at
	# this indentation, runs inside that earlier loop. Its own nesting had
	# been flattened and is restored here. Confirm whether the duplicate is
	# intentional.
	pubs = csv_dict_array("pubs.csv")
	# Create the output directory once, before any file is written.
	if not os.path.exists("pubs_formatted"):
		os.makedirs("pubs_formatted")
	for auth,pub_list in pubs.items():
		# 'w' replaces the original 'w!a', which is not a valid mode string
		# (Python 3 rejects it with ValueError). The with-block closes the
		# file even if fetching or writing fails.
		with open("pubs_formatted" + "/" + auth + ".txt",'w') as f:
			f.write('\n'.join(fetch_pub(pub_list)))