platinhom/0-EndnoteProcess.md

## 0-EndnoteProcess.md

      
    Raw
  

              0-EndnoteProcess.md
            
          
    Scripts for EndNote library processing.

modifyXML.py: Strips nonsense characters from the "Notes" column while preserving the Times Cited information; fixes improperly formatted DOIs (removes the "DOI:" prefix, converts to lower case).
prepareDOI.py: To pre-process doi number for getPDF.py script.
getPDF.py: Search PDF for endnote XML DOI records based on scihub.

Save the doi number in a file (one doi per line) and use the file as script input.

Each file is saved as "10.1021_ci111111a.pdf" in the current directory. You have to move the valid files to the "Done" directory and then run addPDF.py.
addPDF.py: Put the PDFs that were found in the "Done" directory; put PDFs you no longer want to search for in "Accept".

Give an EndNote XML file as input. Each PDF is moved to a directory named after the paper part of its DOI. A new XML file will be generated.

Move all the directories in "Done" to "Endnote library.Data/PDF" and import the new XML file. You may want to delete the old records first.
checkdone.sh: Takes the input file of getPDF.py (the list of DOI numbers) as input

and generates a "not.txt" file listing the DOIs that were not found.


## addPDF.py
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys

predoi="10.1021/"
pdfdir="Done/"

# Closing tags that end the DOI field of a record in the EndNote XML export.
doiendtag="</style></electronic-resource-num>"

def attach_pdfs(text, pdf_root=pdfdir, doi_prefix=predoi):
	'''Link downloaded PDFs into EndNote XML text and return the new text.

	For every "</urls>" tag that is not directly preceded by a pdf-urls
	element, the DOI that follows it (the text from `doi_prefix` up to the
	closing electronic-resource-num tags) is looked up on disk as
	"<pdf_root><prefix>_<suffix>.pdf".  When that file exists it is moved
	to "<pdf_root><suffix>/<prefix>_<suffix>.pdf" and a matching
	internal-pdf:// link is inserted just before "</urls>".

	text:       whole content of the EndNote XML file
	pdf_root:   directory (including trailing separator) holding the PDFs
	doi_prefix: leading part of the DOIs to handle
	Returns the XML text; records without a PDF are copied unchanged.'''
	out=[]
	cursor=0
	while True:
		urls_end=text.find("</urls>",cursor)
		if urls_end == -1:
			break
		out.append(text[cursor:urls_end])
		doi_end=text.find(doiendtag,urls_end)
		if doi_end == -1:
			# No DOI field follows this </urls>; copy the rest verbatim.
			cursor=urls_end
			break
		# Clamp the window start: a negative start would be counted from
		# the end of the string and silently miss an existing link.
		has_link=text.find("pdf-urls>",max(0,urls_end-50),urls_end) != -1
		doi_start=text.find(doi_prefix,urls_end,doi_end)
		if not has_link and doi_start != -1:
			parts=text[doi_start:doi_end].strip().split('/')
			if len(parts) >= 2:
				stem=parts[0]+"_"+parts[1]
				source=pdf_root+stem+".pdf"
				if os.path.exists(source):
					try:
						# os.renames creates the per-paper directory itself.
						os.renames(source,pdf_root+parts[1]+os.sep+stem+".pdf")
					except OSError as exc:
						# Keep going, but say why this record got no link.
						sys.stderr.write("Cannot move %s: %s\n" % (source,exc))
					else:
						out.append("<pdf-urls><url>internal-pdf://"+parts[1]+"/"+stem+".pdf"+"</url></pdf-urls>")
		out.append(text[urls_end:doi_end])
		cursor=doi_end
	#last part
	out.append(text[cursor:])
	return "".join(out)

if (__name__ == '__main__'):
	# Usage: addPDF.py library.xml  ->  writes library_new.xml
	fname=sys.argv[1]
	fnamelist=os.path.splitext(fname)
	fwname=fnamelist[0]+"_new"+fnamelist[1]
	with open(fname) as fr:
		content=fr.read()
	linked=attach_pdfs(content)
	with open(fwname,'w') as fw:
		fw.write(linked)

## checkdone.sh
#! /bin/bash
# Check a DOI list against the Done/ directory and write every DOI whose
# PDF is missing to not.txt (one DOI per line).
# Usage: checkdone.sh doi_list_file
if [ $# -lt 1 ] || [ ! -f "$1" ]; then
	echo "usage: $0 doi_list_file" >&2
	exit 1
fi
# Normalise line endings in place when dos2unix is installed; carriage
# returns are also stripped while reading, so the tool is optional.
if command -v dos2unix >/dev/null 2>&1; then
	dos2unix "$1"
fi
# Truncate the report (`echo > not.txt` used to leave a blank first line).
: > not.txt
# `|| [ -n "$line" ]` keeps a last line that has no trailing newline.
while read -r line _rest || [ -n "$line" ]; do
	line=${line%$'\r'}
	if [ -z "$line" ]; then
		continue
	fi
	case "$line" in
		*/*) ;;
		*)
			echo "Not a DOI (no '/'): $line" >&2
			continue
			;;
	esac
	# Same file naming as getPDF.py: every '/' of the DOI becomes '_'.
	# Splitting on '/' instead of at column 7 also supports prefixes that
	# are not exactly seven characters long.
	name=${line//\//_}
	if [ ! -f "Done/${name}.pdf" ]; then
		echo "$line" >> not.txt
	fi
done < "$1"

## getfiledoi.py
#! /usr/bin/env python
# Author: Hom, 2015.12.20
# Purpose: To find the doi number in first page of pdf
# Usage: python script.py pdffile [pdffile2 pdffile3 ...]
#
# Require pdfminer module
#    To install pdfminer: pip install pdfminer

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter

import sys,os,re

class stdmodel(object):
	'''In-memory stand-in for a file object.

	An instance is handed to pdfminer as its output file; everything
	written to it is accumulated in one string that can be fetched with
	get(), read() or str().'''

	# accumulated text (class-level default, replaced per instance on use)
	_str=""

	def write(self,line):
		'''Append `line` to the accumulated text, like file.write.'''
		self._str=self._str+line

	def read(self):
		'''Return everything written so far, like file.read.'''
		return self._str

	def get(self):
		'''Return everything written so far.'''
		return self._str

	def __str__(self):
		return self._str

	def reset(self):
		'''Forget everything written so far.'''
		self._str=""

	def open(self,*args):
		'''Accept (and ignore) file.open arguments; start with empty text.'''
		self.reset()

	def close(self):
		'''Mimic file.close; the accumulated text is discarded.'''
		self.reset()

#	def writeline(self,lines):
#		pass
#	def readline(self):
#		pass
#	def readlines(self):
#		pass

####### Setup for pdfminer ############

# Debug level, copied onto every pdfminer class that has a debug switch.
debug = 0
for _cls in (PDFDocument, PDFParser, CMapDB,
             PDFResourceManager, PDFPageInterpreter, PDFDevice):
	_cls.debug = debug
del _cls

# Page selection: only page 0, i.e. the first page.
pagenos = set([0])
pageno = 1

# Extracted text is collected in memory instead of going to sys.stdout.
outfp = stdmodel()

# Conversion options.
codec = 'utf-8'
showpageno = True
scale = 1
password = ''
maxpages = 0
rotation = 0
imagewriter = None
laparams = LAParams()

# The resource manager lets pdfminer reuse shared resources such as fonts
# and images so large objects are not allocated several times.
#### Caching is switched off: the default (True) caused problems here.
caching = False
rsrcmgr = PDFResourceManager(caching=caching)

# The converter that turns page content into text written to outfp.
device = TextConverter(
	rsrcmgr,
	outfp,
	codec=codec,
	laparams=laparams,
	imagewriter=imagewriter,
)

####### Functions for read doi ############

def GetFirstPage(fname):
	'''Return the text of the first page of the PDF file `fname`.

	The page is rendered through the module-level `device`, which writes
	into the shared `outfp` buffer; the buffer is emptied again before
	returning.  Errors raised while opening or parsing the file propagate
	to the caller.'''
	# Start clean even if an earlier call failed half-way through a page.
	outfp.reset()
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	try:
		# open() instead of the Python-2-only file(); `with` closes the
		# handle even when pdfminer raises.
		with open(fname, 'rb') as fp:
			for page in PDFPage.get_pages(fp, pagenos,
			                              maxpages=maxpages, password=password,
			                              caching=caching, check_extractable=True):
				page.rotate = (page.rotate+rotation) % 360
				interpreter.process_page(page)
		return outfp.get()
	finally:
		# Never leave text of this file behind for the next one.
		outfp.reset()

# Compiled once at import time so repeated getdoi() calls do not rebuild it.
# Raw string: "\S" and "\b" are regex escapes, not string escapes.
# Matches "10.", at least four digits, optional ".digits" groups, "/" and
# then any run of non-space characters other than " & ' < >.
doipattern=re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&'<>])\S)+)\b")

def getdoi(instr):
	'''Return the first DOI found in `instr`.

	instr: text to search; None or "" is allowed.
	Returns the DOI as a string, or "" when nothing matches.'''
	if not instr:
		return ""
	match=doipattern.search(instr)
	if (match):
		return match.group()
	return ""

def getfiledoi(fname):
	'''Look for a DOI on the first page of the PDF `fname`.

	Returns the DOI string, or "" when the page contains none.'''
	return getdoi(GetFirstPage(fname))

def doirenamefile(fname, doi):
	'''Rename the file `fname` after the DOI contained in `doi`.

	The new name is "<prefix>@<suffix>.pdf" in the directory of `fname`,
	where prefix/suffix are the parts of the DOI before and after its
	first "/".  Nothing happens when `doi` contains no DOI.'''
	realdoi=getdoi(doi)
	# Truthiness test: `is not ""` compared object identity, not content.
	if not realdoi:
		return
	folder=os.path.dirname(os.path.abspath(fname))
	dois=realdoi.split('/',1)
	# NOTE(review): a suffix that itself contains "/" ends up in a
	# sub-directory created by os.renames - confirm this is wanted.
	os.renames(fname,
		os.path.join(folder,dois[0]+"@"+dois[1]+".pdf"))

def mainusage():
	'''Print the command-line usage line and exit with status 100.'''
	# Parenthesised single argument: prints the same text on Python 2 and 3.
	print('usage: %s [-r] [-d] pdffile ...' % sys.argv[0])
	# sys.exit is always available; the bare exit() helper is only
	# installed by the site module.
	sys.exit(100)

if __name__=="__main__":
	import getopt
	try:
		(opts, args) = getopt.getopt(sys.argv[1:], 'rd')
	except getopt.GetoptError:
		mainusage()
	if not args: mainusage()

	given=set(k for (k, v) in opts)
	# -r : rename file
	rename_='-r' in given
	# -d : only output doi name
	onlydoi_='-d' in given

	# Perform for each file
	for fname in args:
		try:
			doi=getfiledoi(fname)
			if (rename_):
				doirenamefile(fname,doi)
		except Exception as exc:
			# One unreadable PDF must not abort the rest of the batch.
			sys.stderr.write("%s: failed (%s)\n" % (fname, exc))
			continue
		# -d prints the bare DOI; without -d the "Found:" line is printed
		# only when the file was not renamed.
		if (onlydoi_):
			print(doi)
		elif (not rename_):
			print(fname+" "+"Found: "+doi)

## getPDF.py
#! /usr/bin/env python
import sys,os,shutil
import urllib2 as ul2
import random

#doi="10.1021/ci960138u"
#my_headers = ['Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
#    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
#    'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
#    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
#    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)']

import subprocess

def find_pdf_link(lines):
	'''Return the PDF address embedded in a result page, or "" if none.

	lines: iterable of text lines of the page.
	Only the first line that contains "<iframe src", "sci-hub.io" and
	".pdf" is examined.  The address runs from "http" (or, when the line
	has no "http", from "sci-hub.io" with "http://" put in front) up to
	and including the first ".pdf".'''
	for line in lines:
		if "<iframe src" in line and "sci-hub.io" in line and ".pdf" in line:
			end=line.index(".pdf")+4
			if ("http" in line):
				return line[line.index("http"):end]
			return "http://"+line[line.index("sci-hub.io"):end]
	return ""

def have_pdf(doi, doiout):
	'''Tell whether the paper was already downloaded, accepted or filed.'''
	candidates=("./"+doiout+".pdf",
		"Done/"+doiout+".pdf",
		"Accept/"+doiout+".pdf",
		"Done/"+doi.split('/',1)[1]+"/")
	return any(os.path.exists(path) for path in candidates)

def fetch_pdf(doi, doiout):
	'''Download the PDF for `doi` to "<doiout>.pdf"; return True on success.'''
	pdflink=""
	# The generic address first, then the other store link.
	for link in ("http://sci-hub.io/"+doi,
			"http://pubs.acs.org.sci-hub.io/doi/abs/"+doi):
		web=ul2.urlopen(link)
		try:
			pdflink=find_pdf_link(web)
		finally:
			web.close()
		if (len(pdflink)>=5):
			break
	if (len(pdflink)<5):
		print(doi+" can't find!!!!")
		return False
	target=doiout+".pdf"
	# Argument list instead of a shell string: the address comes from a
	# downloaded page and must never be interpreted by a shell.
	if subprocess.call(["wget", pdflink, "-O", target]) != 0:
		# wget -O creates the file even when the download fails; remove
		# it so the DOI is not skipped as "already there" next time.
		if os.path.exists(target):
			os.remove(target)
		return False
	return True

if __name__ == "__main__":
	# Usage: getPDF.py doi_list_file   (one DOI per line)
	with open(sys.argv[1]) as f:
		for l in f:
			doi=l.strip().strip('/')
			if not doi:
				continue
			doisplit=doi.split('/')
			if (len(doisplit)<2):
				print("Error DOI:"+doi)
				continue
			doiout="_".join(doisplit)
			if have_pdf(doi, doiout):
				continue
			try:
				fetch_pdf(doi, doiout)
			except Exception as exc:
				# Best effort per DOI: report the problem, go on with the next.
				sys.stderr.write("%s: %s\n" % (doi, exc))

## modifyXML.py
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys

predoi="10.1021/"

pos1str='<notes><style face="normal" font="default" size="100%">'
pos1len=len(pos1str)
pos2str='Times Cited'
pos2len=len(pos2str)
doistr='<electronic-resource-num><style face="normal" font="default" size="100%">'
doistrlen=len(doistr)
# closing tags of the DOI field
doiendstr='</style></electronic-resource-num>'

substr=False

def processdoi(stri):
	'''Normalise every DOI field found in the XML fragment `stri`.

	The value between `doistr` and the closing electronic-resource-num
	tags is cut down to the part starting at "10.", lower-cased and
	stripped of surrounding whitespace (so "DOI: 10.1021/ABC " becomes
	"10.1021/abc").  Values without "10." are left as they are.
	Returns the fragment with all fields rewritten.'''
	pieces=[]
	cursor=0
	while True:
		start=stri.find(doistr,cursor)
		if (start == -1):
			break
		value_start=start+doistrlen
		# Search the end tag after the start tag, never before it.
		end=stri.find(doiendstr,value_start)
		if (end == -1):
			break
		value=stri[value_start:end]
		doi_at=value.find("10.")
		if (doi_at >= 0):
			value=value[doi_at:].lower().strip()
		pieces.append(stri[cursor:value_start])
		pieces.append(value)
		cursor=end
	pieces.append(stri[cursor:])
	return "".join(pieces)


if (__name__ == '__main__'):
	# Usage: modifyXML.py library.xml  ->  writes library_new.xml
	# Copies the XML, dropping the text between the start of each notes
	# field and the following "Times Cited", and normalising DOI fields.
	fname=sys.argv[1]
	fnamelist=os.path.splitext(fname)
	fwname=fnamelist[0]+"_new"+fnamelist[1]
	with open(fname) as fr:
		content=fr.read()

	with open(fwname,'w') as fw:
		prepos1=0; pos1=0; pos2=0

		while True:
			prepos1=pos1
			pos1=content.find(pos1str,pos2)
			if (pos1 == -1):
				break
			elif ((pos1-pos2)>50):
				fw.write(processdoi(content[pos2:pos1+pos1len]))
			else:
				# NOTE(review): when the next notes field starts within 50
				# characters of the last marker the copy restarts at the
				# previous notes field - intent unclear, kept as it was.
				fw.write(processdoi(content[prepos1:pos1+pos1len]))

			pos2=content.find(pos2str,pos1)
			if (pos2 == -1):
				# No "Times Cited" after this notes start: there is nothing
				# to strip, so keep the remainder.  (Carrying on with -1
				# used to drop everything but the last character.)
				pos2=pos1+pos1len
				break
			if (substr):
				# optional replacement text for the removed part
				fw.write(substr)
		#last part
		fw.write(processdoi(content[pos2:]))

## prepareDOI.py
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys

predoi="10."

def normalize_doi(line, prefix=predoi):
	'''Return the DOI contained in `line`, lower-cased and stripped.

	Everything before the first occurrence of `prefix` (for example a
	leading "DOI:") is dropped.  Returns "" when `line` has no `prefix`.'''
	start=line.find(prefix)
	if (start == -1):
		# find() == -1 used as a slice start would keep the last character.
		return ""
	return line[start:].lower().strip()

if (__name__ == '__main__'):
	# Usage: prepareDOI.py doi_list  ->  writes doi_list_new (one DOI per line)
	fname=sys.argv[1]
	fnamelist=os.path.splitext(fname)
	fwname=fnamelist[0]+"_new"+fnamelist[1]
	with open(fname) as fr:
		with open(fwname,'w') as fw:
			for line in fr:
				doi=normalize_doi(line)
				# Lines without a DOI are skipped instead of producing
				# blank or one-character lines.
				if doi:
					fw.write(doi+"\n")
	#! /usr/bin/env python
	# -- coding: utf8 --
	import os,sys

	predoi="10.1021/"
	pdfdir="Done/"

	# Closing tags that end the DOI field of a record in the EndNote XML export.
	doiendtag="</style></electronic-resource-num>"

	def attach_pdfs(text, pdf_root=pdfdir, doi_prefix=predoi):
		'''Link downloaded PDFs into EndNote XML text and return the new text.

		For every "</urls>" tag that is not directly preceded by a pdf-urls
		element, the DOI that follows it (the text from `doi_prefix` up to the
		closing electronic-resource-num tags) is looked up on disk as
		"<pdf_root><prefix>_<suffix>.pdf".  When that file exists it is moved
		to "<pdf_root><suffix>/<prefix>_<suffix>.pdf" and a matching
		internal-pdf:// link is inserted just before "</urls>".

		text:       whole content of the EndNote XML file
		pdf_root:   directory (including trailing separator) holding the PDFs
		doi_prefix: leading part of the DOIs to handle
		Returns the XML text; records without a PDF are copied unchanged.'''
		out=[]
		cursor=0
		while True:
			urls_end=text.find("</urls>",cursor)
			if urls_end == -1:
				break
			out.append(text[cursor:urls_end])
			doi_end=text.find(doiendtag,urls_end)
			if doi_end == -1:
				# No DOI field follows this </urls>; copy the rest verbatim.
				cursor=urls_end
				break
			# Clamp the window start: a negative start would be counted from
			# the end of the string and silently miss an existing link.
			has_link=text.find("pdf-urls>",max(0,urls_end-50),urls_end) != -1
			doi_start=text.find(doi_prefix,urls_end,doi_end)
			if not has_link and doi_start != -1:
				parts=text[doi_start:doi_end].strip().split('/')
				if len(parts) >= 2:
					stem=parts[0]+"_"+parts[1]
					source=pdf_root+stem+".pdf"
					if os.path.exists(source):
						try:
							# os.renames creates the per-paper directory itself.
							os.renames(source,pdf_root+parts[1]+os.sep+stem+".pdf")
						except OSError as exc:
							# Keep going, but say why this record got no link.
							sys.stderr.write("Cannot move %s: %s\n" % (source,exc))
						else:
							out.append("<pdf-urls><url>internal-pdf://"+parts[1]+"/"+stem+".pdf"+"</url></pdf-urls>")
			out.append(text[urls_end:doi_end])
			cursor=doi_end
		#last part
		out.append(text[cursor:])
		return "".join(out)

	if (__name__ == '__main__'):
		# Usage: addPDF.py library.xml  ->  writes library_new.xml
		fname=sys.argv[1]
		fnamelist=os.path.splitext(fname)
		fwname=fnamelist[0]+"_new"+fnamelist[1]
		with open(fname) as fr:
			content=fr.read()
		linked=attach_pdfs(content)
		with open(fwname,'w') as fw:
			fw.write(linked)
	#! /bin/bash
	# Check a DOI list against the Done/ directory and write every DOI whose
	# PDF is missing to not.txt (one DOI per line).
	# Usage: checkdone.sh doi_list_file
	if [ $# -lt 1 ] || [ ! -f "$1" ]; then
		echo "usage: $0 doi_list_file" >&2
		exit 1
	fi
	# Normalise line endings in place when dos2unix is installed; carriage
	# returns are also stripped while reading, so the tool is optional.
	if command -v dos2unix >/dev/null 2>&1; then
		dos2unix "$1"
	fi
	# Truncate the report (`echo > not.txt` used to leave a blank first line).
	: > not.txt
	# `|| [ -n "$line" ]` keeps a last line that has no trailing newline.
	while read -r line _rest || [ -n "$line" ]; do
		line=${line%$'\r'}
		if [ -z "$line" ]; then
			continue
		fi
		case "$line" in
			*/*) ;;
			*)
				echo "Not a DOI (no '/'): $line" >&2
				continue
				;;
		esac
		# Same file naming as getPDF.py: every '/' of the DOI becomes '_'.
		name=${line//\//_}
		if [ ! -f "Done/${name}.pdf" ]; then
			echo "$line" >> not.txt
		fi
	done < "$1"
	#! /usr/bin/env python
	# Author: Hom, 2015.12.20
	# Purpose: To find the doi number in first page of pdf
	# Usage: python script.py pdffile [pdffile2 pdffile3 ...]
	#
	# Require pdfminer module
	# To install pdfminer: pip install pdfminer

	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.pdfdevice import PDFDevice, TagExtractor
	from pdfminer.pdfpage import PDFPage
	from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
	from pdfminer.cmapdb import CMapDB
	from pdfminer.layout import LAParams
	from pdfminer.image import ImageWriter

	import sys,os,re

	class stdmodel(object):
		'''In-memory stand-in for a file object.

		An instance is handed to pdfminer as its output file; everything
		written to it is accumulated in one string that can be fetched with
		get(), read() or str().'''

		# accumulated text (class-level default, replaced per instance on use)
		_str=""

		def __str__(self):
			return self._str

		def reset(self):
			'''Forget everything written so far.'''
			self._str=""

		def get(self):
			'''Return everything written so far.'''
			return self._str

		def write(self,line):
			'''Append `line` to the accumulated text, like file.write.'''
			self._str+=line

		def open(self,*args):
			'''Accept (and ignore) file.open arguments; start with empty text.'''
			self._str=""

		def close(self):
			'''Mimic file.close; the accumulated text is discarded.'''
			self._str=""

		def read(self):
			'''Return everything written so far, like file.read.'''
			return self._str

	# def writeline(self,lines):
	# pass
	# def readline(self):
	# pass
	# def readlines(self):
	# pass

	####### Setup for pdfminer ############

	# Debug level, copied onto every pdfminer class that has a debug switch.
	debug = 0
	for _cls in (PDFDocument, PDFParser, CMapDB,
	             PDFResourceManager, PDFPageInterpreter, PDFDevice):
		_cls.debug = debug
	del _cls

	# Page selection: only page 0, i.e. the first page.
	pagenos = set([0])
	pageno = 1

	# Extracted text is collected in memory instead of going to sys.stdout.
	outfp = stdmodel()

	# Conversion options.
	codec = 'utf-8'
	showpageno = True
	scale = 1
	password = ''
	maxpages = 0
	rotation = 0
	imagewriter = None
	laparams = LAParams()

	# The resource manager lets pdfminer reuse shared resources such as fonts
	# and images so large objects are not allocated several times.
	#### Caching is switched off: the default (True) caused problems here.
	caching = False
	rsrcmgr = PDFResourceManager(caching=caching)

	# The converter that turns page content into text written to outfp.
	device = TextConverter(
		rsrcmgr,
		outfp,
		codec=codec,
		laparams=laparams,
		imagewriter=imagewriter,
	)

	####### Functions for read doi ############

	def GetFirstPage(fname):
		'''Return the text of the first page of the PDF file `fname`.

		The page is rendered through the module-level `device`, which writes
		into the shared `outfp` buffer; the buffer is emptied again before
		returning.  Errors raised while opening or parsing the file propagate
		to the caller.'''
		# Start clean even if an earlier call failed half-way through a page.
		outfp.reset()
		interpreter = PDFPageInterpreter(rsrcmgr, device)
		try:
			# open() instead of the Python-2-only file(); `with` closes the
			# handle even when pdfminer raises.
			with open(fname, 'rb') as fp:
				for page in PDFPage.get_pages(fp, pagenos,
				                              maxpages=maxpages, password=password,
				                              caching=caching, check_extractable=True):
					page.rotate = (page.rotate+rotation) % 360
					interpreter.process_page(page)
			return outfp.get()
		finally:
			# Never leave text of this file behind for the next one.
			outfp.reset()

	# Compiled once so repeated getdoi() calls do not rebuild the pattern.
	# Raw string: "\S" and "\b" are regex escapes, not string escapes.
	# Matches "10.", at least four digits, optional ".digits" groups, "/" and
	# then any run of non-space characters other than " & ' < >.
	doipattern=re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&'<>])\S)+)\b")

	def getdoi(instr):
		'''Return the first DOI found in `instr`.

		instr: text to search; None or "" is allowed.
		Returns the DOI as a string, or "" when nothing matches.'''
		if not instr:
			return ""
		match=doipattern.search(instr)
		if (match):
			return match.group()
		return ""

	def getfiledoi(fname):
		'''Look for a DOI on the first page of the PDF `fname`.

		Returns the DOI string, or "" when the page contains none.'''
		outs=GetFirstPage(fname)
		return getdoi(outs)

	def doirenamefile(fname, doi):
		'''Rename the file `fname` after the DOI contained in `doi`.

		The new name is "<prefix>@<suffix>.pdf" in the directory of `fname`,
		where prefix/suffix are the parts of the DOI before and after its
		first "/".  Nothing happens when `doi` contains no DOI.'''
		realdoi=getdoi(doi)
		# Truthiness test: `is not ""` compared object identity, not content.
		if not realdoi:
			return
		folder=os.path.dirname(os.path.abspath(fname))
		dois=realdoi.split('/',1)
		os.renames(fname,
			os.path.join(folder,dois[0]+"@"+dois[1]+".pdf"))

	def mainusage():
		'''Print the command-line usage line and exit with status 100.'''
		# Parenthesised single argument: prints the same text on Python 2 and 3.
		print('usage: %s [-r] [-d] pdffile ...' % sys.argv[0])
		# sys.exit is always available; the bare exit() helper is only
		# installed by the site module.
		sys.exit(100)

	if __name__=="__main__":
		import getopt
		try:
			(opts, args) = getopt.getopt(sys.argv[1:], 'rd')
		except getopt.GetoptError:
			mainusage()
		if not args: mainusage()

		given=set(k for (k, v) in opts)
		# -r : rename file
		rename_='-r' in given
		# -d : only output doi name
		onlydoi_='-d' in given

		# Perform for each file
		for fname in args:
			try:
				doi=getfiledoi(fname)
				if (rename_):
					doirenamefile(fname,doi)
			except Exception as exc:
				# One unreadable PDF must not abort the rest of the batch.
				sys.stderr.write("%s: failed (%s)\n" % (fname, exc))
				continue
			# -d prints the bare DOI; without -d the "Found:" line is printed
			# only when the file was not renamed.
			if (onlydoi_):
				print(doi)
			elif (not rename_):
				print(fname+" "+"Found: "+doi)
	#! /usr/bin/env python
	import sys,os,shutil
	import urllib2 as ul2
	import random

	#doi="10.1021/ci960138u"
	#my_headers = ['Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
	# 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
	# 'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
	# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
	# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)']

	import subprocess

	def find_pdf_link(lines):
		'''Return the PDF address embedded in a result page, or "" if none.

		lines: iterable of text lines of the page.
		Only the first line that contains "<iframe src", "sci-hub.io" and
		".pdf" is examined.  The address runs from "http" (or, when the line
		has no "http", from "sci-hub.io" with "http://" put in front) up to
		and including the first ".pdf".'''
		for line in lines:
			if "<iframe src" in line and "sci-hub.io" in line and ".pdf" in line:
				end=line.index(".pdf")+4
				if ("http" in line):
					return line[line.index("http"):end]
				return "http://"+line[line.index("sci-hub.io"):end]
		return ""

	def have_pdf(doi, doiout):
		'''Tell whether the paper was already downloaded, accepted or filed.'''
		candidates=("./"+doiout+".pdf",
			"Done/"+doiout+".pdf",
			"Accept/"+doiout+".pdf",
			"Done/"+doi.split('/',1)[1]+"/")
		return any(os.path.exists(path) for path in candidates)

	def fetch_pdf(doi, doiout):
		'''Download the PDF for `doi` to "<doiout>.pdf"; return True on success.'''
		pdflink=""
		# The generic address first, then the other store link.
		for link in ("http://sci-hub.io/"+doi,
				"http://pubs.acs.org.sci-hub.io/doi/abs/"+doi):
			web=ul2.urlopen(link)
			try:
				pdflink=find_pdf_link(web)
			finally:
				web.close()
			if (len(pdflink)>=5):
				break
		if (len(pdflink)<5):
			print(doi+" can't find!!!!")
			return False
		target=doiout+".pdf"
		# Argument list instead of a shell string: the address comes from a
		# downloaded page and must never be interpreted by a shell.
		if subprocess.call(["wget", pdflink, "-O", target]) != 0:
			# wget -O creates the file even when the download fails; remove
			# it so the DOI is not skipped as "already there" next time.
			if os.path.exists(target):
				os.remove(target)
			return False
		return True

	if __name__ == "__main__":
		# Usage: getPDF.py doi_list_file   (one DOI per line)
		with open(sys.argv[1]) as f:
			for l in f:
				doi=l.strip().strip('/')
				if not doi:
					continue
				doisplit=doi.split('/')
				if (len(doisplit)<2):
					print("Error DOI:"+doi)
					continue
				doiout="_".join(doisplit)
				if have_pdf(doi, doiout):
					continue
				try:
					fetch_pdf(doi, doiout)
				except Exception as exc:
					# Best effort per DOI: report the problem, go on with the next.
					sys.stderr.write("%s: %s\n" % (doi, exc))