Skip to content

Instantly share code, notes, and snippets.

@platinhom
Last active November 28, 2023 07:32
Show Gist options
  • Save platinhom/07475ec4efc514dd90d8 to your computer and use it in GitHub Desktop.
Save platinhom/07475ec4efc514dd90d8 to your computer and use it in GitHub Desktop.
DOI-endnote-process

Scripts for processing an EndNote library.

  • modifyXML.py: Strips junk characters from the "Notes" field while keeping the "Times Cited" information, and normalizes malformed DOIs (removes a leading "DOI:" and converts to lower case).
  • prepareDOI.py: Pre-processes the DOI numbers for the getPDF.py script.
  • getPDF.py: Searches Sci-Hub for the PDF of each DOI record taken from the EndNote XML.
    Save the DOI numbers in a file (one DOI per line) and pass that file to the script as input.
    Each PDF is saved as "10.1021_ci111111a.pdf" in the current directory. You then have to move the valid files to the "Done" directory and run addPDF.py.
  • addPDF.py: Put the PDFs that were found in the "Done" directory; PDFs that should not be searched for again go in "Accept".
    Give an EndNote XML file as input. Each PDF is moved into a directory named after the paper part of its DOI, and a new XML file is generated.
    Move all the directories in "Done" to "Endnote library.Data/PDF" and import the new XML file. You may want to delete the old records first.
  • checkdone.sh: Takes the getPDF.py input file (the list of DOI numbers) as input
    and generates a "not.txt" file listing the DOIs that were not found.
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys
predoi="10.1021/"
pdfdir="Done/"


def attach_pdf_links(text, pdf_root=pdfdir, doi_prefix=predoi):
    '''Attach downloaded PDFs to the records of an EndNote XML string.

    For every "</urls>" tag the DOI is read from the text between that tag
    and the next "</style></electronic-resource-num>".  When the file
    "<prefix>_<suffix>.pdf" exists in pdf_root it is moved to
    "<suffix>/<prefix>_<suffix>.pdf" below pdf_root and a matching
    "<pdf-urls>" element is inserted right before "</urls>".

    text       -- content of the EndNote XML file
    pdf_root   -- directory that holds the downloaded PDFs
    doi_prefix -- start of the DOIs to look for (e.g. "10.1021/")

    Returns the new XML text.  Side effect: PDFs are moved on disk.
    '''
    urls_close = "</urls>"
    doi_close = "</style></electronic-resource-num>"
    pieces = []
    cursor = 0
    while True:
        urls_at = text.find(urls_close, cursor)
        if urls_at == -1:
            break
        pieces.append(text[cursor:urls_at])
        doi_end = text.find(doi_close, urls_at)
        if doi_end == -1:
            # No DOI element follows: keep the remainder untouched.
            cursor = urls_at
            break
        # A "pdf-urls>" tag shortly before "</urls>" means the record already
        # has a PDF attached.  max() keeps the window from wrapping around to
        # the end of the string when "</urls>" is near the start.
        if text.find("pdf-urls>", max(0, urls_at - 50), urls_at) == -1:
            doi_at = text.find(doi_prefix, urls_at, doi_end)
            parts = text[doi_at:doi_end].split('/') if doi_at != -1 else []
            if len(parts) >= 2:
                suffix = parts[1]
                pdf_name = parts[0] + "_" + suffix + ".pdf"
                source = os.path.join(pdf_root, pdf_name)
                if os.path.exists(source):
                    try:
                        # os.renames creates the target directory itself.
                        os.renames(source,
                                   os.path.join(pdf_root, suffix, pdf_name))
                    except OSError as err:
                        # Leave the record without a link instead of pointing
                        # at a file that was not moved.
                        sys.stderr.write("cannot move %s: %s\n" % (source, err))
                    else:
                        pieces.append("<pdf-urls><url>internal-pdf://" + suffix
                                      + "/" + pdf_name + "</url></pdf-urls>")
        pieces.append(text[urls_at:doi_end])
        cursor = doi_end
    # last part
    pieces.append(text[cursor:])
    return "".join(pieces)


if (__name__ == '__main__'):
    # Usage: addPDF.py library.xml  ->  writes library_new.xml
    fname=sys.argv[1]
    fnamelist=os.path.splitext(fname)
    fwname=fnamelist[0]+"_new"+fnamelist[1]
    with open(fname) as fr:
        content=fr.read()
    with open(fwname,'w') as fw:
        fw.write(attach_pdf_links(content))
#! /bin/bash
# Check doi list.. generate not.txt for files not done
# Need doi list input file
if [ $# -lt 1 ]; then
    echo "usage: $0 doi-list-file" >&2
    exit 1
fi
# Convert the list to Unix line endings in place before reading it.
dos2unix "$1"
# Start from an empty report (a bare `echo >` would leave a blank first line).
: > not.txt
# Read line by line instead of word-splitting `cat` output, so DOIs that
# contain glob characters are not expanded by the shell.
while read -r line || [ -n "$line" ]; do
    if [ -z "$line" ]; then
        continue
    fi
    # Same naming as getPDF.py: every "/" of the DOI becomes "_".
    pdf="Done/${line//\//_}.pdf"
    if [ ! -f "$pdf" ]; then
        printf '%s\n' "$line" >> not.txt
    fi
done < "$1"
#! /usr/bin/env python
# Author: Hom, 2015.12.20
# Purpose: To find the doi number in first page of pdf
# Usage: python script.py pdffile [pdffile2 pdffile3 ...]
#
# Require pdfminer module
# To install pdfminer: pip install pdfminer
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import sys,os,re
class stdmodel(object):
    '''Minimal in-memory stand-in for a file object.

    Everything passed to write() is appended to one string, which can be
    fetched with get(), read() or str().  open(), close() and reset() all
    discard the collected text.
    '''

    # Text collected so far.
    _str = ""

    def _discard(self):
        '''Throw away the collected text.'''
        self._str = ""

    def __str__(self):
        return self._str

    def get(self):
        '''Return the collected text.'''
        return self._str

    def read(self):
        '''File-like read: return the collected text.'''
        return self._str

    def write(self, line):
        '''File-like write: append line to the collected text.'''
        self._str = self._str + line

    def reset(self):
        '''Discard the collected text.'''
        self._discard()

    def open(self, *args):
        '''File-like open: arguments are ignored, the text is discarded.'''
        self._discard()

    def close(self):
        '''File-like close: the text is discarded.'''
        self._discard()

    # writeline / readline / readlines are not implemented.
####### Setup for pdfminer ############
# One debug value, applied to every pdfminer class used by this script.
debug = 0
for _component in (PDFDocument, PDFParser, CMapDB,
                   PDFResourceManager, PDFPageInterpreter, PDFDevice):
    _component.debug = debug

# Page selection: only page index 0 (the first page) is requested.
pagenos = set([0])
pageno = 1
maxpages = 0
rotation = 0
password = ''

# Output: text is collected in a stdmodel buffer rather than sys.stdout.
outfp = stdmodel()
codec = 'utf-8'
showpageno = True
scale = 1
imagewriter = None
laparams = LAParams()

# ResourceManager facilitates reuse of shared resources such as fonts and
# images so that large objects are not allocated multiple times.
#### Leaving caching at its default (True) causes some problems here.
caching = False
rsrcmgr = PDFResourceManager(caching=caching)

# The converter that turns page content into text written to outfp.
device = TextConverter(
    rsrcmgr,
    outfp,
    codec=codec,
    laparams=laparams,
    imagewriter=imagewriter,
)
####### Functions for read doi ############
def GetFirstPage(fname):
    '''Return the text of the first page of a PDF.

    fname -- path of the PDF file

    The page is rendered through the module-level `device`, which writes
    into the module-level `outfp` buffer; that buffer is emptied again
    before returning so the next call starts clean.
    '''
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # open() instead of the Python-2-only file() builtin; the `with`
        # block closes the handle even if pdfminer raises.
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
        return outfp.get()
    finally:
        # Also reset after a failure, so partial text of a broken PDF cannot
        # leak into the result for the next file.
        outfp.reset()
# Compiled once at import time so it is not rebuilt for every call.
# Raw string: "\S" and "\b" are regex escapes, not string escapes.
doipattern=re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&'<>])\S)+)\b")
def getdoi(instr):
    '''Return the first DOI found in instr, or "" when there is none.

    A DOI is "10." followed by at least four digits, a "/" and a run of
    non-space characters that excludes the quote characters, "&", "<" and ">".
    '''
    match=doipattern.search(instr)
    if match:
        return match.group()
    return ""
def getfiledoi(fname):
    '''Return the DOI found in the first page of the PDF fname.

    The result is "" when the first page contains no DOI.
    '''
    return getdoi(GetFirstPage(fname))
def doirenamefile(fname, doi):
    '''Rename fname to "<prefix>@<suffix>.pdf" in its own directory.

    fname -- path of the PDF file
    doi   -- string containing the DOI; it is validated through getdoi()

    Nothing is renamed when doi contains no valid DOI.
    '''
    realdoi=getdoi(doi)
    # Truthiness test instead of `is not ""`, which compared object identity.
    if not realdoi:
        return
    folder=os.path.dirname(os.path.abspath(fname))
    dois=realdoi.split('/',1)
    # NOTE: os.renames creates missing intermediate directories, so a "/"
    # inside the DOI suffix ends up as a sub-directory.
    os.renames(fname,
               folder+os.sep+dois[0]+"@"+dois[1]+".pdf")
def mainusage():
    '''Print the command-line usage and exit with status 100.'''
    # Parenthesised single argument: valid as statement and as function call.
    print('usage: %s [-r] [-d] pdffile ...' % sys.argv[0])
    # sys.exit rather than exit(), which is only injected by the site module.
    sys.exit(100)
if __name__=="__main__":
    import getopt

    # Options: -r renames each file after its DOI, -d prints only the DOI.
    try:
        (opts, args) = getopt.getopt(sys.argv[1:], 'rd')
    except getopt.GetoptError:
        mainusage()
    if not args:
        mainusage()

    flags = [k for (k, v) in opts]
    rename_ = '-r' in flags
    onlydoi_ = '-d' in flags

    # Perform for each file
    for fname in args:
        doi = getfiledoi(fname)
        if rename_:
            doirenamefile(fname, doi)
        if onlydoi_:
            print(doi)
        elif not rename_:
            print(fname + " " + "Found: " + doi)
        # NOTE(review): with -r and without -d nothing is printed for the
        # file -- confirm this is intended.
#! /usr/bin/env python
import sys,os,shutil
import urllib2 as ul2
import random
#doi="10.1021/ci960138u"
#my_headers = ['Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
# 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
# 'Mozilla/4.0 (compatible; GoogleToolbar 5.0.2124.2070; Windows 6.0; MSIE 8.0.6001.18241)',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; Sleipnir/2.9.8)']
import subprocess


def extract_pdf_link(lines, host="sci-hub.io"):
    '''Return the PDF URL of the first matching <iframe> line, or "".

    lines -- iterable of HTML lines (e.g. a urlopen() response)
    host  -- host name that must appear in the iframe line

    A line matches when it contains "<iframe src", host and ".pdf".  The
    URL runs from "http" (or, when the line has no scheme, from host with
    "http://" prepended) up to and including the first ".pdf".
    '''
    for line in lines:
        if "<iframe src" in line and host in line and ".pdf" in line:
            end = line.index(".pdf") + 4
            if "http" in line:
                return line[line.index("http"):end]
            return "http://" + line[line.index(host):end]
    return ""


def find_pdf_link(doi):
    '''Look up doi on the two Sci-Hub entry pages; return the PDF URL or "".'''
    pages = ("http://sci-hub.io/" + doi,
             # Another store link..
             "http://pubs.acs.org.sci-hub.io/doi/abs/" + doi)
    for page in pages:
        web = ul2.urlopen(page)
        try:
            pdflink = extract_pdf_link(web)
        finally:
            web.close()
        if len(pdflink) >= 5:
            return pdflink
    return ""


if __name__ == "__main__":
    # Usage: getPDF.py doi-list-file   (one DOI per line)
    with open(sys.argv[1]) as f:
        for l in f:
            doi = l.strip().strip('/')
            doisplit = doi.split('/')
            if len(doisplit) < 2:
                print("Error DOI:" + doi)
                continue
            doiout = "_".join(doisplit)
            # Skip DOIs whose PDF was already downloaded, accepted or filed.
            known = ("./" + doiout + ".pdf",
                     "Done/" + doiout + ".pdf",
                     "Accept/" + doiout + ".pdf",
                     "Done/" + doi.split('/', 1)[1] + "/")
            if any(os.path.exists(path) for path in known):
                continue
            try:
                pdflink = find_pdf_link(doi)
                if len(pdflink) < 5:
                    print(doi + " can't find!!!!")
                else:
                    # Argument list instead of a shell string: the URL comes
                    # from a remote page and must not be parsed by a shell.
                    subprocess.call(["wget", pdflink, "-O", doiout + ".pdf"])
            except Exception as err:
                # Best effort: report the failure and go on with the next DOI
                # (Ctrl-C is no longer swallowed as it was by a bare except).
                sys.stderr.write("%s: download failed: %s\n" % (doi, err))
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys
predoi="10.1021/"
# Opening tag of the notes field and its length.
pos1str='<notes><style face="normal" font="default" size="100%">'
pos1len=len(pos1str)
# Marker of the part of the notes that is kept.
pos2str='Times Cited'
pos2len=len(pos2str)
# Opening tag of the DOI field and its length.
doistr='<electronic-resource-num><style face="normal" font="default" size="100%">'
doistrlen=len(doistr)
# Optional text written in place of the removed notes content (False = none).
substr=False
def processdoi(stri):
    '''Normalize the first DOI field found in an XML fragment.

    The field content is cut down to the part starting at "10.", lower-cased
    and stripped of surrounding whitespace (so a leading "DOI:" disappears).

    stri -- XML fragment

    Returns the fragment with the cleaned DOI, or stri unchanged when it
    has no complete DOI field or the field holds no "10.".
    '''
    pos1=stri.find(doistr)
    if pos1 == -1:
        return stri
    start=pos1+doistrlen
    # Search the closing tag after the opening one, so a closing tag that
    # happens to precede it in the fragment is not picked up.
    pos2=stri.find('</style></electronic-resource-num>',start)
    if pos2 == -1:
        return stri
    dois=stri[start:pos2]
    pos3=dois.find("10.")
    if pos3 < 0:
        return stri
    newdoi=dois[pos3:].lower().strip()
    return stri[:start]+newdoi+stri[pos2:]
if (__name__ == '__main__'):
    # Usage: modifyXML.py library.xml  ->  writes library_new.xml
    #
    # For every notes field, the text between the opening notes tag and the
    # following 'Times Cited' is dropped; everything else is copied, with the
    # DOI fields cleaned by processdoi().
    fname=sys.argv[1]
    fnamelist=os.path.splitext(fname)
    fwname=fnamelist[0]+"_new"+fnamelist[1]
    with open(fname) as fr:
        content=fr.read()
    with open(fwname,'w') as fw:
        prepos1=0; pos1=0; pos2=0
        while True:
            prepos1=pos1
            pos1=content.find(pos1str,pos2)
            if pos1 == -1:
                break
            if (pos1-pos2)>50:
                fw.write(processdoi(content[pos2:pos1+pos1len]))
            else:
                # NOTE(review): when the next notes tag is within 50
                # characters of the last marker, copying restarts at the
                # previous notes tag -- intent unclear, kept as it was.
                fw.write(processdoi(content[prepos1:pos1+pos1len]))
            pos2=content.find(pos2str,pos1)
            if pos2 == -1:
                # No 'Times Cited' follows: keep the rest of the file as is.
                # (Continuing with -1 used to cut the output down to the
                # last character of the input.)
                pos2=pos1+pos1len
                break
            if (substr):
                fw.write(substr)
        #last part
        fw.write(processdoi(content[pos2:]))
#! /usr/bin/env python
# -*- coding: utf8 -*-
import os,sys
predoi="10."


def normalize_doi(line, marker=predoi):
    '''Return the lower-cased DOI contained in line, or "" if there is none.

    line   -- one line of the DOI list, e.g. "DOI: 10.1021/CI960138U"
    marker -- text every DOI starts with

    Everything before the first marker is dropped and surrounding
    whitespace is stripped.
    '''
    start = line.find(marker)
    if start == -1:
        # Without this check line[-1:] would pass the last character of the
        # line off as a DOI.
        return ""
    return line[start:].lower().strip()


if (__name__ == '__main__'):
    # Usage: prepareDOI.py dois.txt  ->  writes dois_new.txt
    fname=sys.argv[1]
    fnamelist=os.path.splitext(fname)
    fwname=fnamelist[0]+"_new"+fnamelist[1]
    with open(fname) as fr:
        with open(fwname,'w') as fw:
            for line in fr:
                # One output line per input line; lines without a DOI
                # become empty lines.
                fw.write(normalize_doi(line)+"\n")
@brainu
Copy link

brainu commented Oct 29, 2017

how to use these scripts? thanks!

@Brispark
Copy link

Brispark commented Nov 5, 2017

How do you use these scripts?

@srvaclmax
Copy link

I think the URLs are outdated. A colleague of mine would be interested in this script; a tutorial on how to use it would also be nice.
Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment