Skip to content

Instantly share code, notes, and snippets.

@sandyUni
Last active October 20, 2020 07:22
Show Gist options
  • Save sandyUni/8346a5299b8a0e4da222471b5046e208 to your computer and use it in GitHub Desktop.
Save sandyUni/8346a5299b8a0e4da222471b5046e208 to your computer and use it in GitHub Desktop.
A PDF merge tool that automatically removes annotations and merges papers
# -*- coding:utf-8 -*-
# Human-readable description of the tool. It is also handed to argparse as the
# parser description (shown by --help with RawTextHelpFormatter, i.e. verbatim).
# It is a raw string, so the backslash in the author line is emitted literally.
Despriptor=r'''
An pdf merge tool to auto remove annotations and merge papers
work-flow:
1. Export biblatex with attachment files throgh Zotero(https://www.zotero.org)
2. cd to the export root folder, and excute the command `mergeFile.py -t yourtitle`
requirements:
0.python version: tested with 3.7.3, win64
1.python packages
bibtexparser, PyPDF2
2.latex package: tested with texlive
Author: Lai Jun(mfary\#139.com)
'''
import bibtexparser
import glob
import argparse
import contextlib
import os
import re
import subprocess as sub
import uuid

PDF_MIME = 'application/pdf'
OUTPUT_PDF = 'mergedFile.pdf'
DEFAULT_TITLE = 'Paper Collections for:'
# The author line on the cover stops growing once it exceeds this many chars.
AUTHOR_LIST_LIMIT = 1024

# LaTeX wrapper for the document that concatenates all papers via pdfpages.
BODY_PREAMBLE = r'''
\documentclass{article}
\usepackage[final]{pdfpages}
\usepackage{fancyhdr}
\fancyhf{}
\usepackage[a4paper,bindingoffset=0.2in,%
left=1in,right=1in,top=1in,bottom=1in,%
footskip=.25in]{geometry}
\fancypagestyle{plain}{%
\fancyhf{} % clear all header and footer fields
\fancyfoot[RO,RE]{{\vspace*{0.5\baselineskip}\thepage}} %RO=right odd, RE=right even
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}}
\begin{document}
'''
BODY_END = r'''
\end{document}'''

# Cover page template. The words `yourtitle`, `yourtableofcontent` and
# `yourAuthors` are placeholders filled in by render_cover().
COVER_TEMPLATE = r'''
\documentclass[12pt,twoside,a4paper]{report}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{array}
\newcolumntype{P}[1]{>{\raggedright\arraybackslash}p{#1}}
\usepackage{longtable}
\usepackage{geometry}
\usepackage{marginnote}
\usepackage{color}
\newcommand*{\titleTH}{\begingroup% T&H Typography
\raggedleft
\vspace*{\baselineskip}
%{\bfseries The Big Book of}\\[\baselineskip]
{\textcolor{red}{\Huge \textbf{yourtitle}}}\\\vspace*{3\baselineskip}
\raggedright
{yourtableofcontent} \\[0.167\textheight]
{yourAuthors}\par
\raggedleft
\vfill{\Large \today }\par
\vspace*{3\baselineskip}\endgroup}
\begin{document}
\thispagestyle{empty}
\titleTH
\end{document}
'''


def new_temp_basename():
    """Return a random, ASCII-only path stem in the current directory.

    The stem has no extension; callers append '.tex', '.pdf', etc.
    """
    return './' + uuid.uuid4().hex


def split_unescaped(text, separator):
    """Split *text* on *separator*, ignoring occurrences preceded by a backslash."""
    return re.split(r'(?<!\\)' + re.escape(separator), text)


def select_pdf_attachment(file_field):
    """Pick one attachment from a bib entry's ``file`` field.

    The field is read as ``description:path:mimetype`` records joined by ';'.
    The first record mentioning 'application/pdf' wins; if there is none, the
    first record is used.

    Returns:
        ``(path, mimetype)`` of the chosen record, or ``(None, None)`` when the
        record does not have at least three ':'-separated components.
    """
    records = split_unescaped(file_field, ';')
    chosen = records[0]
    for record in records:
        if PDF_MIME in record:
            chosen = record
            break
    # NOTE(review): assumes the exporter backslash-escapes ':' and ';' inside a
    # component (e.g. a title such as "Foo\: bar") -- confirm for your Zotero.
    parts = [re.sub(r'\\([:;])', r'\1', part)
             for part in split_unescaped(chosen, ':')]
    if len(parts) < 3:
        return None, None
    return parts[1], parts[2]


def build_table_of_contents(titles_and_pages):
    r"""Build the cover's table of contents as a single LaTeX string.

    Args:
        titles_and_pages: list of ``{'title': str, 'pages': int}`` dicts in
            document order; ``pages == 0`` means the paper has no PDF.

    Returns:
        One line per paper joined by ``\\``. Papers with pages get a
        ``:first--last`` suffix; numbering starts at 1 and advances by each
        paper's page count.
    """
    lines = []
    first_page = 1
    for item in titles_and_pages:
        line = item['title']
        pages = item['pages']
        if pages:
            line = line + ':' + str(first_page) + '--' + str(first_page + pages - 1)
            first_page += pages
        lines.append(line)
    return '\\\\'.join(lines)


def join_authors(authors, limit=AUTHOR_LIST_LIMIT):
    """Concatenate author strings as 'a; b; ', stopping once *limit* chars are exceeded."""
    joined = ''
    for author in authors:
        joined += author + '; '
        if len(joined) > limit:
            break
    return joined


def render_cover(title, table_of_contents, author_list):
    """Fill the three placeholders of COVER_TEMPLATE and return the LaTeX source.

    All placeholders are substituted in a single pass, so placeholder words
    that happen to occur inside the inserted values are left untouched.
    """
    values = {
        'yourtitle': title,
        'yourtableofcontent': table_of_contents,
        'yourAuthors': author_list,
    }
    return re.sub(r'yourtitle|yourtableofcontent|yourAuthors',
                  lambda match: values[match.group(0)],
                  COVER_TEMPLATE)


def resave_pdf(source_path, target_path):
    """Copy every page of *source_path* into a new PDF at *target_path*.

    Returns:
        The number of pages copied.
    """
    # Imported lazily so this module can be imported without PyPDF2 installed
    # (the original only imported it under the __main__ guard).
    from PyPDF2 import PdfFileReader, PdfFileWriter

    with open(source_path, 'rb') as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()
        page_count = reader.getNumPages()
        for index in range(page_count):
            writer.addPage(reader.getPage(index))
        # Write while the source is still open: page data is read on demand.
        with open(target_path, 'wb') as target:
            writer.write(target)
    return page_count


def merge_pdfs(pdf_paths, output_path):
    """Concatenate *pdf_paths* (in order) into *output_path*."""
    from PyPDF2 import PdfFileReader, PdfFileWriter

    writer = PdfFileWriter()
    # Inputs must stay open until the writer has finished, and must be closed
    # afterwards so the temporary files can be deleted (matters on Windows).
    with contextlib.ExitStack() as stack:
        for path in pdf_paths:
            reader = PdfFileReader(stack.enter_context(open(path, 'rb')))
            for index in range(reader.getNumPages()):
                writer.addPage(reader.getPage(index))
        with open(output_path, 'wb') as output_stream:
            writer.write(output_stream)


def run_xelatex(tex_path):
    """Compile *tex_path* with xelatex and return the path of the produced PDF.

    Raises:
        RuntimeError: if no PDF was produced; the tail of xelatex's output is
            included in the message.
    """
    pdf_path = os.path.splitext(tex_path)[0] + '.pdf'
    # Options go before the file name, as in `xelatex [OPTION]... [TEXNAME]`.
    result = sub.run(
        ['xelatex', '-interaction=nonstopmode', '-halt-on-error', tex_path],
        stdout=sub.PIPE, stderr=sub.PIPE)
    if not os.path.exists(pdf_path):
        log_tail = result.stdout.decode('utf-8', errors='replace')[-2000:]
        raise RuntimeError('xelatex produced no PDF for %s (exit code %d):\n%s'
                           % (tex_path, result.returncode, log_tail))
    if result.returncode != 0:
        print('warning: xelatex exited with code %d for %s'
              % (result.returncode, tex_path))
    return pdf_path


def remove_quietly(paths):
    """Delete each path, ignoring files that are missing or cannot be removed."""
    for path in paths:
        with contextlib.suppress(OSError):
            os.remove(path)


def main(argv=None):
    """Merge the PDFs referenced by the .bib export in the current directory.

    Writes ``mergedFile.pdf``: a generated cover page (title, table of
    contents, authors) followed by all attached papers with page numbers.

    Args:
        argv: command-line arguments; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=Despriptor,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-t', dest='title', help='the title')
    args = parser.parse_args(argv)
    cover_title = DEFAULT_TITLE if args.title is None else args.title

    bibfiles = glob.glob("./*.bib")
    if not bibfiles:
        raise SystemExit('No .bib file found in the current directory.')
    if len(bibfiles) > 1:
        # As before, only the last file returned by glob is used.
        print('Several .bib files found; using ' + bibfiles[-1])
    with open(bibfiles[-1], 'r', encoding='utf-8') as bib_handle:
        bibdata = bibtexparser.load(bib_handle)

    body_base = new_temp_basename()
    cover_base = new_temp_basename()
    temp_pdfs = []
    try:
        authors = []
        titles_and_pages = []
        include_lines = []
        for entry in bibdata.entries:
            file_field = entry.get('file')
            if file_field is None:
                continue
            if len(split_unescaped(file_field, ';')) > 1:
                print('I just reserve the first pdf file')
            path, mime = select_pdf_attachment(file_field)
            if 'author' in entry:
                authors.append(entry['author'])
            # Never emit an empty TOC line; fall back to the entry key.
            paper_title = entry.get('title') or entry.get('ID', 'untitled')

            pages = 0
            if mime == PDF_MIME:
                # Re-save under a random name before handing it to pdfpages.
                # NOTE(review): presumably this gives LaTeX a simple file name
                # and a normalised PDF -- confirm the original intent.
                temp_pdf = new_temp_basename() + '.pdf'
                temp_pdfs.append(temp_pdf)
                pages = resave_pdf('./' + path, temp_pdf)
                # Only papers that actually produced a PDF are included.
                include_lines.append(
                    r'\includepdf[pages=-,pagecommand={\thispagestyle{plain}}]'
                    + '{' + temp_pdf + '}' + '\n')
            elif mime is not None:
                print(mime)
            else:
                print('skipping attachment with unrecognised file field')
            titles_and_pages.append({'title': paper_title, 'pages': pages})

        body_tex = body_base + '.tex'
        with open(body_tex, 'w', encoding='utf-8') as tex_handle:
            tex_handle.write(BODY_PREAMBLE + ''.join(include_lines) + BODY_END)
        body_pdf = run_xelatex(body_tex)

        # create cover
        cover_source = render_cover(cover_title,
                                    build_table_of_contents(titles_and_pages),
                                    join_authors(authors))
        cover_tex = cover_base + '.tex'
        # Explicit UTF-8: the platform default encoding may reject non-ASCII
        # titles or author names.
        with open(cover_tex, 'w', encoding='utf-8') as tex_handle:
            tex_handle.write(cover_source)
        cover_pdf = run_xelatex(cover_tex)

        # merge to the final pdf
        print('merge cover and pdfs')
        merge_pdfs([cover_pdf, body_pdf], OUTPUT_PDF)
    finally:
        # Clean up even when a step failed, so no random-named files pile up.
        remove_quietly(temp_pdfs)
        remove_quietly(base + '.' + extension
                       for base in (cover_base, body_base)
                       for extension in ('aux', 'log', 'pdf', 'tex'))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment