Last active
October 20, 2020 07:22
-
-
Save sandyUni/8346a5299b8a0e4da222471b5046e208 to your computer and use it in GitHub Desktop.
A PDF merge tool that automatically removes annotations and merges papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*-
# Help text shown by ``argparse`` (passed as ``description=`` together with
# RawTextHelpFormatter, so the line breaks and indentation below are printed
# verbatim). Kept as a raw string so the backslash in the e-mail address is
# literal.
Despriptor = r'''
A PDF merge tool to auto remove annotations and merge papers
work-flow:
    1. Export biblatex with attachment files through Zotero(https://www.zotero.org)
    2. cd to the export root folder, and execute the command `mergeFile.py -t yourtitle`
requirements:
    0.python version: tested with 3.7.3, win64
    1.python packages
        bibtexparser, PyPDF2
    2.latex package: tested with texlive
Author: Lai Jun(mfary\#139.com)
'''
import argparse
import contextlib
import glob
import os
import re
import subprocess as sub
import uuid

import bibtexparser
# LaTeX wrapper for the collected papers: every staged PDF is pulled in with
# pdfpages and gets the "plain" page style, which prints \thepage in the
# right-hand footer.
CONTENT_PREAMBLE = r'''
\documentclass{article}
\usepackage[final]{pdfpages}
\usepackage{fancyhdr}
\fancyhf{}
\usepackage[a4paper,bindingoffset=0.2in,%
left=1in,right=1in,top=1in,bottom=1in,%
footskip=.25in]{geometry}
\fancypagestyle{plain}{%
\fancyhf{} % clear all header and footer fields
\fancyfoot[RO,RE]{{\vspace*{0.5\baselineskip}\thepage}} %RO=right odd, RE=right even
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}}
\begin{document}
'''

CONTENT_END = r'''
\end{document}'''

# One of these, followed by "{<file>}", is emitted per staged PDF.
INCLUDE_PDF = r"""\includepdf[pages=-,pagecommand={\thispagestyle{plain}}]"""

# Cover page. The literal words ``yourtitle``, ``yourtableofcontent`` and
# ``yourAuthors`` are placeholders filled in by fill_cover_template().
COVER_TEMPLATE = r'''
\documentclass[12pt,twoside,a4paper]{report}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{array}
\newcolumntype{P}[1]{>{\raggedright\arraybackslash}p{#1}}
\usepackage{longtable}
\usepackage{geometry}
\usepackage{marginnote}
\usepackage{color}
\newcommand*{\titleTH}{\begingroup% T&H Typography
\raggedleft
\vspace*{\baselineskip}
%{\bfseries The Big Book of}\\[\baselineskip]
{\textcolor{red}{\Huge \textbf{yourtitle}}}\\\vspace*{3\baselineskip}
\raggedright
{yourtableofcontent} \\[0.167\textheight]
{yourAuthors}\par
\raggedleft
\vfill{\Large \today }\par
\vspace*{3\baselineskip}\endgroup}
\begin{document}
\thispagestyle{empty}
\titleTH
\end{document}
'''

OUTPUT_PDF = "mergedFile.pdf"


def random_stem():
    """Return a fresh ``./<hex>`` path stem for a temporary file in the cwd.

    Temporary files live in the current directory because xelatex writes its
    output next to where it is run and the generated .tex refers to the staged
    PDFs by relative path.
    """
    return './' + uuid.uuid4().hex


def pick_pdf_attachment(file_field):
    """Choose the attachment to use from a bib entry's ``file`` field.

    The field is split on ``;`` into attachments. The first attachment whose
    text contains ``application/pdf`` is chosen; if none does, the first
    attachment is chosen. The chosen attachment is split on ``:`` and its
    second and third parts are returned as ``(path, mimetype)``
    (presumably Zotero's ``label:path:mimetype`` layout -- confirm against
    the exporter in use).

    Returns ``(None, None)`` when the chosen attachment has fewer than three
    ``:``-separated parts, instead of failing with IndexError.
    """
    attachments = file_field.split(';')
    chosen = next(
        (item for item in attachments if 'application/pdf' in item),
        attachments[0],
    )
    parts = chosen.split(':')
    if len(parts) < 3:
        return None, None
    return parts[1], parts[2]


def rewrite_pdf(source_path, target_path):
    """Copy every page of ``source_path`` into a new PDF at ``target_path``.

    Returns the number of pages copied. PyPDF2 is imported here so that the
    module can be imported without it; it is only needed when PDFs are
    actually processed.
    """
    from PyPDF2 import PdfFileReader, PdfFileWriter

    # The reader may read lazily from the handle, so the source stays open
    # until the copy has been written out.
    with open(source_path, 'rb') as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()
        page_count = reader.getNumPages()
        for index in range(page_count):
            writer.addPage(reader.getPage(index))
        with open(target_path, 'wb') as target:
            writer.write(target)
    return page_count


def stage_entries(entries, temp_files):
    """Stage the PDF attachment of every bib entry that has a ``file`` field.

    For each such entry the chosen attachment (see pick_pdf_attachment) is
    rewritten to a randomly named PDF in the current directory, whose name is
    appended to ``temp_files`` so the caller can delete it later.

    Returns ``(include_lines, authors, titlesandpages)``:
      * ``include_lines`` -- the ``\\includepdf`` lines, one per staged PDF;
      * ``authors`` -- the ``author`` field of every entry with a ``file``
        field (entries lacking an author are skipped);
      * ``titlesandpages`` -- ``{'title', 'pages'}`` dicts in entry order;
        ``pages`` is 0 when nothing was staged for the entry.
    """
    include_lines = []
    authors = []
    titlesandpages = []
    for entry in entries:
        if 'file' not in entry:
            continue
        file_field = entry['file']
        if ';' in file_field:
            print('I just reserve the first pdf file')
        path, mimetype = pick_pdf_attachment(file_field)
        if 'author' in entry:
            authors.append(entry['author'])

        pages = 0
        if mimetype == 'application/pdf':
            source = os.path.join('.', path)
            if os.path.isfile(source):
                staged = random_stem() + '.pdf'
                # Registered before writing so a partially written file is
                # still cleaned up if the rewrite fails.
                temp_files.append(staged)
                pages = rewrite_pdf(source, staged)
                # Only entries that really produced a staged PDF are
                # included; a non-PDF attachment must not re-include the
                # previous entry's file.
                include_lines.append(INCLUDE_PDF + '{' + staged + '}' + "\n")
            else:
                print('attachment not found, skipped: ' + source)
        elif mimetype is not None:
            # Report the type of the attachment that is being left out.
            print(mimetype)
        titlesandpages.append({'title': entry.get('title', ''), 'pages': pages})
    return ''.join(include_lines), authors, titlesandpages


def build_table_of_contents(titlesandpages):
    """Build the cover's table of contents from ``{'title', 'pages'}`` dicts.

    Entries with a non-zero page count get a ``:first--last`` suffix, with
    page numbers running continuously from 1; entries with zero pages are
    listed by title only. Lines are joined with the LaTeX line break ``\\\\``.
    """
    lines = []
    next_page = 1
    for item in titlesandpages:
        line = item['title']
        pages = item['pages']
        if pages:
            line += ":" + str(next_page) + '--' + str(next_page + pages - 1)
            next_page += pages
        lines.append(line)
    return '\\\\'.join(lines)


def join_authors(authors, limit=1024):
    """Concatenate authors as ``name; name; ...``.

    Stops adding names once the result is longer than ``limit`` characters,
    so the cover page is not flooded by very long author lists.
    """
    joined = ''
    for author in authors:
        joined += author + '; '
        if len(joined) > limit:
            break
    return joined


def fill_cover_template(template, title, table_of_contents, authors):
    """Substitute the three cover placeholders in ``template``.

    All placeholders are replaced in a single pass, so placeholder words that
    happen to occur inside the inserted title, contents or authors are left
    untouched.
    """
    values = {
        'yourtitle': title,
        'yourtableofcontent': table_of_contents,
        'yourAuthors': authors,
    }
    pattern = re.compile('|'.join(re.escape(key) for key in values))
    # A function replacement inserts the value verbatim (no backslash
    # processing), which matters because the values contain LaTeX.
    return pattern.sub(lambda match: values[match.group(0)], template)


def compile_tex(stem, tex_source):
    """Write ``tex_source`` to ``<stem>.tex``, run xelatex, return ``<stem>.pdf``.

    Raises RuntimeError when xelatex did not leave a PDF behind; the tail of
    its output is printed first to help diagnose the failure.
    """
    tex_path = stem + '.tex'
    pdf_path = stem + '.pdf'
    # Always UTF-8: titles and author names may contain characters that the
    # platform default encoding cannot represent.
    with open(tex_path, 'w', encoding='utf8') as handle:
        handle.write(tex_source)
    # Options are placed before the file name; anything after the file name
    # is in the position the engine's usage line reserves for TeX commands.
    result = sub.run(
        ['xelatex', '-interaction=nonstopmode', '-halt-on-error', tex_path],
        stdout=sub.PIPE,
        stderr=sub.PIPE,
    )
    produced = os.path.isfile(pdf_path)
    if result.returncode != 0 or not produced:
        print(result.stdout.decode('utf-8', errors='replace')[-2000:])
    if not produced:
        raise RuntimeError('xelatex did not produce ' + pdf_path)
    return pdf_path


def merge_pdfs(pdf_paths, output_path):
    """Concatenate the pages of ``pdf_paths``, in order, into ``output_path``."""
    from PyPDF2 import PdfFileReader, PdfFileWriter

    writer = PdfFileWriter()
    # Inputs must stay open until the writer has finished, and must be closed
    # afterwards so they can be deleted (an open file cannot be removed on
    # Windows).
    with contextlib.ExitStack() as stack:
        for path in pdf_paths:
            reader = PdfFileReader(stack.enter_context(open(path, 'rb')))
            for index in range(reader.getNumPages()):
                writer.addPage(reader.getPage(index))
        with open(output_path, 'wb') as target:
            writer.write(target)


def remove_quietly(paths):
    """Delete each path, ignoring files that are missing or cannot be removed."""
    for path in paths:
        with contextlib.suppress(OSError):
            os.remove(path)


def main(argv=None):
    """Build ``mergedFile.pdf`` from the .bib export in the current directory.

    ``argv`` is the argument list to parse (``None`` means ``sys.argv[1:]``);
    the only option is ``-t TITLE`` for the cover title. Returns a process
    exit status: 0 on success, 1 when no .bib file is present.
    """
    parser = argparse.ArgumentParser(
        description=Despriptor,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument('-t', dest='title', help='the title')
    args = parser.parse_args(argv)
    mytitle = 'Paper Collections for:' if args.title is None else args.title

    bibfiles = glob.glob("./*.bib")
    if not bibfiles:
        print('no .bib file found in the current folder')
        return 1
    # A single output file is written, so only one .bib file can end up in
    # it; as before, that is the last one returned by glob.
    bibfile = bibfiles[-1]
    if len(bibfiles) > 1:
        print('several .bib files found, using ' + bibfile)
    with open(bibfile, 'r', encoding='utf-8') as handle:
        bibdata = bibtexparser.load(handle)

    content_stem = random_stem()
    cover_stem = random_stem()
    staged_pdfs = []
    try:
        include_lines, authors, titlesandpages = stage_entries(
            bibdata.entries, staged_pdfs
        )

        parts = []
        if include_lines:
            parts.append(
                compile_tex(
                    content_stem,
                    CONTENT_PREAMBLE + include_lines + CONTENT_END,
                )
            )
        else:
            # An empty document yields no PDF at all, so skip compiling it.
            print('no PDF attachments found, only the cover is produced')

        # create cover
        cover_source = fill_cover_template(
            COVER_TEMPLATE,
            mytitle,
            build_table_of_contents(titlesandpages),
            join_authors(authors),
        )
        cover_pdf = compile_tex(cover_stem, cover_source)

        # merge to the final pdf, cover first
        print('merge cover and pdfs')
        merge_pdfs([cover_pdf] + parts, OUTPUT_PDF)
    finally:
        # Clean up on success and on failure alike.
        remove_quietly(staged_pdfs)
        remove_quietly(
            stem + '.' + extension
            for stem in (cover_stem, content_stem)
            for extension in ('aux', 'log', 'pdf', 'tex')
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment