Skip to content

Instantly share code, notes, and snippets.

@TwistingTwists
Last active March 14, 2020 08:22
Show Gist options
  • Save TwistingTwists/67029df4a9f1d5847337dd4ca39e9f35 to your computer and use it in GitHub Desktop.
Save TwistingTwists/67029df4a9f1d5847337dd4ca39e9f35 to your computer and use it in GitHub Desktop.
Extract text from pdf
*.jar
*.pdf
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

`python solutions.py --f 5A.pdf` -- for the normal output file with question numbers. `python solutions.py --f 5A.pdf --ans True` -- to generate only the answers, ready to copy into Excel.

Insights Pre 2019 Questions extraction

`python InsightsPre.py --f 26_with_solutions.pdf` creates Qs.txt, a text file of all questions, which can be converted to PDF as follows:

generate pdf from Qs.txt

python txt2pdf.py Qs.txt

import fire
import re
import sys
import csv
from helper import multiple_replace, getText, cleantext, getTikaAppText, fileExtensionChange
# sys.path.append("/Users/abhishektripathi/Downloads/InsightQuiz/pdfans/")
def var2File(variable, file):
    """Write the string ``variable`` to the path ``file`` and report it.

    The target is opened in text write mode, so any existing content is
    replaced. A confirmation line naming the file is printed to stdout.
    """
    with open(file, 'w') as handle:
        handle.write(variable)
    print("saved variable s to ", file)
def saveasCSV(mylist, file):
    """Write every tuple of ``mylist`` as one comma-separated row of ``file``.

    Rows are terminated with a bare line feed, and the file is opened with
    ``newline=''`` so the csv writer controls line endings itself.
    """
    with open(file, 'w', newline='') as out:
        csv.writer(out, lineterminator='\n').writerows(mylist)
# wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# wr.writerow(mylist)
def getRegexSolutions(s):
    """Extract ``(question number, answer)`` pairs from answer-key text.

    The pattern looks for ``Q`` followed by any single character and a run of
    digits, an optional ``)``, optional whitespace, one or more ``~``, the
    word ``Ans`` optionally followed by ``)`` or ``:``, whitespace, and one
    word character (the answer).

    Args:
        s: Text to scan; the pattern requires at least one ``~`` before
            ``Ans``.

    Returns:
        A list of ``(number, answer)`` string tuples in order of appearance.
    """
    # Only the final iteration of the pattern is kept; earlier drafts were
    # overwritten before use. https://regex101.com/r/SANmul/1
    reg = r"(Q.(\d{1,})(?:\)|))(?:\s{1,}|)\~{1,}Ans(?:\)|\:|)\s{1,}(\w)"
    return [(number, answer) for _, number, answer in re.findall(reg, s)]
def ForumAnswers(*files):
    """Extract answer keys from each given file into ``.txt`` and ``.csv`` files.

    For every path in ``files`` the text is pulled out with
    ``getTikaAppText``, newlines are flattened by ``cleantext``, the flattened
    text is saved next to the input with a ``.txt`` extension, and the
    (question, answer) pairs found by ``getRegexSolutions`` are saved with a
    ``.csv`` extension. A summary line is printed per file.
    """
    for source_path in files:
        text = cleantext(getTikaAppText(source_path))
        var2File(text, fileExtensionChange(source_path, ".txt"))
        pairs = getRegexSolutions(text)
        csv_path = fileExtensionChange(source_path, ".csv")
        saveasCSV(pairs, csv_path)
        print(str(len(pairs)), "Questions saved successfully to ", csv_path)
# When run as a script, expose ForumAnswers on the command line via Fire.
if __name__ == "__main__":
    fire.Fire(ForumAnswers)
import re
from tikapp import TikaApp
import os
import PyPDF2
import fire
from tika import parser
# https://stackoverflow.com/questions/44429610/apache-tika-dependencies-without-maven-which-dependencies-to-download
####################
# text cleaning and processing
####################
def cleantext(s):
    """Return ``s`` with every newline character replaced by ``~``."""
    pieces = s.split("\n")
    return "~".join(pieces)
def multiple_replace(dict, text):
    """Replace every occurrence of each key of ``dict`` in ``text`` by its value.

    All keys are combined into a single alternation (each key escaped, so
    they are matched literally) and the text is rewritten in one pass. When
    several keys could match at the same position, the one that comes first
    in the mapping's iteration order wins.

    Args:
        dict: Mapping of literal search strings to replacement strings. The
            parameter name shadows the builtin but is kept for callers.
        text: The string to rewrite.

    Returns:
        The rewritten string; ``text`` unchanged when the mapping is empty.
    """
    # An empty mapping would build the pattern "()", which matches the empty
    # string at every position and then fails the lookup with KeyError.
    if not dict:
        return text
    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: dict[mo.group(0)], text)
####################
# tika parser getting text from pdf
####################
def getText(fname):
    """Parse ``fname`` with ``tika.parser`` and return its ``'content'`` entry."""
    return parser.from_file(fname)['content']
def getTikaAppText(fname):
    """Return the content extracted from ``fname`` by the Tika app jar.

    The jar is looked up at ``~/tika-app-1.20.jar`` (with ``~`` expanded to
    the current user's home directory) and driven through ``TikaApp``.
    """
    jar_path = os.path.expanduser("~/tika-app-1.20.jar")
    return TikaApp(file_jar=jar_path).extract_only_content(fname)
def getPyPDFText(f):
    """Extract text from a PDF using PyPDF2.

    Prints the page count, then prints each page's extracted text as it is
    read.

    Args:
        f: Path of the PDF file to read.

    Returns:
        The extracted text of all pages concatenated in page order.
    """
    page_texts = []
    # The context manager closes the file even if extraction fails.
    with open(f, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        print(pdfReader.numPages)
        for pg in range(pdfReader.numPages):
            # Extract once and reuse the result for both output and return.
            page_text = pdfReader.getPage(pg).extractText()
            page_texts.append(page_text)
            print(page_text)
    return "".join(page_texts)
####################
# saving text to file
####################
def savetofile(s, f="pdf.txt"):
    """Write the string ``s`` to the file at path ``f`` and print a notice.

    Args:
        s: Text to write.
        f: Destination path; defaults to ``pdf.txt``. Existing content is
            replaced.
    """
    # A context manager guarantees the handle is closed even when the write
    # raises, and avoids rebinding the path parameter to the file object.
    with open(f, 'w') as out:
        out.write(s)
    print(" \nFile saved! ")
def SaveRawText(fname):
    """Save the unmodified text that ``getTikaAppText`` returns for ``fname``.

    The text is written to ``rawPDF.txt`` through ``savetofile``; no
    cleaning step is applied.
    """
    savetofile(getTikaAppText(fname), "rawPDF.txt")
####################
# JUNK
####################
def fileExtensionChange(renamee, new_extension):
    """Return the path ``renamee`` with its extension swapped for ``new_extension``.

    Only the string is computed; nothing is renamed on disk. The current
    extension is whatever ``os.path.splitext`` considers the extension, and
    ``new_extension`` is appended verbatim (include the leading dot).
    """
    stem = os.path.splitext(renamee)[0]
    return stem + new_extension
def kachra():
    """Scratch pad of regex experiments kept in the file's JUNK section.

    Replaces the text following ``Q Source:`` with ``QuestionS`` and saves
    the result to ``S.txt``; the remaining assignments are alternative
    question/answer patterns that are never applied.

    NOTE(review): ``s`` is neither a parameter nor assigned in this function,
    so calling it raises ``NameError`` unless a module-level ``s`` exists.
    """
    # All patterns are raw strings so that ``\s``, ``\d`` and ``\w`` reach the
    # regex engine without triggering invalid-escape warnings.
    # replacing text after "Q Source" with "QuestionS"
    Qinsert = r"(?<=Q Source:)\s\S.+"
    S = re.sub(Qinsert, 'QuestionS', s)
    savetofile(S, "S.txt")
    # find Q and answer pair # https://regex101.com/r/rwxxDw/1
    QA = r"-?(?<=QuestionS)\s*[\d]*|-?(Correct Answer : )(\w)"
    # https://regex101.com/r/rwxxDw/2
    QA = r"-?((?<=QuestionS)\s*\d*)|-?(?<=Correct Answer : )(\w)"
    QA = r"-?((?<=QuestionS)\s*\d\s)|-?(?<=Correct Answer : )(\w)"
    Q = r"((?<=QuestionS)\s*\d\s)"
    A = r"(?<=Correct Answer : )(\w)"
def parse(f):
    """Scratch experiment from the JUNK section: regex-match question text.

    Finds every ``<digits>. ... Solution: `` span in ``s``, prints the
    matches and passes them to ``listToFile``.

    NOTE(review): ``s``, ``listToFile`` and ``pdfFileObj`` are not assigned
    in this function and ``f`` is never used, so this looks unrunnable as
    written unless those names exist at module level -- confirm before use.
    """
    # ans = []
    # reg = r"(?<=Q\s)[\d\.\w]*"
    # reg = r"Solution:\s\w"
    reg = r"(\d+\.)(.*?)(Solution:\s)"
    ans = re.findall(reg, s)
    print(ans)
    # ans = verifySort(ans)
    listToFile(ans)
    # closing the pdf file object
    pdfFileObj.close()
def parset(f):
    """Regex-match ``<digits>. ... Solution: `` spans in the text of ``f``.

    The text comes from ``getText``; the list of matches is printed and then
    handed to ``listToFile``.

    NOTE(review): ``listToFile`` is not defined alongside this function in
    the visible module -- confirm where it is expected to come from.
    """
    pattern = r"(\d+\.)(.*?)(Solution:\s)"
    matches = re.findall(pattern, getText(f))
    print(matches)
    listToFile(matches)
# No single entry point is defined in this helper module (the previously
# referenced InsightsPreQuestions lives in another script and raised
# NameError here), so expose every module-level helper as a Fire command,
# e.g. ``python helper.py SaveRawText some.pdf``.
if __name__ == "__main__":
    fire.Fire()
1 B
2 C
3 B
4 D
5 B
6 D
7 D
8 A
9 A
10 A
11 D
12 D
13 D
14 D
15 D
16 A
17 A
18 D
19 D
20 D
21 B
22 B
23 C
24 D
25 C
26 A
27 A
28 A
29 B
30 C
31 D
32 C
33 D
34 A
35 B
36 B
37 C
38 A
39 C
40 B
41 C
42 A
43 C
44 D
45 A
46 C
47 B
48 C
49 B
50 A
51 D
52 C
1 D
54 A
2 C
56 D
57 C
58 B
59 D
60 C
61 B
62 B
63 A
64 D
65 B
66 A
67 A
68 D
69 B
70 D
71 A
72 D
73 C
74 C
75 D
76 C
77 C
78 B
79 C
80 C
81 B
82 B
83 A
84 A
85 B
86 D
87 C
88 C
89 D
90 C
91 B
92 A
93 C
94 B
95 D
96 C
97 D
98 C
99 A
100 D
B
C
B
D
B
D
D
A
A
A
D
D
D
D
D
A
A
D
D
D
B
B
C
D
C
A
A
A
B
C
D
C
D
A
B
B
C
A
C
B
C
A
C
D
A
C
B
C
B
A
D
C
D
A
C
D
C
B
D
C
B
B
A
D
B
A
A
D
B
D
A
D
C
C
D
C
C
B
C
C
B
B
A
A
B
D
C
C
D
C
B
A
C
B
D
C
D
C
A
D
import re
import fire
from tikapp import TikaApp
from helper import savetofile, SaveRawText, getTikaAppText, cleantext, multiple_replace
from urlregex import URL_REGEX
####################
# parsing related functions
####################
def parsing(S):
    """Split flattened quiz text into (question, answer, justification) tuples.

    ``Q Source:`` is prepended to the input so that the first question, which
    has no preceding source line, is matched like all the others.

    Returns a list of 3-tuples: the text from just after ``Q Source:`` up to
    and including ``Correct`` plus one whitespace character, the single word
    character following ``Correct Answer : ``, and the text from just after
    ``Justification :`` through the next ``Q Sou``.
    """
    # https://regex101.com/r/Ij967H/3/
    qaj = r"((?<=Q Source:).*?Correct\s).*?((?<=Correct Answer : )\w).*?((?<=Justification :).*?Q Sou)"
    text = "Q Source:" + S
    return re.compile(qaj, re.MULTILINE).findall(text)
def Ques(QAJ):
    """Join the question parts of ``QAJ`` with promotional material removed.

    The first element of every tuple is passed through ``multiple_replace``
    to blank out the fixed boilerplate strings below; the concatenation is
    then stripped of URLs and of page-number markers.
    """
    boilerplate = {
        "Total Marks : 200": "",
        "Join Our Telegram Channel https://t.me/UPSCMaterials For Instant Updates": "",
        "~Mock Test 26~": "",
        "~~~": "",
        "Correct": "",
    }
    joined = "".join(multiple_replace(boilerplate, item[0]) for item in QAJ)
    return removepagenums(removeurls(joined))
def options_in_front(s):
    """Move a trailing option marker on each line to the front of that line.

    A marker is one of ``A``/``B``/``C``/``D`` followed by exactly one more
    character at the end of a line. Two passes are made: the first moves the
    marker as ``"<marker> <rest>"``; the second does the same for a marker
    that is preceded by whitespace, dropping that whitespace.

    Args:
        s: Multi-line text.

    Returns:
        The rewritten text; lines without a trailing marker are unchanged.
    """
    # https://regex101.com/r/zTMgH7/1
    # https://stackoverflow.com/questions/38974589/regex-search-and-replace-how-to-move-characters-in-a-block-of-text
    rex = re.compile(r"(.*)([ABCD].$)", re.MULTILINE)
    s = rex.sub(r'\2 \1', s)
    rex = re.compile(r"(.*)(\s{1,}?([ABCD].$))", re.MULTILINE)
    s = rex.sub(r'\3 \1', s)
    return s
def removeurls(s):
    """Return ``s`` with every match of ``URL_REGEX`` deleted."""
    url_pattern = re.compile(URL_REGEX)
    return url_pattern.sub("", s)
def removepagenums(s):
    """Delete every ``~<1-3 digits>~`` marker from ``s``.

    Both surrounding ``~`` characters are removed together with the digits.
    """
    return re.compile(r'~\d{1,3}~').sub("", s)
def format_newline(s):
    """Turn every ``~`` in ``s`` back into a newline character.

    This reverses the substitution that ``cleantext`` performs.
    """
    return multiple_replace({"~": "\n"}, s)
def InsightsPreQuestions(f):
    """Extract the questions of the file ``f`` into ``Qs.txt``.

    Steps: extract text with ``getTikaAppText``, flatten newlines with
    ``cleantext`` (the flattened text is also saved to ``s.txt``), split it
    with ``parsing``, print the number of records found, strip boilerplate
    with ``Ques``, restore newlines, move option markers to the front of
    their lines, and write the result to ``Qs.txt``.
    """
    flattened = cleantext(getTikaAppText(f))
    savetofile(flattened, "s.txt")
    records = parsing(flattened)
    print("totalQuestions = {0}".format(len(records)))
    questions = options_in_front(format_newline(Ques(records)))
    with open("Qs.txt", "w") as out:
        out.write(questions)
    print("Qs saved")
# When run as a script, expose InsightsPreQuestions on the command line via Fire.
if __name__ == "__main__":
    fire.Fire(InsightsPreQuestions)
from helper import multiple_replace, getText, cleantext
import PyPDF2
import re
import fire
def listToFile(textList):
    """Write a range header and the post-dot part of each entry to ``ans.txt``.

    The first line is ``"<first> to <last>"`` where each side is the
    corresponding entry with its dots removed. Every following line is the
    text between the first and second dot of an entry (``split(".")[1]``).

    Args:
        textList: Sequence of strings that each contain at least one ``.``.
            An empty sequence produces an empty file.
    """
    with open('ans.txt', "w") as outF:
        # Nothing to summarise; avoid indexing into an empty sequence.
        if not textList:
            return
        # write range parsed
        start = str(textList[0]).split(".")
        # Index the last element itself: slicing with [-1:] would stringify
        # a one-element list and leak "['...']" into the header.
        end = str(textList[-1]).split(".")
        outF.write("".join(start) + " to " + "".join(end) + "\n")
        for line in textList:
            # write line to output file
            outF.write(line.split(".")[1])
            outF.write("\n")
def colon_filter(colon):
    """Return ``False`` when ``colon`` is just ``:`` (ignoring surrounding
    whitespace), ``True`` otherwise."""
    return colon.strip() != ":"
def cl(ll):
    """Wrap each inner sequence of ``ll`` in filters that drop junk entries.

    Falsy entries (such as empty strings) and entries rejected by
    ``colon_filter`` are skipped. The result is a list of lazy ``filter``
    iterators, one per inner sequence; callers materialise them as needed.
    """
    return [filter(colon_filter, filter(bool, row)) for row in ll]
#########################
# testing for ipython
# A7 = "Test-7a.pdf"
# A7s = getText(A7)
# A7s = cleantext(A7s)
# %store A7s >> A7s.txt
#########################
def insightsSolutionstoFile(textList, ansonly=False):
    """Write the parsed solution lines to ``insights.txt``.

    Each line is split on spaces, colons and dots, cleaned with ``cl``, and
    printed. With ``ansonly`` true only the third surviving field is
    written; otherwise the first and third fields are written separated by a
    space. No newline is added here.
    """
    with open('insights.txt', "w") as outF:
        for line in textList:
            # write line to output file
            fields = list(cl([re.split(r" |:|\.", line)])[0])
            print(fields)
            outF.write(fields[2] if ansonly else fields[0] + " " + fields[2])
def pit(f):
    """Return all question/solution regex matches found in the file ``f``.

    The text is read with ``getText`` and flattened with ``cleantext`` before
    matching. The pattern has two alternatives -- one anchored on
    ``PRELIMS 2019`` followed by three or more whitespace characters, one on
    a run of five or more whitespace characters -- and each captures a
    numbered item up to ``Solution`` (with an optional ``:``) and the word
    character after it. The result is ``re.findall``'s list of group tuples.
    """
    # https://regex101.com/r/aG4Dwh/2 for optional ":" after 'Solution'.
    # It handles both 'Solution: B' and 'Solution B'.
    reg = r"-?(PRELIMS\s2019\s{3,}(\d{1,4}\.)(.*?)(Solution(:?)\s{1,3}\w))|-?(\s{5,}(\d{1,4}\.)(.*?)(Solution(:?)\s{1,3}\w))"
    flattened = cleantext(getText(f))
    return re.findall(reg, flattened)
def InsightsSolutions(f, ans=False):
    """
    revision quiz for prelims : https://www.insightsonindia.com/revision-quiz-for-upsc-prelims-2019/
    Currently can successfully parse insights revision test at https://www.insightsonindia.com/2019/03/24/download-insightsias-revision-tests-for-upsc-civil-services-preliminary-exam-2019-tests-5-to-8/

    Matches are collected with ``pit``, cleaned of empty and colon-only
    fields with ``cl``, reduced to "<first> <third>" of their last three
    fields, and written out by ``insightsSolutionstoFile``; ``ans`` is
    forwarded as its ``ansonly`` flag.
    """
    # Materialise every cleaned match before reporting any of them.
    cleaned = [list(fields) for fields in cl(pit(f))]
    lines = []
    for fields in cleaned:
        print(fields)
        # The last three fields are unpacked; the middle one is not written.
        Q, A, S = fields[-3:]
        lines.append(Q + " " + S + "\n")
    # save solution to file
    insightsSolutionstoFile(lines, ans)
# When run as a script, expose InsightsSolutions on the command line via Fire.
if __name__ == "__main__":
    fire.Fire(InsightsSolutions)
from solutions import getText
import fire
import re
def listToFile(textList):
    """Write each string of ``textList`` on its own line of ``sans.txt``."""
    with open('sans.txt', "w") as outF:
        for entry in textList:
            outF.write(entry + "\n")
def solutionText(f):
    """Save every ``Q. ... Source`` span of the file ``f`` via ``listToFile``.

    The text from ``getText`` is stripped and its newlines are replaced by
    spaces before matching; only the text between ``Q.`` and the next
    ``Source`` is captured. The number of matches is printed.
    """
    forumsolution = r"Q\.(.+?)Source"
    single_line = getText(f).strip().replace("\n", " ")
    found = re.findall(forumsolution, single_line, re.MULTILINE)
    print(len(found))
    listToFile(found)
# When run as a script, expose solutionText on the command line via Fire.
if __name__ == "__main__":
    fire.Fire(solutionText)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# taken from https://github.com/baruchel/txt2pdf
import argparse
import reportlab.lib.pagesizes
from reportlab.pdfgen.canvas import Canvas
from reportlab.lib import units
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import re
import sys
import os
class Margins(object):
    """Page margins stored as given and exposed multiplied by ``units.cm``."""

    def __init__(self, right, left, top, bottom):
        self._right, self._left = right, left
        self._top, self._bottom = top, bottom

    @property
    def right(self):
        """Right margin times ``units.cm``."""
        return self._right * units.cm

    @property
    def left(self):
        """Left margin times ``units.cm``."""
        return self._left * units.cm

    @property
    def top(self):
        """Top margin times ``units.cm``."""
        return self._top * units.cm

    @property
    def bottom(self):
        """Bottom margin times ``units.cm``."""
        return self._bottom * units.cm

    def adjustLeft(self, width):
        """Reduce the stored left margin by ``width`` divided by ``units.cm``."""
        self._left = self._left - width / units.cm
class PDFCreator(object):
    """Render a plain-text file to PDF on a reportlab canvas.

    All layout settings are taken from an argparse-style namespace passed to
    the constructor; call ``generate()`` to produce the output file.
    """

    appName = "txt2pdf (version 1.0)"

    def __init__(self, args, margins):
        """Configure the canvas, font and page geometry.

        Args:
            args: Namespace carrying the command-line options read below
                (media, landscape, font, font_size, output, ...).
            margins: A ``Margins`` instance; its left margin is reduced when
                line numbering is enabled.
        """
        pageWidth, pageHeight = reportlab.lib.pagesizes.__dict__[args.media]
        if args.landscape:
            pageWidth, pageHeight = reportlab.lib.pagesizes.landscape(
                (pageWidth, pageHeight))
        self.author = args.author
        self.title = args.title
        self.keywords = args.keywords
        self.subject = args.subject
        self.canvas = Canvas(args.output, pagesize=(pageWidth, pageHeight))
        self.canvas.setCreator(self.appName)
        if len(args.author) > 0:
            self.canvas.setAuthor(args.author)
        if len(args.title) > 0:
            self.canvas.setTitle(args.title)
        if len(args.subject) > 0:
            self.canvas.setSubject(args.subject)
        if len(args.keywords) > 0:
            self.canvas.setKeywords(args.keywords)
        self.fontSize = args.font_size
        # Equality on purpose: a membership test against the bare string
        # 'Courier' would be a substring test and also accept values such
        # as 'Cour' or ''.
        if args.font != 'Courier':
            self.font = 'myFont'
            pdfmetrics.registerFont(TTFont('myFont', args.font))
        else:
            self.font = args.font
        self.kerning = args.kerning
        self.margins = margins
        self.leading = (args.extra_vertical_space + 1.2) * self.fontSize
        self.linesPerPage = int(
            (self.leading + pageHeight
             - margins.top - margins.bottom - self.fontSize) / self.leading)
        self.lppLen = len(str(self.linesPerPage))
        fontWidth = self.canvas.stringWidth(
            ".", fontName=self.font, fontSize=self.fontSize)
        self.lineNumbering = args.line_numbers
        if self.lineNumbering:
            margins.adjustLeft(fontWidth * (self.lppLen + 2))
        contentWidth = pageWidth - margins.left - margins.right
        self.charsPerLine = int(
            (contentWidth + self.kerning) / (fontWidth + self.kerning))
        self.top = pageHeight - margins.top - self.fontSize
        self.filename = args.filename
        self.verbose = not args.quiet
        self.breakOnBlanks = args.break_on_blanks
        self.encoding = args.encoding
        self.pageNumbering = args.page_numbers
        # Kept on the instance so page generation does not depend on a
        # module-level ``args``; 10 mirrors the command-line default.
        self.minimumPageLength = getattr(args, 'minimum_page_length', 10)
        if self.pageNumbering:
            self.pageNumberPlacement = \
                (pageWidth / 2, margins.bottom / 2)

    def _process(self, data):
        """Yield ``(done, lineno, text)`` for every line of the open file.

        ``done`` is true once the number of bytes consumed so far equals the
        file size reported by ``os.fstat``; ``text`` has trailing CR/LF
        characters removed.
        """
        flen = os.fstat(data.fileno()).st_size
        lineno = 0
        read = 0
        for line in data:
            lineno += 1
            if sys.version_info.major == 2:
                read += len(line)
                yield flen == \
                    read, lineno, line.decode(self.encoding).rstrip('\r\n')
            else:
                read += len(line.encode(self.encoding))
                yield flen == read, lineno, line.rstrip('\r\n')

    def _readDocument(self):
        """Yield ``(done, text)`` pairs, wrapping lines wider than the page.

        Lines longer than ``charsPerLine`` are cut into consecutive pieces of
        that width and a warning is emitted through ``_scribble``.
        """
        with open(self.filename, 'r') as data:
            for done, lineno, line in self._process(data):
                if len(line) > self.charsPerLine:
                    self._scribble(
                        "Warning: wrapping line %d in %s" %
                        (lineno + 1, self.filename))
                    while len(line) > self.charsPerLine:
                        yield done, line[:self.charsPerLine]
                        line = line[self.charsPerLine:]
                yield done, line

    def _newpage(self):
        """Return a fresh text object positioned at the top-left of a page.

        When page numbering is on, the current page number is drawn at the
        precomputed placement first.
        """
        textobject = self.canvas.beginText()
        textobject.setFont(self.font, self.fontSize, leading=self.leading)
        textobject.setTextOrigin(self.margins.left, self.top)
        textobject.setCharSpace(self.kerning)
        if self.pageNumbering:
            self.canvas.drawString(
                self.pageNumberPlacement[0],
                self.pageNumberPlacement[1],
                str(self.canvas.getPageNumber()))
        return textobject

    def _scribble(self, text):
        """Write ``text`` plus a line separator to stderr unless quiet."""
        if self.verbose:
            sys.stderr.write(text + os.linesep)

    def generate(self):
        """Render the input file and save the PDF.

        Uses the break-on-blanks layout when that option is set, otherwise
        the plain layout, and reports the page count through ``_scribble``.
        """
        self._scribble(
            "Writing '%s' with %d characters per "
            "line and %d lines per page..." %
            (self.filename, self.charsPerLine, self.linesPerPage)
        )
        if self.breakOnBlanks:
            pageno = self._generateBob(self._readDocument())
        else:
            pageno = self._generatePlain(self._readDocument())
        self._scribble("PDF document: %d pages" % pageno)

    def _generatePlain(self, data):
        """Lay out lines page by page, honouring form feeds; return page count.

        Form feed characters are stripped from each line. If a line contained
        any and at least ``minimumPageLength`` lines are already on the page,
        a page break is inserted before the line is written.
        """
        pageno = 1
        lineno = 0
        page = self._newpage()
        for _, line in data:
            lineno += 1

            # Handle form feed characters.
            (line, pageBreakCount) = re.subn(r'\f', r'', line)
            if pageBreakCount > 0 and lineno >= self.minimumPageLength:
                for _ in range(pageBreakCount):
                    self.canvas.drawText(page)
                    self.canvas.showPage()
                    lineno = 0
                    pageno += 1
                    page = self._newpage()
                    # With a positive minimum only one break is emitted per
                    # line, however many form feeds it contained.
                    if self.minimumPageLength > 0:
                        break

            page.textLine(line)

            if lineno == self.linesPerPage:
                self.canvas.drawText(page)
                self.canvas.showPage()
                lineno = 0
                pageno += 1
                page = self._newpage()
        if lineno > 0:
            self.canvas.drawText(page)
        else:
            pageno -= 1
        self.canvas.save()
        return pageno

    def _writeChunk(self, page, chunk, lineno):
        """Write the lines of ``chunk`` to ``page``.

        With line numbering on, each line is prefixed by its number, computed
        so that the last line of the chunk gets ``lineno``.
        """
        if self.lineNumbering:
            formatstr = '%%%dd: %%s' % self.lppLen
            for index, line in enumerate(chunk):
                page.textLine(
                    formatstr % (lineno - len(chunk) + index + 1, line))
        else:
            for line in chunk:
                page.textLine(line)

    def _generateBob(self, data):
        """Lay out text so that pages only break on blank lines; return page count.

        Lines are buffered into a chunk that is flushed to the page at each
        blank line or at the last line of the input. When a page fills up,
        the pending chunk is carried over to the next page.
        """
        pageno = 1
        lineno = 0
        page = self._newpage()
        chunk = list()
        for last, line in data:
            if lineno == self.linesPerPage:
                self.canvas.drawText(page)
                self.canvas.showPage()
                # The unflushed chunk moves to the new page, so the new
                # page's line count starts at the chunk's length.
                lineno = len(chunk)
                pageno += 1
                page = self._newpage()
            lineno += 1
            chunk.append(line)
            if last or len(line.strip()) == 0:
                self._writeChunk(page, chunk, lineno)
                chunk = list()
        if lineno > 0:
            self.canvas.drawText(page)
            self.canvas.showPage()
        else:
            pageno -= 1
        if len(chunk) > 0:
            page = self._newpage()
            self.canvas.drawText(page)
            self.canvas.showPage()
            pageno += 1
        self.canvas.save()
        return pageno
# Command-line interface: one positional input file plus layout options.
# ``parser`` and ``args`` are module-level names; the parse and the PDF
# generation below run as soon as this module is executed.
parser = argparse.ArgumentParser()
parser.add_argument('filename')
parser.add_argument(
'--font',
'-f',
default='Courier',
help='Select a font (True Type format) by its full path')
parser.add_argument(
'--font-size',
'-s',
type=float,
default=10.0,
help='Size of the font')
parser.add_argument(
'--extra-vertical-space',
'-v',
type=float,
default=0.0,
help='Extra vertical space between lines')
parser.add_argument(
'--kerning',
'-k',
type=float,
default=0.0,
help='Extra horizontal space between characters')
parser.add_argument(
'--media',
'-m',
default='A4',
help='Select the size of the page (A4, A3, etc.)')
parser.add_argument(
'--minimum-page-length',
'-M',
type=int,
default=10,
help='The minimum number of lines before a form feed character will change the page')
parser.add_argument(
'--landscape',
'-l',
action="store_true",
default=False,
help='Select landscape mode')
# Margins are given in centimetres (see the help texts).
parser.add_argument(
'--margin-left',
'-L',
type=float,
default=2.0,
help='Left margin (in cm unit)')
parser.add_argument(
'--margin-right',
'-R',
type=float,
default=2.0,
help='Right margin (in cm unit)')
parser.add_argument(
'--margin-top',
'-T',
type=float,
default=2.0,
help='Top margin (in cm unit)')
parser.add_argument(
'--margin-bottom',
'-B',
type=float,
default=2.0,
help='Bottom margin (in cm unit)')
parser.add_argument(
'--output',
'-o',
default='output.pdf',
help='Output file')
# Document metadata; empty strings are the defaults.
parser.add_argument(
'--author',
default='',
help='Author of the PDF document')
parser.add_argument(
'--title',
default='',
help='Title of the PDF document')
parser.add_argument(
'--quiet',
'-q',
action='store_true',
default=False,
help='Hide detailed information')
parser.add_argument('--subject', default='',
help='Subject of the PDF document')
parser.add_argument('--keywords', default='',
help='Keywords of the PDF document')
parser.add_argument(
'--break-on-blanks',
'-b',
action='store_true',
default=False,
help='Only break page on blank lines')
parser.add_argument(
'--encoding',
'-e',
type=str,
default='utf8',
help='Input encoding')
parser.add_argument(
'--page-numbers',
'-n',
action='store_true',
help='Add page numbers')
parser.add_argument(
'--line-numbers',
action='store_true',
help='Add line numbers')
args = parser.parse_args()
# Margins takes its arguments in the order right, left, top, bottom.
PDFCreator(args, Margins(
args.margin_right,
args.margin_left,
args.margin_top,
args.margin_bottom)).generate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment