Skip to content

Instantly share code, notes, and snippets.

@jg-you jg-you/extraction.py
Last active Oct 14, 2015

Embed
What would you like to do?
Extract parts of a pdf file as a png (dirty hack)
#!/usr/bin/env python3
# Author: Jean-Gabriel Young
# Email: jean.gabriel.young@gmail.com
# -*- coding: utf-8 -*-
import argparse
import subprocess
import os
from PIL import Image
def extract(upper_bound=1, lower_bound=0, margin_x=0.075, margin_y=0.055,
            page=1, onecolumn=False, column=1, dpi=300,
            infile=None, outfile=None):
    """Crop a region out of one page of a PDF and save it as an image.

    The requested page is isolated with ``pdfseparate``, rasterised to PNG
    with ``pdftocairo`` and cropped with PIL. Both command-line tools must
    be available on the PATH.

    Args:
        upper_bound: One vertical edge of the crop box, as a fraction of the
            page height that remains once ``margin_y`` has been removed at
            the top and at the bottom (0 is the top of that area).
        lower_bound: The other vertical edge, in the same units. The two
            bounds may be given in either order; the smaller pixel row is
            used as the top of the crop box.
        margin_x: Horizontal margin, as a fraction of the page width.
        margin_y: Vertical margin, as a fraction of the page height.
        page: Page number handed to ``pdfseparate`` as both the first and
            the last page to extract.
        onecolumn: If true, the crop box runs from the left margin to the
            right margin and ``column`` is ignored.
        column: Used only when ``onecolumn`` is false. ``1`` crops from the
            left margin to 49% of the page width; any other value crops
            from 51% of the page width to the right margin.
        dpi: Resolution handed to ``pdftocairo`` (``-r``).
        infile: Path of the input PDF.
        outfile: Path the cropped image is saved to.

    Raises:
        subprocess.CalledProcessError: If ``pdfseparate`` or ``pdftocairo``
            exits with a non-zero status.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    import tempfile

    # Diagnostic echo of the parameters (each one listed exactly once).
    print(dpi, margin_x, margin_y, upper_bound, lower_bound, page, onecolumn,
          column, infile, outfile)

    # Work in a private temporary directory: concurrent runs cannot clobber
    # each other's intermediate files, and everything is removed on exit
    # even if one of the steps below raises.
    with tempfile.TemporaryDirectory(prefix="extraction_") as workdir:
        single_page_pdf = os.path.join(workdir, "page.pdf")
        png_stem = os.path.join(workdir, "page")

        # Process pdf and create a PNG of the page that contains the figure.
        # check_call makes a failing tool stop the pipeline right away
        # instead of surfacing later as a missing-file error.
        subprocess.check_call(["pdfseparate",
                               "-f", str(page),    # First page
                               "-l", str(page),    # Last page
                               infile,             # Input file
                               single_page_pdf])   # Output file
        subprocess.check_call(["pdftocairo",
                               "-png",             # Output in cairo png
                               "-r", str(dpi),     # DPI
                               "-singlefile",      # Do not rename
                               single_page_pdf,    # Input file
                               png_stem])          # Output file (no suffix)

        # Load page in PIL and crop it. The handle is named `image` so that
        # it does not shadow the `page` argument.
        with Image.open(png_stem + ".png") as image:
            width, height = image.size
            usable_height = height * (1 - 2 * margin_y)

            def to_row(fraction):
                # Map a fraction of the usable height to a pixel row.
                return int(height * margin_y + usable_height * fraction)

            top = to_row(upper_bound)
            bottom = to_row(lower_bound)
            if top > bottom:
                # The signature defaults (1, 0) and swapped arguments
                # describe the same region upside down; order the rows so
                # the crop box is always well formed.
                top, bottom = bottom, top

            if onecolumn:
                left = int(width * margin_x)
                right = int(width * (1 - margin_x))
            elif column == 1:
                left = int(width * margin_x)
                right = int(width * 0.49)
            else:
                left = int(width * 0.51)
                right = int(width * (1 - margin_x))

            figure = image.crop((left, top, right, bottom))
            figure.save(outfile)
if __name__ == '__main__':
    # Command-line entry point: parse the options and forward them to
    # extract(). Help strings are built with implicit string concatenation
    # rather than backslash continuations, so no stray whitespace (or fused
    # words) ends up inside the text.
    prs = argparse.ArgumentParser(
        description='Crop parts of a PDF file and output to PNG.')
    prs.add_argument('--upper', '-u', type=float, default=0.0,
                     help='Upper bound of the crop box (passed as a fraction '
                          'of the page, excluding margin). The origin is '
                          'located in the top left hand corner of the page.')
    prs.add_argument('--lower', '-l', type=float, default=1.0,
                     help='Lower bound of the crop box (passed as a fraction '
                          'of the page, excluding margin). The origin is '
                          'located in the top left hand corner of the page.')
    prs.add_argument('--margin_x', '-mx', type=float, default=0.075,
                     help='Width of the margin (x)')
    prs.add_argument('--margin_y', '-my', type=float, default=0.055,
                     help='Height of the margin (y)')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='The cropped area spans two columns.')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index (if the crop box spans one column): '
                          '1 for the left column, any other value for the '
                          'right column.')
    prs.add_argument('--dpi', '-d', type=int, default=300,
                     help='Resolution of the image.')
    # Plain positionals: each parses to a single string, so no list
    # unwrapping is needed afterwards.
    prs.add_argument('infile', type=str,
                     help='Path to input pdf.')
    prs.add_argument('outfile', type=str,
                     help='Path to output PNG.')
    args = prs.parse_args()
    extract(upper_bound=args.upper,
            lower_bound=args.lower,
            margin_x=args.margin_x,
            margin_y=args.margin_y,
            page=args.page,
            onecolumn=args.onecolumn,
            column=args.column,
            dpi=args.dpi,
            infile=args.infile,
            outfile=args.outfile)
#!/usr/bin/env python3
# Author: Jean-Gabriel Young
# Email: jean.gabriel.young@gmail.com
# -*- coding: utf-8 -*-
import argparse
from extraction import extract
def simple_extract(anchor=0, page=1, onecolumn=False, column=1,
                   infile=None, outfile=None):
    """Call extract() with one of two fixed presets.

    The crop box starts at ``anchor`` and has a fixed height; the height,
    the horizontal margin and the resolution depend only on ``onecolumn``.

    Args:
        anchor: Upper bound of the crop box, forwarded as ``upper_bound``.
        page: Page number, forwarded unchanged.
        onecolumn: Selects the preset. When true, ``column`` is ignored and
            0 is forwarded in its place.
        column: Column index, forwarded only when ``onecolumn`` is false.
        infile: Path of the input PDF, forwarded unchanged.
        outfile: Path of the output image, forwarded unchanged.
    """
    # Per-layout presets: (box height, x margin, dpi, onecolumn, column).
    if onecolumn:
        box_height, side_margin, resolution = 0.0975, 0.063, 210
        full_width, column_index = True, 0
    else:
        box_height, side_margin, resolution = 0.0457, 0.0803, 448
        full_width, column_index = False, column

    # Rounded to 4 decimals to drop floating-point noise from the addition.
    bottom_edge = round(anchor + box_height, 4)

    extract(upper_bound=anchor,
            lower_bound=bottom_edge,
            margin_x=side_margin,
            page=page,
            onecolumn=full_width,
            column=column_index,
            dpi=resolution,
            infile=infile,
            outfile=outfile)
if __name__ == '__main__':
    # Command-line entry point: parse the options and forward them to
    # simple_extract().
    prs = argparse.ArgumentParser(description='Wrapper around extraction.py.')
    prs.add_argument('--anchor', '-a', type=float, default=0.0,
                     help='y anchor of the cropbox.')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='The cropped area spans two columns.')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index (if the crop box spans one column).')
    # Plain positionals: each parses to a single string, so no list
    # unwrapping is needed afterwards.
    prs.add_argument('infile', type=str,
                     help='Path to input pdf.')
    prs.add_argument('outfile', type=str,
                     help='Path to output PNG.')
    args = prs.parse_args()
    simple_extract(anchor=args.anchor,
                   page=args.page,
                   onecolumn=args.onecolumn,
                   column=args.column,
                   infile=args.infile,
                   outfile=args.outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.