jg-you/extraction.py

## extraction.py
#!/usr/bin/env python3
# Author: Jean-Gabriel Young
# Email: jean.gabriel.young@gmail.com
# -*- coding: utf-8 -*-
import argparse
import subprocess
import os
from PIL import Image


def extract(upper_bound=1, lower_bound=0, margin_x=0.075, margin_y=0.055,
            page=1, onecolumn=False, column=1, dpi=300,
            infile=None, outfile=None):
    """Crop a region of one PDF page and save it as an image.

    The requested page is isolated with ``pdfseparate``, rasterized to PNG
    with ``pdftocairo`` and then cropped with PIL.  Intermediate files live
    in a private temporary directory that is removed on exit, even when a
    step fails.

    Args:
        upper_bound: Top edge of the crop box, as a fraction of the page
            height left between the vertical margins (0 is the top margin).
        lower_bound: Bottom edge of the crop box, same convention.
        margin_x: Left/right margin, as a fraction of the page width.
        margin_y: Top/bottom margin, as a fraction of the page height.
        page: Page number handed to ``pdfseparate`` (``-f``/``-l``).
        onecolumn: If true, the crop box runs from the left margin to the
            right margin.
        column: Only used when ``onecolumn`` is false.  ``1`` selects the
            span from the left margin to 49% of the width; any other value
            selects the span from 51% of the width to the right margin.
        dpi: Resolution handed to ``pdftocairo`` (``-r``).
        infile: Path of the input PDF.
        outfile: Path the cropped image is saved to.

    Raises:
        subprocess.CalledProcessError: If ``pdfseparate`` or ``pdftocairo``
            exits with a non-zero status.

    NOTE(review): the signature defaults (``upper_bound=1``,
    ``lower_bound=0``) place the top edge below the bottom edge; the callers
    in this file always pass both bounds explicitly.
    """
    # Imported here so the function does not depend on an edit to the
    # module-level import list.
    import tempfile

    # Diagnostic trace.  `dpi` is printed twice and `column` not at all;
    # left untouched so the emitted line stays the same.
    print(dpi, margin_x, margin_y, upper_bound, lower_bound, page,
          onecolumn, dpi, infile, outfile)

    # A per-call directory avoids clashes between concurrent runs and makes
    # it impossible to pick up a stale PNG left behind by a failed run.
    with tempfile.TemporaryDirectory(prefix="extraction_") as workdir:
        page_pdf = os.path.join(workdir, "page.pdf")
        png_root = os.path.join(workdir, "page")

        # Isolate the page that contains the figure.
        subprocess.check_call(["pdfseparate",
                               "-f", str(page),    # First page
                               "-l", str(page),    # Last page
                               infile,             # Input file
                               page_pdf])          # Output file
        # Rasterize it; with -singlefile the output is exactly <root>.png.
        subprocess.check_call(["pdftocairo",
                               "-png",             # Output in cairo png
                               "-r", str(dpi),     # DPI
                               "-singlefile",      # Do not rename
                               page_pdf,           # Input file
                               png_root])          # Output file root

        with Image.open(png_root + ".png") as page_image:
            width, height = page_image.size
            usable_height = height * (1 - 2 * margin_y)

            def to_row(fraction):
                """Map a fraction of the usable height to a pixel row."""
                return int(height * margin_y + usable_height * fraction)

            if onecolumn:
                left = int(width * margin_x)
                right = int(width * (1 - margin_x))
            elif column == 1:
                left = int(width * margin_x)
                right = int(width * 0.49)
            else:
                left = int(width * 0.51)
                right = int(width * (1 - margin_x))

            # PIL crop boxes are (left, upper, right, lower).
            bounding_box = (left, to_row(upper_bound),
                            right, to_row(lower_bound))
            page_image.crop(bounding_box).save(outfile)


if __name__ == '__main__':
    # Command-line entry point: parse the crop options and run extract().
    # Help strings use implicit concatenation instead of backslash
    # continuations so no source indentation leaks into the literals.
    prs = argparse.ArgumentParser(
        description='Crop parts of a PDF file and output to PNG.')
    prs.add_argument('--upper', '-u', type=float, default=0.0,
                     help='Upper bound of the crop box, passed as a '
                          'fraction of the page height (excluding margins). '
                          'The origin is located in the top left hand '
                          'corner of the page.')
    prs.add_argument('--lower', '-l', type=float, default=1.0,
                     help='Lower bound of the crop box, passed as a '
                          'fraction of the page height (excluding margins). '
                          'The origin is located in the top left hand '
                          'corner of the page.')
    prs.add_argument('--margin_x', '-mx', type=float, default=0.075,
                     help='Width of the margin (x), as a fraction of the '
                          'page width.')
    prs.add_argument('--margin_y', '-my', type=float, default=0.055,
                     help='Height of the margin (y), as a fraction of the '
                          'page height.')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='The cropped area spans the full page width '
                          '(margin to margin).')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index when the crop box spans a single '
                          'column: 1 for the left column, any other value '
                          'for the right column.')
    prs.add_argument('--dpi', '-d', type=int, default=300,
                     help='Resolution of the image.')
    prs.add_argument('infile', type=str, nargs=1,
                     help='Path to input pdf.')
    prs.add_argument('outfile', type=str, nargs=1,
                     help='Path to output image (png).')
    args = prs.parse_args()
    # nargs=1 yields one-element lists for the positionals, hence the [0].
    extract(upper_bound=args.upper,
            lower_bound=args.lower,
            margin_x=args.margin_x,
            margin_y=args.margin_y,
            page=args.page,
            onecolumn=args.onecolumn,
            column=args.column,
            dpi=args.dpi,
            infile=args.infile[0],
            outfile=args.outfile[0])

## simple_extraction.py
#!/usr/bin/env python3
# Author: Jean-Gabriel Young
# Email: jean.gabriel.young@gmail.com
# -*- coding: utf-8 -*-
import argparse
from extraction import extract


def simple_extract(anchor=0, page=1, onecolumn=False, column=1,
                   infile=None, outfile=None):
    """Crop a fixed-height strip that starts at ``anchor``.

    Thin wrapper around ``extract`` that selects one of two hard-coded
    presets (strip height, horizontal margin and resolution) depending on
    whether the strip spans the whole page width or a single column.

    Args:
        anchor: Upper bound of the strip, forwarded as ``upper_bound``.
        page: Page number, forwarded unchanged.
        onecolumn: Selects the full-width preset when true, otherwise the
            single-column preset.
        column: Column index, forwarded only in the single-column case.
        infile: Path of the input PDF.
        outfile: Path of the output image.
    """
    if onecolumn:
        # Full-width preset; the column index is passed as 0 in this case.
        strip_height, side_margin, resolution = 0.0975, 0.063, 210
        full_width, column_index = True, 0
    else:
        # Single-column preset.
        strip_height, side_margin, resolution = 0.0457, 0.0803, 448
        full_width, column_index = False, column

    # The lower bound is rounded to 4 decimals before being handed over.
    strip_bottom = round(anchor + strip_height, 4)
    extract(upper_bound=anchor,
            lower_bound=strip_bottom,
            margin_x=side_margin,
            page=page,
            onecolumn=full_width,
            column=column_index,
            dpi=resolution,
            infile=infile,
            outfile=outfile)


if __name__ == '__main__':
    # Command-line entry point: parse the options and run simple_extract().
    prs = argparse.ArgumentParser(description='Wrapper around extraction.py.')
    prs.add_argument('--anchor', '-a', type=float, default=0.0,
                     help='y anchor (upper bound) of the cropbox.')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='The cropped area spans the full page width '
                          '(margin to margin).')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index when the crop box spans a single '
                          'column: 1 for the left column, any other value '
                          'for the right column.')
    prs.add_argument('infile', type=str, nargs=1,
                     help='Path to input pdf.')
    prs.add_argument('outfile', type=str, nargs=1,
                     help='Path to output image (png).')
    args = prs.parse_args()
    # nargs=1 yields one-element lists for the positionals, hence the [0].
    simple_extract(anchor=args.anchor,
                   page=args.page,
                   onecolumn=args.onecolumn,
                   column=args.column,
                   infile=args.infile[0],
                   outfile=args.outfile[0])
	#!/usr/bin/env python3
	# Author: Jean-Gabriel Young
	# Email: jean.gabriel.young@gmail.com
	# -*- coding: utf-8 -*-
	import argparse
	import subprocess
	import os
	from PIL import Image


	def extract(upper_bound=1, lower_bound=0, margin_x=0.075, margin_y=0.055,
	            page=1, onecolumn=False, column=1, dpi=300,
	            infile=None, outfile=None):
	    """Crop a region of one PDF page and save it as an image.

	    The requested page is isolated with ``pdfseparate``, rasterized to
	    PNG with ``pdftocairo`` and then cropped with PIL.  Intermediate
	    files live in a private temporary directory that is removed on exit,
	    even when a step fails.

	    Args:
	        upper_bound: Top edge of the crop box, as a fraction of the page
	            height left between the vertical margins (0 is the top
	            margin).
	        lower_bound: Bottom edge of the crop box, same convention.
	        margin_x: Left/right margin, as a fraction of the page width.
	        margin_y: Top/bottom margin, as a fraction of the page height.
	        page: Page number handed to ``pdfseparate`` (``-f``/``-l``).
	        onecolumn: If true, the crop box runs from the left margin to
	            the right margin.
	        column: Only used when ``onecolumn`` is false.  ``1`` selects the
	            span from the left margin to 49% of the width; any other
	            value selects the span from 51% of the width to the right
	            margin.
	        dpi: Resolution handed to ``pdftocairo`` (``-r``).
	        infile: Path of the input PDF.
	        outfile: Path the cropped image is saved to.

	    Raises:
	        subprocess.CalledProcessError: If ``pdfseparate`` or
	            ``pdftocairo`` exits with a non-zero status.

	    NOTE(review): the signature defaults (``upper_bound=1``,
	    ``lower_bound=0``) place the top edge below the bottom edge.
	    """
	    # Imported here so the function does not depend on an edit to the
	    # module-level import list.
	    import tempfile

	    # Diagnostic trace.  `dpi` is printed twice and `column` not at all;
	    # left untouched so the emitted line stays the same.
	    print(dpi, margin_x, margin_y, upper_bound, lower_bound, page,
	          onecolumn, dpi, infile, outfile)

	    # A per-call directory avoids clashes between concurrent runs and
	    # makes it impossible to pick up a stale PNG from a failed run.
	    with tempfile.TemporaryDirectory(prefix="extraction_") as workdir:
	        page_pdf = os.path.join(workdir, "page.pdf")
	        png_root = os.path.join(workdir, "page")

	        # Isolate the page that contains the figure.
	        subprocess.check_call(["pdfseparate",
	                               "-f", str(page),    # First page
	                               "-l", str(page),    # Last page
	                               infile,             # Input file
	                               page_pdf])          # Output file
	        # Rasterize it; with -singlefile the output is <root>.png.
	        subprocess.check_call(["pdftocairo",
	                               "-png",             # Output in cairo png
	                               "-r", str(dpi),     # DPI
	                               "-singlefile",      # Do not rename
	                               page_pdf,           # Input file
	                               png_root])          # Output file root

	        with Image.open(png_root + ".png") as page_image:
	            width, height = page_image.size
	            usable_height = height * (1 - 2 * margin_y)

	            def to_row(fraction):
	                """Map a fraction of the usable height to a pixel row."""
	                return int(height * margin_y + usable_height * fraction)

	            if onecolumn:
	                left = int(width * margin_x)
	                right = int(width * (1 - margin_x))
	            elif column == 1:
	                left = int(width * margin_x)
	                right = int(width * 0.49)
	            else:
	                left = int(width * 0.51)
	                right = int(width * (1 - margin_x))

	            # PIL crop boxes are (left, upper, right, lower).
	            bounding_box = (left, to_row(upper_bound),
	                            right, to_row(lower_bound))
	            page_image.crop(bounding_box).save(outfile)


	if __name__ == '__main__':
	    # Command-line entry point: parse the crop options and run extract().
	    # Help strings use implicit concatenation instead of backslash
	    # continuations so no source indentation leaks into the literals.
	    prs = argparse.ArgumentParser(
	        description='Crop parts of a PDF file and output to PNG.')
	    prs.add_argument('--upper', '-u', type=float, default=0.0,
	                     help='Upper bound of the crop box, passed as a '
	                          'fraction of the page height (excluding '
	                          'margins). The origin is located in the top '
	                          'left hand corner of the page.')
	    prs.add_argument('--lower', '-l', type=float, default=1.0,
	                     help='Lower bound of the crop box, passed as a '
	                          'fraction of the page height (excluding '
	                          'margins). The origin is located in the top '
	                          'left hand corner of the page.')
	    prs.add_argument('--margin_x', '-mx', type=float, default=0.075,
	                     help='Width of the margin (x), as a fraction of the '
	                          'page width.')
	    prs.add_argument('--margin_y', '-my', type=float, default=0.055,
	                     help='Height of the margin (y), as a fraction of '
	                          'the page height.')
	    prs.add_argument('--page', '-p', type=int, default=1,
	                     help='Page of the pdf.')
	    prs.add_argument('--onecolumn', '-t', action='store_true',
	                     help='The cropped area spans the full page width '
	                          '(margin to margin).')
	    prs.add_argument('--column', '-c', type=int, default=1,
	                     help='Column index when the crop box spans a single '
	                          'column: 1 for the left column, any other '
	                          'value for the right column.')
	    prs.add_argument('--dpi', '-d', type=int, default=300,
	                     help='Resolution of the image.')
	    prs.add_argument('infile', type=str, nargs=1,
	                     help='Path to input pdf.')
	    prs.add_argument('outfile', type=str, nargs=1,
	                     help='Path to output image (png).')
	    args = prs.parse_args()
	    # nargs=1 yields one-element lists for the positionals, hence [0].
	    extract(upper_bound=args.upper,
	            lower_bound=args.lower,
	            margin_x=args.margin_x,
	            margin_y=args.margin_y,
	            page=args.page,
	            onecolumn=args.onecolumn,
	            column=args.column,
	            dpi=args.dpi,
	            infile=args.infile[0],
	            outfile=args.outfile[0])