DiKorsch/convert_synced_pdf_to_images.py

## convert_synced_pdf_to_images.py
from converter import extract
from os import path
import argparse, simplejson as json

def generateJson(lectureId, outdir, images):
  """Write ``slides.json`` describing the synced slides of a lecture.

  Reads ``<outdir>/sync``; every non-blank line must hold a slide number and
  a start time separated by whitespace. One JSON entry is emitted per line,
  in file order.

  Args:
    lectureId: value stored under the "lectureId" key of the JSON document.
    outdir: directory that contains the "sync" file; "slides.json" is
      written into the same directory.
    images: mapping from slide number (int) to the image path of that slide.

  Returns:
    The path of the written JSON file.

  Raises:
    KeyError: if the sync file references a slide number missing in images.
    ValueError: if a non-blank line does not consist of exactly two fields
      or its slide number is not an integer.
  """
  if not outdir.endswith("/"): outdir += "/"
  sync_file = path.join(outdir, "sync")
  json_obj = {"lectureId": lectureId, "images": []}
  with open(sync_file) as f:
    for line in f:
      fields = line.split()
      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
      if not fields: continue
      slide_num, time = fields
      slide_num = int(slide_num)
      if slide_num not in images:
        raise KeyError("sync file references unknown slide {}".format(slide_num))
      img_path = images[slide_num]
      # Strip outdir only as a leading prefix. A plain str.replace would also
      # remove later occurrences, e.g. outdir "slides/" would turn
      # "slides/slides/1.jpg" into "1.jpg".
      if img_path.startswith(outdir):
        img_path = img_path[len(outdir):]
      json_obj["images"].append({
        "path": img_path,
        "start": time, "unique": True, "textlines": []
      })
  json_file = path.join(outdir, "slides.json")
  # Use a context manager so the output is flushed and closed deterministically.
  with open(json_file, "w") as out:
    json.dump(json_obj, out, indent = 2)

  return json_file

if __name__ == '__main__':
  # Command-line entry point: render every page of the PDF to an image, then
  # write the JSON description of the synced slides into the output directory.
  parser = argparse.ArgumentParser(description='this extracts from the given PDF the slides')
  parser.add_argument('-p', '--pdf',    type=str, required = True, help='slides in PDF format')
  parser.add_argument('-o', '--output', type=str, required = True, help='output directory')
  parser.add_argument('--id', type=str, required = True, help='ID of the slide. needed for JSON generation')
  parser.add_argument('-s', '--size',   type=str, default='1280x720', help='size of the extracted images. format: "WIDTHxHEIGHT"')
  parser.add_argument('--overwrite',    action='store_true')
  args = parser.parse_args()

  # extract returns a {slide number: image path} mapping consumed by generateJson.
  images = extract(args.pdf, args.size, args.overwrite, args.output)
  json_file = generateJson(args.id, args.output, images)

  # Parenthesised single-argument form prints identically on Python 2 and 3.
  print("slide info saved under \"{}\"".format(json_file))

## converter.py
import os, PyPDF2, re
from os import path
from wand.image import Image


def getPageNames(pdf_name):
  """Return one page specifier per page of the given PDF.

  Each specifier has the form ``"<pdf_name>[<index>]"`` with a zero-based
  page index; ``extract`` hands these strings to wand's ``Image`` as the
  filename to load.

  Args:
    pdf_name: path of the PDF file.

  Returns:
    A list of page specifier strings, in page order.
  """
  # NOTE(review): recent PyPDF2 releases renamed PdfFileReader/numPages;
  # confirm which PyPDF2 version this project is pinned to.
  # The page count is read while the file is still open, then the handle is
  # closed instead of being leaked.
  with open(pdf_name, "rb") as pdf_file:
    page_count = PyPDF2.PdfFileReader(pdf_file).numPages
  return ["{}[{}]".format(pdf_name, page) for page in range(page_count)]

def parseSize(size_as_string):
  """Parse a "WIDTHxHEIGHT" string into ``[width, height]`` integers.

  Only the start of the string has to match, so trailing characters after
  the height are ignored.

  Args:
    size_as_string: size specification such as "1280x720".

  Returns:
    A two-element list ``[width, height]``.

  Raises:
    ValueError: if the string does not start with "<digits>x<digits>".
  """
  match = re.match(r"(\d+)x(\d+)", size_as_string)
  if match is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError('invalid size "{}", expected format "WIDTHxHEIGHT"'.format(size_as_string))
  return [int(value) for value in match.groups()]

def adjust_size(img, width, height):
  """Return the (width, height) to resize img to without distorting it.

  The requested width is always kept. When the aspect ratio of img differs
  from the requested one, the height is recomputed from the width and the
  image's own ratio (truncated to an int); otherwise the requested height
  is returned unchanged.
  """
  source_ratio = img.width / float(img.height)
  target_ratio = width / float(height)
  if source_ratio == target_ratio:
    return width, height
  return width, int(width / source_ratio)

def extract(fpath, size_as_string, overwrite = False, outdir = None):
  """Render every page of a PDF to ``<base>/slides/<n>.jpg``.

  ``<base>`` is outdir when given, otherwise the directory of the PDF.
  Pages are numbered starting at 1.

  Args:
    fpath: path of the PDF file.
    size_as_string: target size in the form "WIDTHxHEIGHT".
    overwrite: when False, pages whose image file already exists are not
      rendered again (they are still part of the result).
    outdir: optional base directory for the "slides" folder.

  Returns:
    A dict mapping the 1-based page number to the image path of that page.
  """
  width, height = parseSize(size_as_string)
  # The target folder is the same for all pages, so compute it once.
  slides_dir = path.join(outdir or path.dirname(fpath), "slides")
  images = {}
  for idx, pageName in enumerate(getPageNames(fpath), 1):
    page_img_name = path.join(slides_dir, "{}.jpg".format(idx))
    images[idx] = page_img_name
    if not overwrite and path.isfile(page_img_name): continue
    if not path.isdir(slides_dir):
      # makedirs also creates a missing output directory; mkdir would fail
      # when the parent of "slides" does not exist yet.
      os.makedirs(slides_dir)
    with Image(filename = pageName, resolution=200) as img:
      img.compression_quality = 99
      img.alpha_channel=False
      # Keep the requested width; the height follows the page's aspect ratio.
      img.resize(*adjust_size(img, width, height))
      img.save(filename = page_img_name)
  return images

## requirements.txt
Wand
PyPDF2
argparse
simplejson
	from converter import extract
	from os import path
	import argparse, simplejson as json

	def generateJson(lectureId, outdir, images):
	  """Write ``slides.json`` describing the synced slides of a lecture.

	  Reads ``<outdir>/sync``; every non-blank line must hold a slide number and
	  a start time separated by whitespace. One JSON entry is emitted per line,
	  in file order.

	  Args:
	    lectureId: value stored under the "lectureId" key of the JSON document.
	    outdir: directory that contains the "sync" file; "slides.json" is
	      written into the same directory.
	    images: mapping from slide number (int) to the image path of that slide.

	  Returns:
	    The path of the written JSON file.

	  Raises:
	    KeyError: if the sync file references a slide number missing in images.
	    ValueError: if a non-blank line does not consist of exactly two fields
	      or its slide number is not an integer.
	  """
	  if not outdir.endswith("/"): outdir += "/"
	  sync_file = path.join(outdir, "sync")
	  json_obj = {"lectureId": lectureId, "images": []}
	  with open(sync_file) as f:
	    for line in f:
	      fields = line.split()
	      # Tolerate blank lines (e.g. a trailing newline at the end of the file).
	      if not fields: continue
	      slide_num, time = fields
	      slide_num = int(slide_num)
	      if slide_num not in images:
	        raise KeyError("sync file references unknown slide {}".format(slide_num))
	      img_path = images[slide_num]
	      # Strip outdir only as a leading prefix. A plain str.replace would also
	      # remove later occurrences, e.g. outdir "slides/" would turn
	      # "slides/slides/1.jpg" into "1.jpg".
	      if img_path.startswith(outdir):
	        img_path = img_path[len(outdir):]
	      json_obj["images"].append({
	        "path": img_path,
	        "start": time, "unique": True, "textlines": []
	      })
	  json_file = path.join(outdir, "slides.json")
	  # Use a context manager so the output is flushed and closed deterministically.
	  with open(json_file, "w") as out:
	    json.dump(json_obj, out, indent = 2)

	  return json_file

	if __name__ == '__main__':
	  # Command-line entry point: render every page of the PDF to an image, then
	  # write the JSON description of the synced slides into the output directory.
	  parser = argparse.ArgumentParser(description='this extracts from the given PDF the slides')
	  parser.add_argument('-p', '--pdf', type=str, required = True, help='slides in PDF format')
	  parser.add_argument('-o', '--output', type=str, required = True, help='output directory')
	  parser.add_argument('--id', type=str, required = True, help='ID of the slide. needed for JSON generation')
	  parser.add_argument('-s', '--size', type=str, default='1280x720', help='size of the extracted images. format: "WIDTHxHEIGHT"')
	  parser.add_argument('--overwrite', action='store_true')
	  args = parser.parse_args()

	  # extract returns a {slide number: image path} mapping consumed by generateJson.
	  images = extract(args.pdf, args.size, args.overwrite, args.output)
	  json_file = generateJson(args.id, args.output, images)

	  # Parenthesised single-argument form prints identically on Python 2 and 3.
	  print("slide info saved under \"{}\"".format(json_file))
	import os, PyPDF2, re
	from os import path
	from wand.image import Image


	def getPageNames(pdf_name):
	  """Return one page specifier per page of the given PDF.

	  Each specifier has the form ``"<pdf_name>[<index>]"`` with a zero-based
	  page index; ``extract`` hands these strings to wand's ``Image`` as the
	  filename to load.

	  Args:
	    pdf_name: path of the PDF file.

	  Returns:
	    A list of page specifier strings, in page order.
	  """
	  # NOTE(review): recent PyPDF2 releases renamed PdfFileReader/numPages;
	  # confirm which PyPDF2 version this project is pinned to.
	  # The page count is read while the file is still open, then the handle is
	  # closed instead of being leaked.
	  with open(pdf_name, "rb") as pdf_file:
	    page_count = PyPDF2.PdfFileReader(pdf_file).numPages
	  return ["{}[{}]".format(pdf_name, page) for page in range(page_count)]

	def parseSize(size_as_string):
	  """Parse a "WIDTHxHEIGHT" string into ``[width, height]`` integers.

	  Only the start of the string has to match, so trailing characters after
	  the height are ignored.

	  Args:
	    size_as_string: size specification such as "1280x720".

	  Returns:
	    A two-element list ``[width, height]``.

	  Raises:
	    ValueError: if the string does not start with "<digits>x<digits>".
	  """
	  match = re.match(r"(\d+)x(\d+)", size_as_string)
	  if match is None:
	    # Fail with a clear message instead of an AttributeError on None.
	    raise ValueError('invalid size "{}", expected format "WIDTHxHEIGHT"'.format(size_as_string))
	  return [int(value) for value in match.groups()]

	def adjust_size(img, width, height):
	  """Return the (width, height) to resize img to without distorting it.

	  The requested width is always kept. When the aspect ratio of img differs
	  from the requested one, the height is recomputed from the width and the
	  image's own ratio (truncated to an int); otherwise the requested height
	  is returned unchanged.
	  """
	  img_ratio = float(img.width) / img.height
	  if img_ratio != float(width) / height:
	    height = int(width / img_ratio)

	  return width, height

	def extract(fpath, size_as_string, overwrite = False, outdir = None):
	  """Render every page of a PDF to ``<base>/slides/<n>.jpg``.

	  ``<base>`` is outdir when given, otherwise the directory of the PDF.
	  Pages are numbered starting at 1.

	  Args:
	    fpath: path of the PDF file.
	    size_as_string: target size in the form "WIDTHxHEIGHT".
	    overwrite: when False, pages whose image file already exists are not
	      rendered again (they are still part of the result).
	    outdir: optional base directory for the "slides" folder.

	  Returns:
	    A dict mapping the 1-based page number to the image path of that page.
	  """
	  width, height = parseSize(size_as_string)
	  # The target folder is the same for all pages, so compute it once.
	  slides_dir = path.join(outdir or path.dirname(fpath), "slides")
	  images = {}
	  for idx, pageName in enumerate(getPageNames(fpath), 1):
	    page_img_name = path.join(slides_dir, "{}.jpg".format(idx))
	    images[idx] = page_img_name
	    if not overwrite and path.isfile(page_img_name): continue
	    if not path.isdir(slides_dir):
	      # makedirs also creates a missing output directory; mkdir would fail
	      # when the parent of "slides" does not exist yet.
	      os.makedirs(slides_dir)
	    with Image(filename = pageName, resolution=200) as img:
	      img.compression_quality = 99
	      img.alpha_channel=False
	      # Keep the requested width; the height follows the page's aspect ratio.
	      img.resize(*adjust_size(img, width, height))
	      img.save(filename = page_img_name)
	  return images