Skip to content

Instantly share code, notes, and snippets.

@DiKorsch
Last active June 8, 2016 12:31
Show Gist options
  • Save DiKorsch/35d8366790c7af77dfc16e8b5066b35c to your computer and use it in GitHub Desktop.
Save DiKorsch/35d8366790c7af77dfc16e8b5066b35c to your computer and use it in GitHub Desktop.
from converter import extract
from os import path
import argparse, simplejson as json
def generateJson(lectureId, outdir, images):
    """Write ``slides.json`` into *outdir* from the ``sync`` file found there.

    Every non-blank line of ``<outdir>/sync`` must consist of two
    whitespace-separated fields: a slide number and a start time. For each
    such line one entry is appended to the ``"images"`` list of the JSON
    document, holding the slide's image path (with the leading *outdir*
    stripped), the start time exactly as written in the sync file, and the
    constant fields ``"unique": True`` and ``"textlines": []``.

    Args:
        lectureId: value stored under the ``"lectureId"`` key.
        outdir: directory that contains ``sync`` and receives ``slides.json``.
        images: mapping of integer slide number to image path; only its
            ``get`` method is used.

    Returns:
        The path of the written ``slides.json`` file.

    Raises:
        KeyError: if the sync file names a slide that *images* does not hold.
        ValueError: if a non-blank sync line does not have exactly two
            fields or its slide number is not an integer.
    """
    if not outdir.endswith("/"):
        outdir += "/"
    sync_file = path.join(outdir, "sync")
    json_obj = {"lectureId": lectureId, "images": []}

    with open(sync_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at the end).
                continue
            slide_num, start = fields
            slide_id = int(slide_num)

            image_path = images.get(slide_id)
            if image_path is None:
                raise KeyError(
                    "sync file \"{}\" references slide {} which has no image".format(
                        sync_file, slide_id))

            # Strip only the leading outdir. str.replace would also remove
            # later occurrences, e.g. outdir "slides/" inside
            # "slides/slides/1.jpg".
            if image_path.startswith(outdir):
                image_path = image_path[len(outdir):]

            json_obj["images"].append({
                "path": image_path,
                "start": start, "unique": True, "textlines": []
            })

    json_file = path.join(outdir, "slides.json")
    # Context manager so the file is flushed and closed before returning.
    with open(json_file, "w") as out:
        json.dump(json_obj, out, indent=2)
    return json_file
if __name__ == '__main__':
    # Command-line entry point: render the pages of the given PDF to images
    # with extract(), then let generateJson() write the slides.json that pairs
    # those images with the timings listed in <output>/sync.
    parser = argparse.ArgumentParser(description='this extracts from the given PDF the slides')
    parser.add_argument('-p', '--pdf', type=str, required = True, help='slides in PDF format')
    parser.add_argument('-o', '--output', type=str, required = True, help='output directory')
    parser.add_argument('--id', type=str, required = True, help='ID of the slide. needed for JSON generation')
    parser.add_argument('-s', '--size', type=str, default='1280x720', help='size of the extracted images. format: "WIDTHxHEIGHT"')
    # Without --overwrite, images that already exist on disk are kept as-is.
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    images = extract(args.pdf, args.size, args.overwrite, args.output)
    json_file = generateJson(args.id, args.output, images)
    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("slide info saved under \"{}\"".format(json_file))
import os, PyPDF2, re
from os import path
from wand.image import Image
def getPageNames(pdf_name):
    """Return one page specifier per page of the PDF at *pdf_name*.

    Each specifier has the form ``"<pdf_name>[<index>]"`` with a zero-based
    page index, in page order. (Presumably this is ImageMagick's
    page-selection syntax, since the result is passed to ``wand`` as a
    filename; confirm against the Wand documentation.)

    Args:
        pdf_name: path of the PDF file to inspect.

    Returns:
        A list of page specifier strings; its length is the PDF's page count.
    """
    # Read the page count while the file is still open, then release the
    # handle instead of leaving it to the garbage collector.
    with open(pdf_name, "rb") as pdf_file:
        page_count = PyPDF2.PdfFileReader(pdf_file).numPages
    return ["{}[{}]".format(pdf_name, page) for page in range(page_count)]
def parseSize(size_as_string):
    """Parse a ``"WIDTHxHEIGHT"`` string into ``[width, height]`` integers.

    The string only has to *start* with ``<digits>x<digits>``; anything after
    that is ignored, as before.

    Args:
        size_as_string: size specification such as ``"1280x720"``.

    Returns:
        A two-element list ``[width, height]`` of ints.

    Raises:
        ValueError: if the string does not start with ``<digits>x<digits>``.
    """
    match = re.match(r"(\d+)x(\d+)", size_as_string)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            "invalid size \"{}\", expected format \"WIDTHxHEIGHT\"".format(
                size_as_string))
    return [int(group) for group in match.groups()]
def adjust_size(img, width, height):
    """Return the ``(width, height)`` to resize *img* to.

    The requested *width* is always kept. The requested *height* is kept only
    when the requested aspect ratio equals the image's own; otherwise the
    height is recomputed from *width* and the image's aspect ratio, truncated
    to an int.

    Args:
        img: object exposing numeric ``width`` and ``height`` attributes.
        width: requested output width.
        height: requested output height.

    Returns:
        A ``(width, height)`` tuple.
    """
    source_aspect = float(img.width) / img.height
    requested_aspect = float(width) / height
    if source_aspect == requested_aspect:
        return width, height
    return width, int(width / source_aspect)
def extract(fpath, size_as_string, overwrite = False, outdir = None):
    """Render every page of the PDF at *fpath* to a JPEG and return the paths.

    Images are written to ``<base>/slides/<n>.jpg``, where ``<base>`` is
    *outdir* if given (and non-empty), otherwise the directory of *fpath*,
    and ``<n>`` is the 1-based page number.

    Args:
        fpath: path of the PDF file.
        size_as_string: target size in ``"WIDTHxHEIGHT"`` form; the height
            actually used is decided by ``adjust_size``.
        overwrite: when false, pages whose image file already exists are not
            rendered again.
        outdir: optional base directory for the ``slides`` folder.

    Returns:
        A dict mapping 1-based page number to image path. It contains every
        page, including those skipped because the image already existed.
    """
    width, height = parseSize(size_as_string)
    slides_dir = path.join(outdir or path.dirname(fpath), "slides")

    images = {}
    for idx, pageName in enumerate(getPageNames(fpath), 1):
        page_img_name = path.join(slides_dir, "{}.jpg".format(idx))
        images[idx] = page_img_name
        if not overwrite and path.isfile(page_img_name):
            continue
        # makedirs rather than mkdir: the base directory itself may not exist
        # yet, and mkdir cannot create intermediate directories.
        if not path.isdir(slides_dir):
            os.makedirs(slides_dir)
        with Image(filename = pageName, resolution=200) as img:
            img.compression_quality = 99
            img.alpha_channel = False
            img.resize(*adjust_size(img, width, height))
            img.save(filename = page_img_name)
    return images
Wand
PyPDF2
argparse
simplejson
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment