Skip to content

Instantly share code, notes, and snippets.

@takuya
Created April 3, 2017 16:33
Show Gist options
  • Save takuya/40f436fd95a42e75b3480cd72f760964 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import urllib,urllib.request
import lxml.html
from io import StringIO, BytesIO
import json
import re
import subprocess
import os
import tempfile
def main(url=None, show_json=False, show_img_urls=False):
    """Download a SpeakerDeck presentation's slides and assemble them into a PDF.

    The PDF is written to the current working directory as
    "<title> - <author>.pdf" (slashes in either part replaced by '|').

    Parameters:
        url: SpeakerDeck presentation page URL.
        show_json: if True, print the oEmbed JSON metadata and return.
        show_img_urls: if True, print the raw "var talk" slide JSON and return.

    External tools required on PATH: xargs, curl, and ImageMagick's convert.
    """
    # Fetch the presentation page and locate its oEmbed metadata link.
    with urllib.request.urlopen(urllib.request.Request(url=url)) as resp:
        page = resp.read().decode('utf-8')
    doc = lxml.html.parse(StringIO(page))
    json_data_link = doc.xpath('//link[@rel="alternate"][@type="application/json+oembed"]')[0]

    # Fetch the oEmbed JSON: carries title, author and the embed iframe HTML.
    with urllib.request.urlopen(urllib.request.Request(url=json_data_link.attrib['href'])) as resp:
        obj = json.loads(resp.read().decode('utf-8'))
    if show_json:
        print(json.dumps(obj))
        return  # was exit(0): return so remaining URLs are still processed
    title = obj["title"]
    author = obj["author_name"]

    # The embed iframe points at the player page that carries the slide data
    # inline as a "var talk = {...};" script.
    iframe_doc = lxml.html.parse(StringIO(obj["html"]))
    player_url = 'http:' + iframe_doc.xpath('//iframe/@src')[0]
    with urllib.request.urlopen(urllib.request.Request(url=player_url)) as resp:
        player_html = resp.read().decode('utf-8')
    player_doc = lxml.html.parse(StringIO(player_html))
    script = player_doc.xpath('//head//script[contains(./text(), "var talk")]')[0]
    data = re.search(r'var talk\s?=\s?({.+});', script.text).group(1)
    if show_img_urls:
        print(data)
        return  # was exit(0): return so remaining URLs are still processed
    talk = json.loads(data)
    urls = [p['original'] for p in talk['slides']]

    # Download every slide image into a throwaway temp dir (5 parallel curls
    # driven through xargs), then stitch them into out.pdf with ImageMagick.
    old_wd = os.getcwd()
    os.chdir(tempfile.mkdtemp())
    print(os.getcwd())
    downloader = subprocess.Popen(["xargs", "-P5", "-I@", "curl", "-sLJO", "@"],
                                  stdin=subprocess.PIPE)
    for slide_url in urls:
        downloader.stdin.write(('%s\n' % slide_url).encode('utf8'))
    downloader.stdin.close()
    downloader.wait()
    # shell=True is needed here for the glob expansion of slide*.jpg.
    subprocess.Popen("convert slide*.jpg out.pdf", shell=True).wait()
    with open('out.pdf', 'rb') as pdf:
        content = pdf.read()
    subprocess.Popen("rm *.jpg *.pdf", shell=True).wait()

    # Leave and remove the (now empty) temp dir, then write the final PDF.
    tmp = os.getcwd()
    os.chdir(old_wd)
    os.removedirs(tmp)
    f_name = '%s - %s.pdf' % (title.replace("/", '|'), author.replace("/", '|'))
    with open(f_name, 'wb') as out:
        out.write(content)
if __name__ == '__main__':
    import argparse
    # NOTE: the short options were originally written as '-j ' / '-i ' with a
    # trailing space, which made the short flags unusable; the space is removed.
    parser = argparse.ArgumentParser(description=u'SpeakerDeckからデータ取ってくる')
    parser.add_argument('-j', '--json', action="store_true", help=u'jsonデータ取得')
    parser.add_argument('-i', '--imgs', action="store_true", help=u'スライド画像URL一覧')
    parser.add_argument('urls', metavar='URLs', type=str, nargs='+', help='対象の URL ')
    args = parser.parse_args()
    for url in args.urls:
        main(url, show_json=args.json, show_img_urls=args.imgs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment