zerok/djangoconeu-extractor.py

## djangoconeu-extractor.py
import html5lib
import shutil
import pathlib
import json
import collections
import requests
import re
import sys
from icalendar.cal import Calendar

from slugify import slugify

# Finds the Wistia JSONP media reference inside a video page; group 1 is the
# media id. The host dots are escaped so that only the literal host matches.
WISTIA_RE = re.compile(r'//fast\.wistia\.com/embed/medias/(.+)\.jsonp')


# Output tree written by this script.
root_folder = pathlib.Path('data') / 'djangocon-eu-2016'
videos_folder = root_folder / 'videos'

# Local copies of the three remote documents the script reads.
talks_file = pathlib.Path('djangocon-europe-2016.html')
ical_file = pathlib.Path('djangocon-europe-2016.ics')
schedule_file = pathlib.Path('djangocon-europe-2016-schedule.html')


def _download_if_missing(target, url):
    """Fetch *url* into *target* unless a local copy already exists."""
    if target.exists():
        return
    response = requests.get(url)
    # Fail loudly rather than storing an HTTP error page as if it were data.
    response.raise_for_status()
    target.write_text(response.text)


_download_if_missing(talks_file, 'http://opbeat.com/community/events/djangocon-europe-2016/')
_download_if_missing(ical_file, 'https://2016.djangocon.eu/schedule/ical/')
_download_if_missing(schedule_file, 'https://2016.djangocon.eu/schedule/')

# Every run regenerates the output tree from scratch.
shutil.rmtree(str(root_folder), ignore_errors=True)

# parents=True: the enclosing 'data' directory may not exist yet.
root_folder.mkdir(parents=True)
videos_folder.mkdir()


def get_embed_url(absolute_url):
    """Return the Wistia iframe embed URL for the page at *absolute_url*.

    The page body is cached in ``/tmp`` under the slugified URL, so later
    calls for the same page read the cached copy instead of the network.

    Returns ``None`` when no line of the page matches ``WISTIA_RE``.
    """
    tmp_file = pathlib.Path('/tmp') / slugify(absolute_url)
    if not tmp_file.exists():
        # Fetch before creating the cache file: opening it first would leave
        # an empty file behind if the request failed, and every later run
        # would then treat that empty file as a valid cache entry.
        page = requests.get(absolute_url).text
        with open(str(tmp_file), 'w') as fp:
            fp.write(page)
    with open(str(tmp_file)) as fp:
        for line in fp:
            mo = WISTIA_RE.search(line)
            if mo is not None:
                return 'https://fast.wistia.com/embed/iframe/{}'.format(mo.group(1))
    return None

# talk_info: key -> {'title': ..., 'start': ...}, filled from the calendar.
# talk_urls: key -> speaker page URL, filled from the schedule page.
talk_info, talk_urls = {}, {}

# Characters that mkkey() removes: the space and both quote characters.
_KEY_NOISE = frozenset(' "\'')


def mkkey(s):
    """Return *s* with every space, double quote and single quote removed."""
    return ''.join(ch for ch in s if ch not in _KEY_NOISE)


# Fill talk_urls: every schedule link pointing at a '/speaker...' path is
# recorded under the normalized link text.
with open(str(schedule_file)) as fp:
    doc = html5lib.parse(fp.read())

for link in doc.findall('.//{http://www.w3.org/1999/xhtml}a'):
    target = link.attrib.get('href', '')
    title = link.text
    # Element.text is None when the anchor has no text before its first child
    # (e.g. a link wrapping an image). Such a link cannot be keyed, so skip it
    # instead of crashing inside mkkey().
    if title is None or not target.startswith('/speaker'):
        continue
    talk_urls[mkkey(title)] = 'https://2016.djangocon.eu{}'.format(target)

# Day-of-month of an event's start -> day number used in the lightning-talk
# key below.
lt_days = {30: 1, 31: 2, 1: 3}

with open(str(ical_file)) as fp:
    cal = Calendar.from_ical(fp.read())

# Fill talk_info with the title and start time of every calendar entry.
for evt in cal.subcomponents:
    summary = evt['SUMMARY']
    # When the summary contains ' - ', everything before the first one is
    # dropped (presumably a speaker prefix -- TODO confirm against the feed).
    pieces = summary.split(' - ')
    title = ' - '.join(pieces[1:]).strip() if len(pieces) > 1 else pieces[0]
    if not title:
        continue
    start = evt['DTSTART'].dt
    if 'Lightning talks' in title:
        # These sessions get a per-day key instead of a title-derived one.
        key = 'Lightningtalks,day{}'.format(lt_days[start.date().day])
    else:
        key = mkkey(title)

    talk_info[key] = {'title': title, 'start': start}


# Conference-level metadata, written as category.json in the output root.
category = collections.OrderedDict()
category['description'] = ''
category['start_date'] = '2016-03-30'
category['title'] = 'DjangoCon Europe 2016'
category['url'] = 'https://2016.djangocon.eu/'

with open(str(root_folder / 'category.json'), 'w+') as fp:
    json.dump(category, fp, indent=2)


def _split_title_and_speakers(heading):
    """Split a ``'<title> by <speakers>'`` heading into ``(title, speakers)``.

    The text after the last ``' by '`` is the speaker list, split on ``' & '``;
    everything before it is the title. ``speakers`` is always a list and is
    empty when the heading has no ``' by '`` or nothing follows it.
    """
    parts = heading.split(' by ')
    if len(parts) == 1:
        return heading, []
    names = parts[-1]
    speakers = names.split(' & ') if names else []
    return ' by '.join(parts[:-1]), speakers


# Write one JSON file per video listed on the talks page, combining the page
# data with the calendar (talk_info) and schedule (talk_urls) lookups.
with open(str(talks_file)) as fp:
    doc = html5lib.parse(fp)

video_list = doc.find('.//*[@class="video-list"]')
for video in video_list.findall('.//*[@class="post video"]'):
    heading = video.find('{http://www.w3.org/1999/xhtml}h3').text
    link = video.find('{http://www.w3.org/1999/xhtml}a').attrib['href']
    absolute_url = 'http://opbeat.com{}'.format(link)
    title, speakers = _split_title_and_speakers(heading)

    key = mkkey(title)
    # A KeyError here means the video title has no matching calendar entry.
    info = talk_info[key]
    recorded = info['start'].date().isoformat()
    talk_url = talk_urls.get(key)

    slug = slugify(title)

    embed_url = get_embed_url(absolute_url)
    if embed_url is None:
        raise Exception("No embed_url found for {}".format(slug))

    description = "This video is hosted by `opbeat.com <{}>`_.".format(absolute_url)

    if talk_url:
        description = "You can find more about this talk on `djangocon.eu <{}>`_. {}".format(talk_url, description)

    video_file = videos_folder / '{}.json'.format(slug)
    # Own handle name so the talks-page handle `fp` is never rebound.
    with open(str(video_file), 'w+') as out:
        json.dump(collections.OrderedDict([
            ('description', description),
            ('language', 'eng'),
            ('recorded', recorded),
            ('speakers', speakers),
            ('tags', ['django']),
            ('title', title),
            ('videos', [
                collections.OrderedDict([
                    ('type', 'wistia'),
                    ('url', embed_url),
                ]),
            ]),
        ]), out, indent=2)
	import html5lib
	import shutil
	import pathlib
	import json
	import collections
	import requests
	import re
	import sys
	from icalendar.cal import Calendar

	from slugify import slugify

	# Finds the Wistia JSONP media reference inside a video page; group 1 is the
	# media id. The host dots are escaped so that only the literal host matches.
	WISTIA_RE = re.compile(r'//fast\.wistia\.com/embed/medias/(.+)\.jsonp')


	# Output tree written by this script.
	root_folder = pathlib.Path('data') / 'djangocon-eu-2016'
	videos_folder = root_folder / 'videos'

	# Local copies of the three remote documents the script reads.
	talks_file = pathlib.Path('djangocon-europe-2016.html')
	ical_file = pathlib.Path('djangocon-europe-2016.ics')
	schedule_file = pathlib.Path('djangocon-europe-2016-schedule.html')


	def _download_if_missing(target, url):
	    """Fetch *url* into *target* unless a local copy already exists."""
	    if target.exists():
	        return
	    response = requests.get(url)
	    # Fail loudly rather than storing an HTTP error page as if it were data.
	    response.raise_for_status()
	    target.write_text(response.text)


	_download_if_missing(talks_file, 'http://opbeat.com/community/events/djangocon-europe-2016/')
	_download_if_missing(ical_file, 'https://2016.djangocon.eu/schedule/ical/')
	_download_if_missing(schedule_file, 'https://2016.djangocon.eu/schedule/')

	# Every run regenerates the output tree from scratch.
	shutil.rmtree(str(root_folder), ignore_errors=True)

	# parents=True: the enclosing 'data' directory may not exist yet.
	root_folder.mkdir(parents=True)
	videos_folder.mkdir()


	def get_embed_url(absolute_url):
	    """Return the Wistia iframe embed URL for the page at *absolute_url*.

	    The page body is cached in ``/tmp`` under the slugified URL, so later
	    calls for the same page read the cached copy instead of the network.

	    Returns ``None`` when no line of the page matches ``WISTIA_RE``.
	    """
	    tmp_file = pathlib.Path('/tmp') / slugify(absolute_url)
	    if not tmp_file.exists():
	        # Fetch before creating the cache file: opening it first would
	        # leave an empty file behind if the request failed, and every
	        # later run would then treat it as a valid cache entry.
	        page = requests.get(absolute_url).text
	        with open(str(tmp_file), 'w') as fp:
	            fp.write(page)
	    with open(str(tmp_file)) as fp:
	        for line in fp:
	            mo = WISTIA_RE.search(line)
	            if mo is not None:
	                return 'https://fast.wistia.com/embed/iframe/{}'.format(mo.group(1))
	    return None

	# talk_info: key -> {'title': ..., 'start': ...}, filled from the calendar.
	# talk_urls: key -> speaker page URL, filled from the schedule page.
	talk_info, talk_urls = {}, {}

	def mkkey(s):
	    """Return *s* with every space, double quote and single quote removed."""
	    return s.replace(' ', '').replace('"', '').replace("'", '')


	# Fill talk_urls: every schedule link pointing at a '/speaker...' path is
	# recorded under the normalized link text.
	with open(str(schedule_file)) as fp:
	    doc = html5lib.parse(fp.read())

	for link in doc.findall('.//{http://www.w3.org/1999/xhtml}a'):
	    target = link.attrib.get('href', '')
	    title = link.text
	    # Element.text is None when the anchor has no text before its first
	    # child (e.g. a link wrapping an image). Such a link cannot be keyed,
	    # so skip it instead of crashing inside mkkey().
	    if title is None or not target.startswith('/speaker'):
	        continue
	    talk_urls[mkkey(title)] = 'https://2016.djangocon.eu{}'.format(target)

	# Fill talk_info with the title and start time of every calendar entry.
	with open(str(ical_file)) as fp:
	    cal = Calendar.from_ical(fp.read())
	    # Day-of-month of an event's start -> day number used in the
	    # lightning-talk key below.
	    lt_days = {
	        30: 1,
	        31: 2,
	        1: 3,
	    }
	    for evt in cal.subcomponents:
	        summary = evt['SUMMARY']
	        # When the summary contains ' - ', everything before the first one
	        # is dropped (presumably a speaker prefix -- TODO confirm).
	        summary_comps = summary.split(' - ')
	        if len(summary_comps) > 1:
	            title = ' - '.join(summary_comps[1:]).strip()
	        else:
	            title = summary_comps[0]
	        if not title:
	            continue
	        start = evt['DTSTART'].dt
	        key = mkkey(title)
	        if 'Lightning talks' in title:
	            # These sessions get a per-day key instead of a title-derived one.
	            key = 'Lightningtalks,day{}'.format(lt_days[start.date().day])

	        talk_info[key] = {
	            'title': title,
	            'start': start,
	        }


	# Conference-level metadata, written as category.json in the output root.
	with open(str(root_folder / 'category.json'), 'w+') as fp:
	    json.dump(collections.OrderedDict([
	        ('description', ''),
	        ('start_date', '2016-03-30'),
	        ('title', 'DjangoCon Europe 2016'),
	        ('url', 'https://2016.djangocon.eu/'),
	    ]), fp, indent=2)


	def _split_title_and_speakers(heading):
	    """Split a ``'<title> by <speakers>'`` heading into ``(title, speakers)``.

	    The text after the last ``' by '`` is the speaker list, split on
	    ``' & '``; everything before it is the title. ``speakers`` is always a
	    list and is empty when the heading has no ``' by '`` or nothing
	    follows it.
	    """
	    parts = heading.split(' by ')
	    if len(parts) == 1:
	        return heading, []
	    names = parts[-1]
	    speakers = names.split(' & ') if names else []
	    return ' by '.join(parts[:-1]), speakers


	# Write one JSON file per video listed on the talks page, combining the page
	# data with the calendar (talk_info) and schedule (talk_urls) lookups.
	with open(str(talks_file)) as fp:
	    doc = html5lib.parse(fp)

	video_list = doc.find('.//*[@class="video-list"]')
	for video in video_list.findall('.//*[@class="post video"]'):
	    heading = video.find('{http://www.w3.org/1999/xhtml}h3').text
	    link = video.find('{http://www.w3.org/1999/xhtml}a').attrib['href']
	    absolute_url = 'http://opbeat.com{}'.format(link)
	    title, speakers = _split_title_and_speakers(heading)

	    key = mkkey(title)
	    # A KeyError here means the video title has no matching calendar entry.
	    info = talk_info[key]
	    recorded = info['start'].date().isoformat()
	    talk_url = talk_urls.get(key)

	    slug = slugify(title)

	    embed_url = get_embed_url(absolute_url)
	    if embed_url is None:
	        raise Exception("No embed_url found for {}".format(slug))

	    description = "This video is hosted by `opbeat.com <{}>`_.".format(absolute_url)

	    if talk_url:
	        description = "You can find more about this talk on `djangocon.eu <{}>`_. {}".format(talk_url, description)

	    video_file = videos_folder / '{}.json'.format(slug)
	    # Own handle name so the talks-page handle `fp` is never rebound.
	    with open(str(video_file), 'w+') as out:
	        json.dump(collections.OrderedDict([
	            ('description', description),
	            ('language', 'eng'),
	            ('recorded', recorded),
	            ('speakers', speakers),
	            ('tags', ['django']),
	            ('title', title),
	            ('videos', [
	                collections.OrderedDict([
	                    ('type', 'wistia'),
	                    ('url', embed_url),
	                ]),
	            ]),
	        ]), out, indent=2)