Skip to content

Instantly share code, notes, and snippets.

@debiatan
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save debiatan/4a49d4b852e200f76783 to your computer and use it in GitHub Desktop.
Save debiatan/4a49d4b852e200f76783 to your computer and use it in GitHub Desktop.
Create annotation templates for Handmade Hero episodes
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import codecs
import sys
import requests
from HTMLParser import HTMLParser
import time
import datetime
import StringIO
class MyHTMLParser(HTMLParser):
def __init__(self):
self.inside_entry_tag = False
self.entry = []
self.entries = []
HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
if tag == 'entry':
self.inside_entry_tag = True
elif tag in ('title', 'id', 'published'):
self.record_data = True
else:
self.record_data = False
def handle_endtag(self, tag):
if tag == 'entry':
self.inside_entry_tag = False
self.entries.append(self.entry)
self.entry = []
def handle_data(self, data):
if self.inside_entry_tag and self.record_data:
text = data.strip()
if text:
self.entry.append(text)
def canonical(s):
words = []
word = []
for c in s.lower():
if c.isalpha():
word.append(c)
elif word:
words.append(''.join(word))
word = []
# last word
if word:
words.append(''.join(word))
return words
def best_match(text, sentence):
text_len = len(text)
sentence_len = len(sentence)
best_match_score = -1.
best_match_index = None
for start_word_index in range(0, text_len-sentence_len):
subtext = text[start_word_index:start_word_index+sentence_len]
score = 0.
for a, b in zip(subtext, sentence):
if a == b:
score += 1
score /= sentence_len
if score > best_match_score:
best_match_score = score
best_match_index = start_word_index
return best_match_score, best_match_index
# Choose episode
if len(sys.argv) != 2:
print('Usage:', sys.argv[0], 'episode_number')
sys.exit(1) ######################################## EXIT
episode_number = sys.argv[1]
print('Looking for episode {}'.format(episode_number))
# Get list of last videos
r = requests.get('https://www.youtube.com/feeds/videos.xml?user=handmadeheroarchive')
# Extract title, URL and date of videos
# Look for target episode
parser = MyHTMLParser()
parser.feed(r.text)
youtube_video_id = title = data = None
found = False
for youtube_video_id, title, date in parser.entries:
if episode_number in title.split():
found = True
break
if not found:
print('Could not find target episode')
sys.exit(1) ######################################## EXIT
title = title[title.find('-')+1 :].strip()
youtube_video_id = youtube_video_id[youtube_video_id.rfind(':')+1:]
print('Title: {}'.format(title))
print('Youtube video id: {}'.format(youtube_video_id))
# Write basic template for video
with open('day%3d.html.md'%(int(episode_number)), 'w') as f:
f.write('---\n')
f.write('title: "{}"\n'.format(title))
f.write('videoId: "{}"\n'.format(youtube_video_id))
f.write('---\n')
# See if there are automatic captions for this episode
keepsubs_request = 'http://keepsubs.com/?url=https://www.youtube.com/watch?v={}'
r = requests.get(keepsubs_request.format(youtube_video_id))
youtube_captions = None
for line in r.text.split('\n'):
if 'automatic captions' in line.lower():
has_captions = True
start_index = line.find('a href="http://keepsubs.com/subs/youtube.com.php') + 8
length = line[start_index:].find('"')
url = line[start_index:start_index+length]
r = requests.get(url)
youtube_captions = r.text
break
if not youtube_captions:
print('No automatic youtube captions found')
else:
print('Found automatic youtube captions')
# Get twitch chat log
headers = {'Accept': 'application/vnd.twitchtv.v3+json'}
url = 'https://api.twitch.tv/kraken/channels/handmade_hero/videos?limit=100&broadcasts=true'
r = requests.get(url, headers=headers)
r_json = r.json()
n_videos = len(r_json['videos'])
youtube_timestamp = time.mktime(datetime.datetime.strptime(date[:date.find('+')], '%Y-%m-%dT%H:%M:%S').timetuple())
target_twitch_video_entries = []
for video_desc in r_json['videos']:
twitch_time = video_desc['recorded_at']
twitch_timestamp = time.mktime(datetime.datetime.strptime(twitch_time, '%Y-%m-%dT%H:%M:%SZ').timetuple())
# twitch video time prior to youtube post, but not more than 24h
if (youtube_timestamp > twitch_timestamp and
youtube_timestamp-twitch_timestamp < 24*60*60):
target_twitch_video_entries.append((video_desc['_id'], twitch_time))
target_twitch_video_entries = sorted(target_twitch_video_entries)
if not target_twitch_video_entries:
print('Could not locate matching twitch chat logs')
sys.exit(1) ######################################## EXIT
print('Found matching twitch chat logs:')
for twitch_video_id, twitch_time in target_twitch_video_entries:
print('\t{} - {}'.format(twitch_video_id, twitch_time))
# Parse twitch questions
process = False
twitch_qs = []
for twitch_video_id, twitch_video_time in target_twitch_video_entries:
url = 'http://search.rechat.org/videos/{}?include_jtv=true&after={}'
r = requests.get(url.format(twitch_video_id, twitch_video_time))
while len(r.json()['hits']['hits']):
for hit in r.json()['hits']['hits']:
if hit['_type'] == 'message':
user = hit['_source']['from']
message = hit['_source']['message']
if user == 'Miblo' and 'NOTE(annotator)' in message:
print(hit['_source']['recieved_at'], message)
# Only parse questions after Casey triggers the Q&A
if not process and user == 'cmuratori' and '!qa' in message:
process = True
if process and user != 'hmh_bot':
m = message.lower().strip()
words = m.split()
if (('q:' in m or '@cmuratori' in m or '@handmade_hero' in m) and
len(words) >= 4 and ('thanks' not in words or len >= 6)):
words_to_filter = ['q:', '@cmuratori', '@handmade_hero']
q = ' '.join([w for w in message.replace('"', '\\"').split() if w.lower() not in words_to_filter])
twitch_qs.append((user, q))
twitch_video_time = r.json()['hits']['hits'][-1]['_source']['recieved_at'] # sic
r = requests.get(url.format(twitch_video_id, twitch_video_time))
if not len(twitch_qs):
print('No questions found on chat log')
sys.exit(1) ######################################## EXIT
print('Number of questions: {}'.format(len(twitch_qs)))
questions = []
qa_hour, qa_minute, qa_second = 1, 0, 0
if youtube_captions:
# Process youtube captions
f = StringIO.StringIO(youtube_captions)
youtube_lines = []
youtube_times = []
while True:
number_s = f.readline().strip()
if not number_s:
break
number = int(number_s.strip())
time = f.readline().strip()
message = f.readline().strip()
string = f.readline().strip()
while string:
message = ' '.join((message, string))
string = f.readline().strip()
youtube_lines.append(message)
youtube_times.append(time)
caption_words = []
caption_times = []
for sentence, time in zip(youtube_lines, youtube_times):
word = []
for c in sentence.lower():
if c.isalpha():
word.append(c)
elif word:
caption_words.append(''.join(word))
caption_times.append(time)
word = []
# last word
if word:
caption_words.append(''.join(word))
caption_times.append(time)
# Look for time of Q&A
qa_index = caption_words.index('q')
if qa_index != -1:
time = caption_times[qa_index]
qa_hour, qa_minute, qa_second = [int(e.split(',')[0]) for e in time.split(':')[:3]]
qa_time = qa_hour*3600 + qa_minute*60 + qa_second
for user, q in twitch_qs:
words = canonical(q)
score, word_index = best_match(caption_words, words)
if score > 0.3:
#print 'Think I found one!'
#print 'It goes like this:', caption_words[word_index:word_index+len(words)]
time = caption_times[word_index]
hour, minute, second = [int(e.split(',')[0]) for e in time.split(':')[:3]]
q_time = hour*3600 + minute*60 + second
if q_time > qa_time:
questions.append((hour, minute, second, q))
#print 'And it happens at: %02d:%02d:%02d'%(hour, minute, second)
questions = sorted(questions)
# Write basic template for video
with codecs.open('day%3d.html.md'%(int(episode_number)), 'w', encoding='utf-8') as f:
f.write('---\n')
f.write('title: "{}"\n'.format(title))
f.write('videoId: "{}"\n'.format(youtube_video_id))
f.write('markers:\n')
f.write(' "%02d:%02d:%02d": "Q&A"\n'%(qa_hour, qa_minute, qa_second))
if youtube_captions:
for hour, minute, second, q in questions:
f.write(' "%02d:%02d:%02d": "%s"\n'%(hour, minute, second, q))
f.write('---\n')
f.write(' <!---\n')
f.write(' HERE ARE ALL THE Qs, IN CASE YOU NEED TO COPY ANY OF THEM\n')
for user, q in twitch_qs:
f.write(' '+q+'\n')
f.write(' -->\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment