Create a gist now

Instantly share code, notes, and snippets.

@nvbn /
Last active Jun 20, 2018

What would you like to do?
from collections import Counter
from functools import lru_cache
from html import escape

import lxml.html
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pycaption import SRTReader
# --- Script configuration: fill these in before running ---

# Language code passed to the subtitle reader and used to select captions.
lang = 'en-US'
# Path of the subtitle file that is opened for reading.
path = ''
# Path of the HTML file that the generated slides are written to.
output_path = ''
# Credential pair sent as the `auth` tuple with every stock-image request.
stock_key = ''
stock_secret = ''
# Bounds of the caption slice `captions[start_slide:end_slide]` that is turned
# into slides; while both stay 0 that slice is empty and no slide is produced.
start_slide = 0
end_slide = 0
def read_subtitles(path, lang):
    """Read a subtitle file and parse it with ``SRTReader``.

    Args:
        path: Location of the subtitle file to open.
        lang: Language code forwarded to ``SRTReader.read``.

    Returns:
        Whatever ``SRTReader().read`` returns for the file contents.
    """
    with open(path) as f:
        data = f.read()
    # Parsing happens after the file is closed; only the text is needed.
    return SRTReader().read(data, lang=lang)
def to_text(raw_text):
    """Strip markup from a caption and return its plain text content.

    Args:
        raw_text: Caption text, possibly containing HTML-style markup.

    Returns:
        The text content of the parsed markup, or ``''`` when the input is
        empty, ``None`` or whitespace only.
    """
    if not raw_text or not raw_text.strip():
        # lxml refuses to parse a blank document ("Document is empty"), so
        # short-circuit to the falsy value callers already check for.
        return ''
    return lxml.html.document_fromstring(raw_text).text_content()
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words("english"))


def tokenize_lemmatize(text):
    """Split text into lower-cased lemmas, dropping noise tokens.

    Tokens that are not purely alphabetic are discarded, the rest are
    lower-cased and lemmatized, and lemmas found in the English stop-word
    list are removed.

    Args:
        text: Plain text to process.

    Returns:
        A list of lemmas in their order of appearance (duplicates kept).
    """
    lemmatizer = WordNetLemmatizer()
    # Cached: this function runs once per caption, and rebuilding the set
    # from the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    lemmas = (
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(text)
        if token.isalpha()
    )
    return [lemma for lemma in lemmas if lemma not in stop_words]
def get_most_popular(captions):
    """Count how often each lemma occurs across all captions.

    Args:
        captions: Iterable of caption objects exposing ``get_text()``.

    Returns:
        A ``Counter`` mapping each lemma to its number of occurrences.
    """
    plain_lines = [to_text(caption.get_text()) for caption in captions]
    # Captions are joined with newlines so they are tokenized as one text.
    return Counter(tokenize_lemmatize('\n'.join(plain_lines)))
def get_keywords(most_popular, text, n=2):
    """Pick the ``n`` distinct lemmas of ``text`` that rank highest overall.

    Args:
        most_popular: Mapping from lemma to its frequency; indexed with
            every lemma of ``text``.
        text: Plain text of a single caption.
        n: Maximum number of keywords to return.

    Returns:
        Up to ``n`` unique lemmas, most frequent first. Lemmas with equal
        frequency keep their order of first appearance in ``text``.
    """
    # De-duplicate first (dict keeps insertion order) so that a word repeated
    # within the caption cannot occupy more than one keyword slot.
    unique_tokens = dict.fromkeys(tokenize_lemmatize(text))
    ranked = sorted(
        unique_tokens,
        key=lambda token: most_popular[token],
        reverse=True,
    )
    return ranked[:n]
def get_stock_image_url(query):
    """Search the stock-image service and return the top preview URL.

    Args:
        query: Search phrase sent as the ``query`` parameter.

    Returns:
        The ``assets.preview.url`` of the first result, or ``None`` when the
        response holds no result or lacks the expected keys.
    """
    # NOTE(review): the endpoint URL was missing from the source. The
    # Shutterstock v2 image-search endpoint is assumed from the parameter
    # names and the response shape read below; confirm before relying on it.
    response = requests.get(
        'https://api.shutterstock.com/v2/images/search',
        params={
            'query': query,
            'sort': 'popular',
            'view': 'minimal',
            'safe': 'false',
            'per_page': '1',
            'image_type': 'photo',
        },
        auth=(stock_key, stock_secret),
    )
    data = response.json()
    try:
        return data['data'][0]['assets']['preview']['url']
    except (IndexError, KeyError):
        # No hits, or a payload without the expected structure.
        return None
def make_slide(most_popular, caption):
    """Build a ``(text, image_url)`` slide for one caption.

    Args:
        most_popular: Lemma frequencies used to rank the caption's keywords.
        caption: Caption object exposing ``get_text()``.

    Returns:
        A tuple of the caption's plain text and a stock image URL, or
        ``None`` when the caption has no text, yields no keywords, or no
        image is found for them.
    """
    caption_text = to_text(caption.get_text())
    if not caption_text:
        return None

    search_query = ' '.join(get_keywords(most_popular, caption_text))
    if not search_query:
        return None

    image_url = get_stock_image_url(search_query)
    return (caption_text, image_url) if image_url else None
def make_html_output(slides):
    """Render slides as a standalone HTML page.

    Args:
        slides: Iterable of ``(text, stock_image)`` pairs, where
            ``stock_image`` is the image URL.

    Returns:
        The HTML document as a string. It links ``./style.css`` and contains
        one ``<div class="box">`` per slide, holding the image and the text.
    """
    parts = ['<html><head><link rel="stylesheet" href="./style.css"></head><body>']
    for text, stock_image in slides:
        # NOTE(review): the tail of this template was missing from the
        # source; the <span> is assumed from the `.box span` stylesheet rule.
        # Both values are escaped so that characters such as & < > " in a
        # caption or URL cannot break the markup.
        parts.append(
            '<div class="box">\n'
            f'<img src="{escape(stock_image)}" />\n'
            f'<span>{escape(text)}</span>\n'
            '</div>'
        )
    parts.append('</body></html>')
    return ''.join(parts)
# Script body: parse the subtitles, rank words over the whole file, build one
# slide per caption in the configured range, and write the resulting page.
subtitles = read_subtitles(path, lang)
captions = subtitles.get_captions(lang)
most_popular = get_most_popular(captions)
interesting_slides = [make_slide(most_popular, caption)
                      for caption in captions[start_slide:end_slide]]
# make_slide returns None for captions it cannot illustrate; drop those.
interesting_slides = [slide for slide in interesting_slides if slide]
# Render before opening the file so that a failure while building the page
# does not leave a truncated output file behind.
output = make_html_output(interesting_slides)
with open(output_path, 'w') as f:
    f.write(output)
/* Stylesheet for the generated slides page (linked as ./style.css). */

html {
    background: black;
}

/* One slide: a centered, fixed-width column with vertical breathing room. */
.box {
    margin: auto;
    width: 450px;
    padding-top: 100px;
    padding-bottom: 100px;
}

.box:first-child {
    padding-top: 0;
}

/* Stack the image and the caption vertically. */
.box * {
    display: block;
}

/* Caption text under the image. */
.box span {
    font-size: 32px;
    text-align: center;
    color: #fff;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment