Skip to content

Instantly share code, notes, and snippets.

@nvbn
Last active June 20, 2018 07:14
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save nvbn/72949069d0e3bb01bf5549c4f2dc9cb3 to your computer and use it in GitHub Desktop.
Save nvbn/72949069d0e3bb01bf5549c4f2dc9cb3 to your computer and use it in GitHub Desktop.
filmstrip.py
from collections import Counter
import requests
from pycaption.srt import SRTReader
import lxml.html
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Language code used both when parsing the SRT and when looking up captions.
lang = 'en-US'
# Path to the input .srt subtitles file (fill in before running).
path = ''
# Path where the generated HTML page is written.
output_path = ''
# Shutterstock API credentials (used as HTTP basic auth below).
stock_key = ''
stock_secret = ''
# Slice of captions to turn into slides: captions[start_slide:end_slide].
start_slide = 0
end_slide = 0
def read_subtitles(path, lang):
    """Read an .srt file and parse it with pycaption.

    Args:
        path: filesystem path to the subtitles file.
        lang: language code the captions are registered under (e.g. 'en-US').

    Returns:
        The CaptionSet produced by pycaption's SRTReader.
    """
    # SRT files are conventionally UTF-8; the original relied on the
    # platform default encoding, which breaks on non-UTF-8 locales.
    with open(path, encoding='utf-8') as f:
        data = f.read()
    return SRTReader().read(data, lang=lang)
def to_text(raw_text):
    """Strip any HTML markup from a caption and return its plain text."""
    document = lxml.html.document_fromstring(raw_text)
    return document.text_content()
def tokenize_lemmatize(text):
    """Tokenize *text*, lowercase and lemmatize the alphabetic tokens,
    and drop English stopwords; returns the surviving lemmas in order."""
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas
def get_most_popular(captions):
    """Count how often each lemma occurs across all captions.

    Returns a Counter mapping lemma -> frequency over the whole film.
    """
    plain_texts = (to_text(caption.get_text()) for caption in captions)
    return Counter(tokenize_lemmatize('\n'.join(plain_texts)))
def get_keywords(most_popular, text, n=2):
    """Pick the *n* lemmas of *text* that are most frequent film-wide.

    Args:
        most_popular: Counter of film-wide lemma frequencies.
        text: plain caption text.
        n: number of keywords to return (default 2).
    """
    lemmas = tokenize_lemmatize(text)
    # Stable descending sort keeps the original token order among ties,
    # exactly like sorting on the negated count.
    lemmas.sort(key=most_popular.__getitem__, reverse=True)
    return lemmas[:n]
def get_stock_image_url(query):
    """Search Shutterstock for *query* and return the top preview image URL.

    Args:
        query: search phrase (keywords joined with spaces).

    Returns:
        URL string of the most popular matching photo's preview, or None
        when the search returns nothing usable.
    """
    response = requests.get(
        "https://api.shutterstock.com/v2/images/search",
        params={
            'query': query,
            'sort': 'popular',
            'view': 'minimal',
            'safe': 'false',
            'per_page': '1',
            'image_type': 'photo',
        },
        auth=(stock_key, stock_secret),
        # Fix: without a timeout a stalled connection hangs the script forever.
        timeout=10,
    )
    data = response.json()
    try:
        return data['data'][0]['assets']['preview']['url']
    except (IndexError, KeyError):
        # Empty result set, or an API error payload without 'data': skip.
        return None
def make_slide(most_popular, caption):
    """Build one (text, image_url) slide for a caption.

    Returns None when the caption has no text, no keywords, or no
    matching stock image could be found.
    """
    text = to_text(caption.get_text())
    if text:
        query = ' '.join(get_keywords(most_popular, text))
        if query:
            image_url = get_stock_image_url(query)
            if image_url:
                return text, image_url
    return None
def make_html_output(slides):
    """Render slides as a standalone HTML page.

    Args:
        slides: iterable of (text, stock_image_url) tuples.

    Returns:
        A complete HTML document as a string, one .box div per slide.
    """
    from html import escape  # stdlib; local import keeps module imports unchanged

    # Fix: caption text and URLs were interpolated unescaped, so any
    # '<', '&' or quote in a subtitle broke (or injected into) the markup.
    # Also build via join instead of repeated string concatenation.
    parts = ['<html><head><link rel="stylesheet" href="./style.css"></head><body>']
    for text, stock_image in slides:
        parts.append(f'''<div class="box">
    <img src="{escape(stock_image, quote=True)}" />
    <span>{escape(text)}</span>
</div>''')
    parts.append('</body></html>')
    return ''.join(parts)
# --- Script body: read the subtitles, build the slides, write the page. ---
subtitles = read_subtitles(path, lang)
captions = subtitles.get_captions(lang)
most_popular = get_most_popular(captions)
candidate_slides = (make_slide(most_popular, caption)
                    for caption in captions[start_slide:end_slide])
# Drop captions that produced no slide (empty text / no keywords / no image).
interesting_slides = [slide for slide in candidate_slides if slide]
with open(output_path, 'w') as f:
    f.write(make_html_output(interesting_slides))
nltk
requests
pycaption
lxml
/* Dark page background behind the slide boxes. */
html {
background: black;
}
/* One slide: a centered fixed-width column, spaced from its neighbors. */
.box {
margin: auto;
width: 450px;
padding-top: 100px;
padding-bottom: 100px;
}
/* The first slide sits flush against the top of the page. */
.box:first-child {
padding-top: 0;
}
/* Stack the image and the caption vertically. */
.box * {
display: block;
}
/* Caption text: large, centered, white on the dark background. */
.box span {
font-size: 32px;
text-align: center;
color: #fff;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment