filmstrip.py
from collections import Counter | |
import requests | |
from pycaption.srt import SRTReader | |
import lxml.html | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
lang = 'en-US' | |
path = '' | |
output_path = '' | |
stock_key = '' | |
stock_secret = '' | |
start_slide = 0 | |
end_slide = 0 | |
def read_subtitles(path, lang, encoding='utf-8'):
    """Read and parse an SRT subtitle file.

    Args:
        path: filesystem path to the .srt file.
        lang: language code to tag the captions with (e.g. 'en-US').
        encoding: text encoding of the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        The pycaption caption set produced by SRTReader.
    """
    with open(path, encoding=encoding) as f:
        data = f.read()
    return SRTReader().read(data, lang=lang)
def to_text(raw_text):
    """Strip any HTML markup from caption text, returning plain text."""
    document = lxml.html.document_fromstring(raw_text)
    return document.text_content()
def tokenize_lemmatize(text):
    """Break *text* into lowercase lemmas.

    Non-alphabetic tokens are discarded, and English stopwords are
    filtered out after lemmatization.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    lemmas = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas
def get_most_popular(captions):
    """Count lemma frequencies across the full text of all captions.

    Returns a Counter mapping each lemma to its number of occurrences.
    """
    texts = [to_text(caption.get_text()) for caption in captions]
    return Counter(tokenize_lemmatize('\n'.join(texts)))
def get_keywords(most_popular, text, n=2):
    """Pick the *n* lemmas from *text* that are most frequent overall.

    Args:
        most_popular: Counter of lemma frequencies for the whole corpus.
        text: caption text to extract keywords from.
        n: number of keywords to return.
    """
    lemmas = tokenize_lemmatize(text)
    # Stable sort: ties keep their original token order, as before.
    lemmas.sort(key=lambda lemma: most_popular[lemma], reverse=True)
    return lemmas[:n]
def get_stock_image_url(query, timeout=10):
    """Search Shutterstock for *query* and return the top result's preview URL.

    Args:
        query: free-text search query.
        timeout: seconds to wait for the HTTP response; without one,
            requests can block forever on a stalled connection.

    Returns:
        The preview image URL of the most popular matching photo, or
        None when there are no results, the response schema is
        unexpected, or the body is not valid JSON (e.g. an error page).
    """
    response = requests.get(
        "https://api.shutterstock.com/v2/images/search",
        params={
            'query': query,
            'sort': 'popular',
            'view': 'minimal',
            'safe': 'false',
            'per_page': '1',
            'image_type': 'photo',
        },
        auth=(stock_key, stock_secret),
        timeout=timeout,
    )
    try:
        # json.JSONDecodeError is a ValueError subclass, so a non-JSON
        # body is handled here rather than escaping as an uncaught error.
        return response.json()['data'][0]['assets']['preview']['url']
    except (IndexError, KeyError, ValueError):
        return None
def make_slide(most_popular, caption):
    """Build one slide from a caption.

    Returns a (text, image_url) pair, or None when the caption has no
    text, yields no keywords, or no stock image can be found.
    """
    text = to_text(caption.get_text())
    if not text:
        return None
    query = ' '.join(get_keywords(most_popular, text))
    if not query:
        return None
    image_url = get_stock_image_url(query)
    if not image_url:
        return None
    return text, image_url
def make_html_output(slides):
    """Render slides as a standalone HTML document.

    Args:
        slides: iterable of (text, image_url) pairs.

    Returns:
        A complete HTML page as a string. Both the caption text and the
        image URL are HTML-escaped — subtitle text is untrusted input
        and could otherwise inject markup into the page.
    """
    from html import escape  # stdlib; local import keeps the block self-contained
    parts = ['<html><head><link rel="stylesheet" href="./style.css"></head><body>']
    for text, image_url in slides:
        parts.append(
            f'<div class="box">'
            f'<img src="{escape(image_url, quote=True)}" />'
            f'<span>{escape(text)}</span>'
            f'</div>'
        )
    parts.append('</body></html>')
    # str.join avoids quadratic += string concatenation.
    return ''.join(parts)
def main():
    """Build the filmstrip: read subtitles, pick slides, write the HTML page."""
    subtitles = read_subtitles(path, lang)
    captions = subtitles.get_captions(lang)
    most_popular = get_most_popular(captions)
    # Generate candidate slides for the configured caption range and
    # drop the ones make_slide rejected (returned None).
    candidates = (make_slide(most_popular, caption)
                  for caption in captions[start_slide:end_slide])
    interesting_slides = [slide for slide in candidates if slide]
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(make_html_output(interesting_slides))


# Guarded so importing this module no longer triggers file/network I/O.
if __name__ == '__main__':
    main()
nltk | |
requests | |
pycaption | |
lxml |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment