Thomas Dehaene (TDehaene)

TDehaene / google_drive_sheets
Created November 12, 2018 13:18
Download a file from Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# authenticate through a local webserver flow and build the Drive client
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

# download the sheet, exported as an .xlsx spreadsheet
file_id = "1QI2vmYXH-yPWGWmXMBC35fzmeOTVdxG62B85BEsUkjc"
file6 = drive.CreateFile({'id': file_id})
file6.GetContentFile('C:/Users/Peter/testie.xlsx',
                     mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
term        count
christmas     136
love           69
countdown      26
wedding        18
heart          18
june           18
mystery        16
summer         13
woman          13
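A term/count table like the one above can be produced with a plain word count; a minimal sketch, assuming a hypothetical `titles` list of lowercased strings (not part of the original gist):

from collections import Counter

# count word occurrences across all titles (hypothetical `titles` list)
term_counts = Counter(word for title in titles for word in title.split())
for term, count in term_counts.most_common(10):
    print(term, count)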
import imdb

# search IMDb for matching titles via IMDbPY
ia = imdb.IMDb()
movies = ia.search_movie("Harry Potter")
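The subtitle lookup below expects a numeric IMDb id; with IMDbPY that id can be read off a search result. A small sketch, assuming the first hit is the wanted movie:

# movieID is the numeric IMDb identifier of the first search result
first_movie = movies[0]
print(first_movie['title'], first_movie.movieID)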
from pythonopensubtitles.opensubtitles import OpenSubtitles

ost = OpenSubtitles()
token = ost.login(YOUR_EMAIL, YOUR_PASSWORD)
assert isinstance(token, str)

def get_download_link(imdb_id):
    # return the download URL of the first English subtitle for this IMDb id, or None
    link = None
    if imdb_id:
        data = ost.search_subtitles([{'sublanguageid': 'en', 'imdbid': str(imdb_id)}])
        if data:
            link = data[0].get('SubDownloadLink')
    return link
import gzip, shutil, time
import pandas as pd
import requests

def download_unzip_subtitle(link, name):
    if not pd.isnull(link):
        try:
            time.sleep(1)  # throttle requests a little
            gz_name = name + '.gz'
            with open(gz_name, "wb") as f:
                r = requests.get(link)
                f.write(r.content)
            # decompress the downloaded .gz into the final subtitle file
            with gzip.open(gz_name, 'rb') as f_in, open(name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except Exception as exc:
            print(exc)
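A possible way to chain the two helpers, reusing the IMDbPY search result from above (the output filename is just an example):

# fetch the English subtitle for the first search hit and unpack it to an .srt file
link = get_download_link(movies[0].movieID)
download_unzip_subtitle(link, 'harry_potter.srt')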
from gensim import corpora
from gensim.models import LdaMulticore

num_topics = 6

# build a gensim dictionary from the tokenized documents
dictionary = corpora.Dictionary(processed_docs)
# filter out words that appear in fewer than 30 documents
# or in more than 50% of documents, then keep the 1500 most frequent ones
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=1500)

# bag-of-words corpus and the LDA model itself
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model = LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary)
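To eyeball the result, gensim's print_topics can list the heaviest words per topic; the number of words shown here is arbitrary:

# print the top words of each of the 6 topics
for topic_id, words in lda_model.print_topics(num_topics=num_topics, num_words=8):
    print(topic_id, words)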
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# custom tokenizer for TF-IDF, since the text is already tokenized
def identity_tokenizer(text):
    return text

# vectorize the documents
tfidf = TfidfVectorizer(max_df=0.5, min_df=30, tokenizer=identity_tokenizer, lowercase=False)
processed_docs_tfidf = tfidf.fit_transform(processed_docs_filtered)

# fit sklearn's LDA on the TF-IDF matrix (6 topics, matching the gensim model above)
lda = LatentDirichletAllocation(n_components=6)
lda.fit(processed_docs_tfidf)
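One way to inspect the fitted topics is to rank the vectorizer's vocabulary by each topic's component weights; a sketch (get_feature_names_out is the newer sklearn name, older versions use get_feature_names):

import numpy as np

# print the 8 highest-weighted terms for every topic
terms = np.array(tfidf.get_feature_names_out())
for topic_id, weights in enumerate(lda.components_):
    top_terms = terms[np.argsort(weights)[::-1][:8]]
    print(topic_id, ', '.join(top_terms))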
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# vectorize the documents (identity tokenizer: the text is already tokenized)
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(max_df=0.5, min_df=30, tokenizer=identity_tokenizer, lowercase=False)
processed_docs_tfidf = tfidf.fit_transform(processed_docs_filtered)

# cluster the TF-IDF vectors (6 clusters, mirroring the 6 topics above)
km = KMeans(n_clusters=6)
cluster_labels = km.fit_predict(processed_docs_tfidf)
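A quick sanity check on the clustering, counting how many documents fall into each cluster (purely illustrative):

from collections import Counter

# distribution of documents over the 6 clusters
print(Counter(cluster_labels))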
# create mapping dicts between characters and integer indices
chars = sorted(list(set(all_text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text into semi-redundant sequences of maxlen characters,
# sliding the window forward `step` characters at a time
maxlen = 40
step = 5
sentences = []
next_chars = []
for i in range(0, len(all_text) - maxlen, step):
    sentences.append(all_text[i: i + maxlen])
    next_chars.append(all_text[i + maxlen])
print('nb sequences:', len(sentences))
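Before training, the sequences presumably get one-hot encoded as in the standard Keras char-RNN recipe; a sketch of that step (the array names x and y are assumptions, not from the original gist):

import numpy as np

# x: (n_sequences, maxlen, n_chars) one-hot inputs, y: (n_sequences, n_chars) one-hot targets
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool_)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool_)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1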
from keras.layers import LSTM, Dense
from keras.models import Sequential

maxlen = 40
layers = 2
dropout = 0.7
n_hidden = 512

model = Sequential()
# stack the LSTM layers; every layer except the last one returns full sequences
for i in range(layers - 1):
    model.add(LSTM(n_hidden, input_shape=(maxlen, len(chars)),
                   return_sequences=True, dropout=dropout))
model.add(LSTM(n_hidden, dropout=dropout))
model.add(Dense(len(chars), activation='softmax'))
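Compiling and training could then look like this, reusing the one-hot x and y from the sketch above; the optimizer, batch size, and epoch count are placeholders, not values from the original gist:

# categorical cross-entropy matches the one-hot targets and softmax output
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(x, y, batch_size=128, epochs=20)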