import string
import spacy

# assumes a standard English spaCy pipeline; substitute the model loaded in the original setup
nlp = spacy.load("en_core_web_sm")

def tokenizeDoc(doc):
    # lemmatize the lowercased document, dropping stop words and punctuation
    tokens = [token.lemma_.strip() for token in nlp(doc.lower())
              if not token.is_stop and token.lemma_.strip() not in string.punctuation]
    return tokens
from gensim import corpora, models

def buildDictAndModelLDA(docs, numberOfTopics=5, numberOfPasses=100):
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(text) for text in docs]
    # assumed completion (the snippet is cut off here): train LDA on the bag-of-words corpus
    lda = models.LdaModel(corpus, num_topics=numberOfTopics, id2word=dictionary, passes=numberOfPasses)
    return dictionary, corpus, lda
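A minimal sketch of how the two helpers chain together; the sample documents and topic count below are illustrative, not from the original post:

docs = [tokenizeDoc(d) for d in ["the new drug was approved", "city council met downtown today"]]
dictionary, corpus, lda = buildDictAndModelLDA(docs, numberOfTopics=2, numberOfPasses=10)
for topic_id, words in lda.print_topics():
    print(topic_id, words)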
from keras.models import Model
from keras.layers import Input, Dense, LSTM, RepeatVector, TimeDistributed, concatenate
from keras.optimizers import Adam

def produceModel(X_train, Y_train, X_eval, Y_eval, params):
    print("current params:", params)
    optim = Adam(lr=params["lr"])
    # two branches: an LSTM over the text sequences and a dense layer over the
    # four extra features (inputs/outputs are module-level arrays defined elsewhere in the post)
    text_input = Input(shape=(inputs.shape[1], inputs.shape[2]), name='text')
    base_input = Input(shape=(4,), name='bases')
    lstm_input = LSTM(params["layer_1_text_input_neuron"])(text_input)
    dense_input = Dense(params["layer_1_base_input_neuron"], activation='relu')(base_input)
    concatenated = concatenate([lstm_input, dense_input], axis=-1)
    concatenated = RepeatVector(outputs.shape[1])(concatenated)
    concatenated_lstm = LSTM(params["concatenated_layer_neuron"], return_sequences=True)(concatenated)
    # assumed completion (the snippet is cut off here): project each timestep
    # to the output dimension, compile, and train; X_train and X_eval are
    # assumed to be [text, bases] input lists
    output = TimeDistributed(Dense(outputs.shape[2], activation='softmax'))(concatenated_lstm)
    model = Model(inputs=[text_input, base_input], outputs=output)
    model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, Y_train, validation_data=(X_eval, Y_eval),
                        epochs=params.get("epochs", 10), batch_size=params.get("batch_size", 32))
    return history, model
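The (X_train, Y_train, X_eval, Y_eval, params) to (history, model) shape of this function matches the model-builder signature used by hyperparameter scanners such as Talos; a direct call might look like this, where every name and value is illustrative:

params = {"lr": 1e-3, "layer_1_text_input_neuron": 64,
          "layer_1_base_input_neuron": 16, "concatenated_layer_neuron": 64}
history, model = produceModel([text_train, bases_train], Y_train,
                              [text_eval, bases_eval], Y_eval, params)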
def queryWords(positive=None, negative=None, topn=5):
    try:
        print(model.wv.most_similar_cosmul(positive=positive, negative=negative, topn=topn))
    except KeyError:
        print("It looks like one of the words you entered is not in the model's vocabulary")

queryWords(positive=["drug"], negative=None, topn=40)
from gensim.models import Word2Vec

model = None

def createAndSaveModel(data, subreddit):
    global model
    # size= is the gensim 3.x parameter name (renamed vector_size in gensim 4+)
    model = Word2Vec(
        data,
        workers=4,
        size=500,
        min_count=5,
        window=10
    )
    # assumed completion (the snippet is cut off here): persist the model under the subreddit's name
    model.save(subreddit + ".model")
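Assuming data is a list of token lists (for example, the output of the tokenizer further down), building and then probing the embeddings might look like this; tokenized_comments is a hypothetical name:

createAndSaveModel(tokenized_comments, "vancouver")
queryWords(positive=["drug"], topn=40)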
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import casual_tokenize

tokens = []

def tokenizeWords(sentence):
    # lowercase, collapse elongated words (e.g. "sooo"), and strip @-handles
    return casual_tokenize(sentence, preserve_case=False, reduce_len=True, strip_handles=True)

def removePunctuation(sent, punctuationTable):
    # assumed body (the original is cut off): strip punctuation from each token
    # via the translation table and drop tokens that become empty
    return [w.translate(punctuationTable) for w in sent if w.translate(punctuationTable)]
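A quick check of the two helpers, assuming punctuationTable is built with str.maketrans (the original post does not show its construction):

table = str.maketrans('', '', string.punctuation)
words = tokenizeWords("Sooo happy!!! @someone check this out...")
print(removePunctuation(words, table))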
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_counter = None
vocabulary = None

def prepareCountVectorizer():
    global vocabulary
    global vectorizer_counter
    # unigrams and bigrams over the cached comments, tokenized with casual_tokenize
    vectorizer_counter = CountVectorizer(lowercase=True, tokenizer=casual_tokenize,
                                         stop_words='english', ngram_range=(1, 2))
    vectorizer_counter.fit(raw_documents=cache.text)
    # note: vocabulary_ maps each term to its column index, not its frequency
    word_counts = []
    for key in vectorizer_counter.vocabulary_.keys():
        word_counts.append(vectorizer_counter.vocabulary_[key])
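Because vocabulary_ stores column indices rather than frequencies, true term counts would come from transforming the corpus and summing each column; a sketch, assuming cache.text is the comment corpus loaded below:

matrix = vectorizer_counter.transform(cache.text)
totals = matrix.sum(axis=0).A1  # total occurrences of each vocabulary term
for term, idx in list(vectorizer_counter.vocabulary_.items())[:10]:
    print(term, totals[idx])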
from psaw import PushshiftAPI
import json
import pandas as pd

subreddit = "vancouver"
cache = None

def getAndSaveCommentData(subreddit="vancouver", comment_limit=100000):
    global cache
    rapi = PushshiftAPI()
    # pass the limit through to the query (the original self-assignment did nothing)
    gen = rapi.search_comments(subreddit=subreddit, limit=comment_limit)
    # assumed completion (the snippet is cut off here): collect the results into
    # a DataFrame (psaw exposes each result's raw fields via d_) and save it
    cache = pd.DataFrame([comment.d_ for comment in gen])
    cache.to_csv(subreddit + "_comments.csv", index=False)
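One plausible end-to-end ordering of the pieces above; the glue here is illustrative, and the original post may wire these steps differently:

getAndSaveCommentData("vancouver", comment_limit=100000)
tokenized = [tokenizeDoc(text) for text in cache.text]
createAndSaveModel(tokenized, "vancouver")
queryWords(positive=["drug"], topn=40)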