Jia Chen (ichenjia)
import string
import spacy
from gensim import corpora

nlp = spacy.load("en_core_web_sm")  # assumption: any English spaCy pipeline works here

def tokenizeDoc(doc):
    # lemmatize with spaCy, dropping stop words and bare punctuation
    tokens = [token.lemma_.strip() for token in nlp(doc.lower()) if not token.is_stop and token.lemma_.strip() not in string.punctuation]
    return tokens

def buildDictAndModelLDA(docs, numberOfTopics=5, numberOfPasses=100):
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(text) for text in docs]
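    # The preview cuts off before the LDA model itself is trained. A plausible
    # tail, assuming gensim's LdaModel (needs `from gensim import models`);
    # the return value is a guess, not from the gist:
    lda = models.LdaModel(corpus, num_topics=numberOfTopics, id2word=dictionary, passes=numberOfPasses)
    return dictionary, lda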
@ichenjia
ichenjia / multiinput_seq2seq_keras_talos.py
Created February 10, 2019 07:28
Multi-input seq2seq with Keras and Talos
from keras.layers import Input, LSTM, Dense, RepeatVector, concatenate
from keras.optimizers import Adam

def produceModel(X_train, Y_train, X_eval, Y_eval, params):
    print("current params:", params)
    optim = Adam(lr=params["lr"])
    # `inputs` and `outputs` are module-level training arrays defined elsewhere in the gist
    text_input = Input(shape=(inputs.shape[1], inputs.shape[2]), name='text')
    base_input = Input(shape=(4,), name='bases')
    lstm_input = LSTM(params["layer_1_text_input_neuron"])(text_input)
    dense_input = Dense(params["layer_1_base_input_neuron"], activation='relu')(base_input)
    # merge the two branches, then repeat the merged vector to seed the decoder sequence
    concatenated = concatenate([lstm_input, dense_input], axis=-1)
    concatenated = RepeatVector(outputs.shape[1])(concatenated)
    concatenated_lstm = LSTM(params["concatenated_layer_neuron"], return_sequences=True)(concatenated)
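    # The preview ends mid-function. Talos expects the model function to return
    # (history, model), so a plausible completion looks like this; the
    # TimeDistributed softmax head, the loss, and the "epochs"/"batch_size"
    # params are assumptions (add Model and TimeDistributed to the keras imports):
    decoded = TimeDistributed(Dense(outputs.shape[2], activation='softmax'))(concatenated_lstm)
    model = Model([text_input, base_input], decoded)
    model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['acc'])
    out = model.fit(X_train, Y_train, validation_data=(X_eval, Y_eval), epochs=params["epochs"], batch_size=params["batch_size"], verbose=0)
    return out, model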
@ichenjia
ichenjia / reddit_comment_word2vec_query_model.py
Created January 3, 2019 01:19
Query Reddit Comment Word2Vec model
def queryWords(positive=None, negative=None, topn=5):
    try:
        # 3CosMul similarity (Levy & Goldberg, 2014) over the trained vectors
        print(model.wv.most_similar_cosmul(positive, negative, topn))
    except KeyError:
        print("At least one of the words you entered is not in the model's vocabulary")

queryWords(positive=["drug"], negative=None, topn=40)
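The snippet assumes model is already in memory from the companion gist below. To query a previously saved model instead, a minimal sketch (the filename is hypothetical):

from gensim.models import Word2Vec

model = Word2Vec.load("vancouver.w2v")  # hypothetical path; use wherever the model was saved
queryWords(positive=["drug"], negative=None, topn=40)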
@ichenjia
ichenjia / reddit_comment_word2vec_create_model.py
Last active January 6, 2019 02:19
Creating Word2Vec Model with Gensim for Reddit Comments
from gensim.models import Word2Vec

model = None
def createAndSaveModel(data, subreddit):
    global model
    model = Word2Vec(
        data,            # list of token lists, one per comment
        workers=4,       # parallel training threads
        size=500,        # embedding dimensionality (gensim 3.x; `vector_size` in 4.x)
        min_count=5,     # ignore words with fewer than 5 occurrences
        window=10        # context window
    )
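    # The preview stops before the "save" half of the function. A plausible
    # tail; the filename scheme is a guess, not from the gist:
    model.save(subreddit + ".w2v")

createAndSaveModel(tokens, "vancouver")  # hypothetical call; `tokens` from the tokenizing gist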
@ichenjia
ichenjia / tokenize_reddit_comments.py
Last active January 6, 2019 23:35
Tokenize Reddit Comments
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import casual_tokenize

tokens = []

def tokenizeWords(sentence):
    # Twitter-style tokenizer: lowercase, collapse elongated words, drop @handles
    return casual_tokenize(sentence, preserve_case=False, reduce_len=True, strip_handles=True)

def removePunctuation(sent, punctuationTable):
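    # The preview truncates here. A hypothetical body, assuming sent is a token
    # list and punctuationTable is a str.maketrans table mapping punctuation to None:
    return [w.translate(punctuationTable) for w in sent]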
@ichenjia
ichenjia / reddit_comment_countvectorizer.py
Created January 2, 2019 17:49
Reddit Comment Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize

vectorizer_counter = None
vocabulary = None

def prepareCountVectorizer():
    global vocabulary
    global vectorizer_counter
    # unigrams and bigrams over the cached comments, with sklearn's English stop list
    vectorizer_counter = CountVectorizer(lowercase=True, tokenizer=casual_tokenize, stop_words='english', ngram_range=(1, 2))
    vectorizer_counter.fit(raw_documents=cache.text)

prepareCountVectorizer()
word_counts = []
for key in vectorizer_counter.vocabulary_.keys():
    # note: vocabulary_ maps term -> column index, not frequency
    word_counts.append(vectorizer_counter.vocabulary_[key])
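Because vocabulary_ stores column indices rather than counts, real term frequencies need a transform step. A minimal sketch, assuming cache.text is the comment corpus from the acquisition gist:

import numpy as np

X = vectorizer_counter.transform(cache.text)    # sparse document-term matrix
freqs = np.asarray(X.sum(axis=0)).ravel()       # total occurrences per column
terms = vectorizer_counter.get_feature_names()  # get_feature_names_out() on newer sklearn
top_20 = sorted(zip(terms, freqs), key=lambda p: -p[1])[:20]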
@ichenjia
ichenjia / acquire_reddit_comment.py
Last active January 1, 2019 00:29
Acquiring reddit comments
from psaw import PushshiftAPI
import json

subreddit = "vancouver"
cache = None

def getAndSaveCommentData(subreddit="vancouver", comment_limit=100000):
    rapi = PushshiftAPI()
    # lazily pages through Pushshift, yielding one comment object at a time
    gen = rapi.search_comments(subreddit=subreddit)
    global cache
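    # The preview ends here. A hypothetical tail that caps the stream at
    # comment_limit, keeps the raw comment dicts (psaw exposes them as .d_),
    # and writes them to a guessed filename:
    from itertools import islice
    cache = [c.d_ for c in islice(gen, comment_limit)]
    with open(subreddit + "_comments.json", "w") as f:
        json.dump(cache, f)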