Skip to content

Instantly share code, notes, and snippets.

View ayushoriginal's full-sized avatar
😀
Looking for Full Time opportunities

Ayush Pareek ayushoriginal

😀
Looking for Full Time opportunities
View GitHub Profile
@ayushoriginal
ayushoriginal / tokenizer.py
Created June 24, 2019 05:46
Tokenize tweets and convert them to lowercase.
def tokenize(self):
    """Lowercase each tweet in ``self.data`` and split it into word tokens.

    Mutates ``self.data`` in place (each string element is replaced by its
    token list) and returns the mutated list for convenience.
    """
    from nltk import word_tokenize

    # total= gives tqdm a known length so it can render an actual
    # percentage bar instead of a bare counter (enumerate has no __len__).
    for i, tweet in tqdm(enumerate(self.data), desc='Tokenization',
                         total=len(self.data)):
        self.data[i] = word_tokenize(tweet.lower())
    return self.data
@ayushoriginal
ayushoriginal / stopword.py
Created June 24, 2019 05:53
Remove noise tokens and English stopwords.
def remove_stopwords(self):
    """Filter stopwords, noise tokens, and punctuation tokens, in place.

    A token survives only if it is not an English stopword, does not start
    with a non-alphanumeric character (punctuation, emoji, ...), and is not
    a known noise token (the anonymised 'user' handle).
    Returns the mutated ``self.data``.
    """
    from nltk.corpus import stopwords
    import re

    stop = set(stopwords.words("english"))
    noise = {'user'}  # set membership is O(1); same contents as before
    # Hoist the compiled pattern out of the loop instead of calling
    # re.match (pattern-cache lookup + match) once per token.
    punct = re.compile(r"[^a-zA-Z\d\s]+")
    # total= lets tqdm show a real progress percentage.
    for i, tweet in tqdm(enumerate(self.data), desc='Stopwords Removal',
                         total=len(self.data)):
        self.data[i] = [w for w in tweet
                        if w not in stop
                        and not punct.match(w)
                        and w not in noise]
    return self.data
def lemmatize(self):
    """Replace every token in ``self.data`` with its WordNet lemma, in place.

    The part-of-speech tag for each word comes from ``self.get_pos`` so the
    lemmatizer picks the right lemma form. Returns the mutated data.
    """
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    for idx, tokens in tqdm(enumerate(self.data), 'Lemmatization'):
        # Overwrite each slot of the existing token list rather than
        # building a new one, so other references to it stay valid.
        for jdx in range(len(tokens)):
            word = tokens[jdx]
            tokens[jdx] = lemmatizer.lemmatize(word, pos=self.get_pos(word))
    return self.data
def one_hot(self, labels):
    """Return a dense one-hot (n_samples, n_classes) float matrix for *labels*.

    Categories are the sorted unique label values — the same ordering
    sklearn's ``OneHotEncoder`` uses by default — so the output is
    identical to the previous encoder.fit_transform(...).toarray() call,
    but without pulling in sklearn for this one simple transform.
    """
    # np.unique returns the sorted classes plus, for each label, the index
    # of its class; selecting rows of the identity matrix by those indices
    # yields the one-hot rows directly.
    classes, inverse = np.unique(np.asarray(labels).reshape(-1), return_inverse=True)
    return np.eye(len(classes))[inverse]
We couldn’t find that file to show.
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
def encode_corpus(self,data):
    # Integer-encode each document with Keras' hashing `one_hot` into a
    # vocabulary of size self.vocab_length, and dump a token -> id mapping
    # to a versioned text file.
    # NOTE(review): `version` is not defined in this snippet — presumably a
    # module-level version string; confirm against the full source.
    encoded_docs = [one_hot(' '.join(d), self.vocab_length) for d in data]
    vocab_file = "vocab_mapping.txt"+version
    with open(vocab_file,'w') as f:
        for i in range(len(data)):
            for j in range(len(data[i])):
                # NOTE(review): `ss` is built but never written — this
                # snippet appears truncated here (an f.write(ss) and a
                # return of encoded_docs likely follow in the full file).
                ss = data[i][j]+" "+str(encoded_docs[i][j])+"\n"
fellow 10739
patriot 7589
yeah 8002
take 2770
gun 7944
control 4475
policy 6949
advice 7074
jean 12534
company 573
def CNN(self):
    # Build a 1-D convolutional text classifier over embedded tweet tokens:
    # Embedding -> (Conv1D + Dropout) x2 -> Conv1D -> MaxPool -> Flatten.
    # NOTE(review): this snippet appears truncated — no Dense/output layer,
    # no compile step, and `model` is never stored on self; confirm against
    # the full source before relying on this method.
    model = Sequential()
    model.add(Embedding(self.vocab_length, 30, input_length=self.max_len)) # Max Length of Tweet
    model.add(Convolution1D(64,5,activation="relu"))
    model.add(Dropout(0.5))
    model.add(Convolution1D(32,3,activation="relu"))
    model.add(Dropout(0.5))
    # NOTE(review): sigmoid on an intermediate conv layer is unusual (relu
    # elsewhere) — possibly intentional, worth confirming.
    model.add(Convolution1D(16,3,activation="sigmoid"))
    model.add(MaxPooling1D(5))
    model.add(Flatten())
def LSTM(self):
    """Build and compile the LSTM classifier, storing it on ``self.model``.

    Architecture: Embedding -> LSTM(200) -> Dense(relu) -> Dense(softmax),
    with L2 regularisation on both dense layers and categorical
    cross-entropy loss.
    """
    model = Sequential()
    model.add(Embedding(self.vocab_length, 30, input_length=self.max_len))
    # The layer class LSTM below resolves from module scope, not from this
    # method's name, so the name clash is harmless.
    model.add(LSTM(200))
    # FIX: `W_regularizer=` is the Keras 1 keyword and raises TypeError on
    # any Keras >= 2.0 (which this file's imports target); the Keras 2 name
    # is `kernel_regularizer=`.
    # NOTE(review): l2(0.90) is a very strong penalty — confirm intended.
    model.add(Dense(self.max_len, activation='relu',
                    kernel_regularizer=l2(0.90)))
    model.add(Dense(self.tr_labels.shape[1], activation='softmax',
                    kernel_regularizer=l2(0.1)))
    adam_1 = Adam(lr=0.008)  # newer TF2/Keras spells this `learning_rate=`
    model.compile(loss='categorical_crossentropy', optimizer=adam_1,
                  metrics=['accuracy'])
    model.summary()
    self.model = model
# Export path for the trained Keras model (bound to `model`):
# OPTION 1: Keras -> ONNX -> CoreML, saved as a .mlmodel file.
import onnxmltools
from onnx_coreml import convert

# Step 1: Keras model to the ONNX interchange format.
onnx_model = onnxmltools.convert_keras(model)
# Step 2: ONNX model to Apple's CoreML format, then persist to disk.
mlmodel = convert(onnx_model)
mlmodel.save('hate_coreml_model.mlmodel')