This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize(self):
    """Lower-case every tweet in ``self.data`` and split it into word tokens.

    Each entry of ``self.data`` (a raw tweet string) is replaced in place
    by its NLTK token list. Returns the mutated ``self.data``.
    """
    from nltk import word_tokenize

    # tqdm renders a 'Tokenization' progress bar while we walk the corpus.
    progress = tqdm(enumerate(self.data), 'Tokenization')
    for position, raw_tweet in progress:
        self.data[position] = word_tokenize(raw_tweet.lower())
    return self.data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_stopwords(self):
    """Filter each tokenized tweet in ``self.data`` in place.

    Removes three kinds of tokens:
      * NLTK English stopwords,
      * tokens that *start* with a non-alphanumeric, non-whitespace
        character (punctuation, emoji, ...) — ``re.match`` anchors at the
        beginning only, so words merely containing punctuation later on
        are kept,
      * domain noise words (currently just 'user', presumably the
        anonymised @-mention placeholder — confirm against the dataset).

    Returns the mutated ``self.data``.
    """
    from nltk.corpus import stopwords
    import re

    stop = set(stopwords.words("english"))
    noise = ['user']
    # Fix: compile the pattern once instead of re-parsing it for every
    # token of every tweet inside the nested comprehension.
    punct_start = re.compile(r"[^a-zA-Z\d\s]+")
    for i, tweet in tqdm(enumerate(self.data), 'Stopwords Removal'):
        self.data[i] = [
            w for w in tweet
            if w not in stop and not punct_start.match(w) and w not in noise
        ]
    return self.data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def lemmatize(self):
    """Reduce every token in the corpus to its WordNet lemma, in place.

    The part-of-speech tag passed to the lemmatizer comes from the
    project helper ``self.get_pos``. Returns the mutated ``self.data``.
    """
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    progress = tqdm(enumerate(self.data), 'Lemmatization')
    for row, tokens in progress:
        # 'tokens' aliases self.data[row]; assigning by index mutates
        # the same inner list the original code did.
        for col, word in enumerate(tokens):
            tokens[col] = lemmatizer.lemmatize(word, pos=self.get_pos(word))
    return self.data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def one_hot(self, labels):
    """One-hot encode *labels* into a dense float matrix.

    Equivalent to sklearn's ``OneHotEncoder().fit_transform(col).toarray()``
    but done directly with numpy, avoiding the sklearn dependency and the
    sparse-matrix build followed by an immediate densify. Column order is
    the sorted unique label values — the same order sklearn produces.

    Parameters
    ----------
    labels : sequence
        One class label per sample (ints or strings).

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_classes), dtype float64.
    """
    column = np.asarray(labels).reshape(-1, 1)
    categories = np.unique(column)
    # Broadcasting (n, 1) == (k,) yields the (n, k) indicator matrix.
    return (column == categories).astype(float)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
def encode_corpus(self,data):
    # Encode each tokenized tweet into a list of integer word ids using
    # Keras's hashing-based `one_hot` (note: it hashes words into
    # [1, vocab_length), so distinct words can collide), then dump a
    # "word id" mapping line per token to a versioned vocab file.
    #
    # NOTE(review): `version` is not defined in this excerpt — presumably
    # a module-level string; verify it exists before this runs.
    # NOTE(review): this excerpt is truncated — `ss` is built but the
    # `f.write(ss)` (and any return, e.g. pad_sequences over
    # encoded_docs) lies outside the visible lines.
    encoded_docs = [one_hot(' '.join(d), self.vocab_length) for d in data]
    vocab_file = "vocab_mapping.txt"+version
    with open(vocab_file,'w') as f:
        for i in range(len(data)):
            for j in range(len(data[i])):
                # One "word id" pair per token of tweet i.
                ss = data[i][j]+" "+str(encoded_docs[i][j])+"\n"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fellow 10739 | |
patriot 7589 | |
yeah 8002 | |
take 2770 | |
gun 7944 | |
control 4475 | |
policy 6949 | |
advice 7074 | |
jean 12534 | |
company 573 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def CNN(self):
    # Build the 1-D convolutional text classifier over padded tweet
    # sequences.
    #
    # NOTE(review): this excerpt is truncated — the classification head
    # (Dense/softmax), compile step, and assignment to self.model are
    # not visible here.
    model = Sequential()
    # 30-dim learned embedding per word id; inputs are padded to
    # self.max_len tokens.
    model.add(Embedding(self.vocab_length, 30, input_length=self.max_len)) # Max Length of Tweet
    model.add(Convolution1D(64,5,activation="relu"))
    model.add(Dropout(0.5))
    model.add(Convolution1D(32,3,activation="relu"))
    model.add(Dropout(0.5))
    # NOTE(review): 'sigmoid' on a hidden conv layer is unusual given the
    # relu layers above — confirm this is intentional.
    model.add(Convolution1D(16,3,activation="sigmoid"))
    model.add(MaxPooling1D(5))
    model.add(Flatten())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def LSTM(self):
    """Build and compile the LSTM tweet classifier, storing it on ``self.model``.

    Architecture: Embedding(vocab_length, 30) -> LSTM(200) ->
    Dense(max_len, relu, L2) -> Dense(n_classes, softmax, L2), compiled
    with Adam (learning_rate=0.008) and categorical cross-entropy.
    Prints a model summary as a side effect; returns None.
    """
    model = Sequential()
    model.add(Embedding(self.vocab_length, 30, input_length=self.max_len))
    # Inside the method body, 'LSTM' resolves to the imported Keras layer,
    # not to this method (method names don't shadow globals here).
    model.add(LSTM(200))
    # Fix: 'W_regularizer' is the Keras 1 argument name; Keras 2 renamed it
    # to 'kernel_regularizer' and passing the old name raises a TypeError.
    # NOTE(review): l2(0.90) is an unusually strong penalty — confirm it
    # is intentional and not a typo for 0.090.
    model.add(Dense(self.max_len, activation='relu', kernel_regularizer=l2(0.90)))
    # Output width = number of label classes (columns of the one-hot
    # training labels).
    model.add(Dense(self.tr_labels.shape[1], activation='softmax', kernel_regularizer=l2(0.1)))
    # Fix: 'lr' is deprecated/removed in modern Keras; use 'learning_rate'.
    adam_1 = Adam(learning_rate=0.008)
    model.compile(loss='categorical_crossentropy', optimizer=adam_1, metrics=['accuracy'])
    model.summary()
    self.model = model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Export pipeline: Keras -> ONNX -> CoreML.
# Requires the third-party packages 'onnxmltools' and 'onnx_coreml';
# writes the converted model to 'hate_coreml_model.mlmodel' in the
# current working directory.
# Let the keras model be 'model'
# OPTION 1: Convert Keras model to ONNX and convert ONNX model to CoreML model
import onnxmltools
onnx_model = onnxmltools.convert_keras(model) #Keras to ONNX
from onnx_coreml import convert
mlmodel = convert(onnx_model) # ONNX to CoreML
mlmodel.save('hate_coreml_model.mlmodel')
OlderNewer