Skip to content

Instantly share code, notes, and snippets.

View abhishek-shrm's full-sized avatar

ABHISHEK SHARMA abhishek-shrm

  • ZS Associates
  • New Delhi, India
View GitHub Profile
# Vocabulary size is one more than the number of words the tokenizer indexed.
# NOTE(review): the extra slot presumably accounts for index 0, which the
# padding step fills with -- confirm against the Keras Tokenizer docs.
vocabulary = 1 + len(tokenizer.word_index)
print('Vocabulary Size=>', vocabulary)
from keras.preprocessing.sequence import pad_sequences
# Bring the train and test sequences to a common length of 100, padding on
# the 'post' side. Both splits go through the same call with the same
# settings; the train split is processed first, then the test split.
train_seq, test_seq = (
    pad_sequences(sequences, maxlen=100, padding='post')
    for sequences in (train_seq, test_seq)
)
import matplotlib.pyplot as plt
# Number of whitespace-separated words in each cleaned training comment
comment_word_count = [len(comment.split()) for comment in df_train['cleaned']]
# Create a dataframe with length of comments (single column 'Comment Length')
length_df = pd.DataFrame({'Comment Length': comment_word_count})
# Encode the cleaned comments of both splits as integer sequences using the
# same tokenizer (train first, then test).
train_seq, test_seq = map(
    tokenizer.texts_to_sequences,
    (df_train['cleaned'], df_test['cleaned']),
)
# Reports the raw number of indexed words (no +1 is applied here).
print('Vocabulary Size=>', len(tokenizer.word_index))
from keras.preprocessing.text import Tokenizer
# Instantiating Tokenizer with its default settings (no arguments are passed)
tokenizer = Tokenizer()
# Creating index for words from the cleaned training comments only;
# df_test is not passed to fit_on_texts here
tokenizer.fit_on_texts(df_train['cleaned'])
# For working with regular expressions
import re
# Function for cleaning text
def cleaner(text):
    """Normalize a raw comment into lowercase words separated by single spaces.

    Args:
        text: Raw comment string.

    Returns:
        The lowercased text in which every run of characters outside
        ``a-z`` has been replaced by one space, with surrounding
        whitespace removed.
    """
    # Lowercasing text
    text = text.lower()
    # Keeping only words: each run of non a-z characters becomes one space
    text = re.sub(r"[^a-z]+", " ", text)
    # Removing extra spaces.
    # NOTE(review): the statement for this step was missing from the visible
    # source; collapsing and trimming whitespace is a reconstruction --
    # confirm against the complete original.
    text = " ".join(text.split())
    return text
# Printing five randomly sampled raw comments, numbered from 1.
# repr() is used so escape sequences such as newlines stay visible.
for number, comment in enumerate(df_train['comment_text'].sample(5).values, start=1):
    print('Comment ', number, '=>', repr(comment))
# Printing class distribution in percentage, one label column at a time
# (value_counts(normalize=True) gives fractions; * 100 turns them into percent)
for label in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(df_train[label].value_counts(normalize=True) * 100)
# Read the test split from test.csv in the current working directory
df_test = pd.read_csv('./test.csv')
# Report its (rows, columns) shape
print('Shape=>', df_test.shape)
# Bare expression: its result is only shown when this is the last statement
# of a notebook cell; in a plain script it has no visible effect.
df_test.head()