ABHISHEK SHARMA abhishek-shrm

## Multi-label Text Classification CNN-13.py
# One more than the number of entries in the tokenizer's word index
# (presumably so that index 0, used for padding, is counted too - confirm)
vocabulary = 1 + len(tokenizer.word_index)
print('Vocabulary Size=>', vocabulary)

## Multi-label Text Classification CNN-12.py
from keras.preprocessing.sequence import pad_sequences

# Zero-pad both sets to a common length of 100 (padding='post'),
# using the same settings for train and test
train_seq, test_seq = (
    pad_sequences(sequences, maxlen=100, padding='post')
    for sequences in (train_seq, test_seq)
)

## Multi-label Text Classification CNN-11.py
import matplotlib.pyplot as plt

# Number of whitespace-separated words in each cleaned training comment
comment_word_count = [len(comment.split()) for comment in df_train['cleaned']]

# Single-column dataframe holding those comment lengths
length_df = pd.DataFrame({'Comment Length': comment_word_count})

## Multi-label Text Classification CNN-10.py
# Turn the cleaned comments of each frame into integer sequences
# with the same tokenizer (train first, then test)
train_seq, test_seq = (
    tokenizer.texts_to_sequences(frame['cleaned'])
    for frame in (df_train, df_test)
)

## Multi-label Text Classification CNN-9.py
# Print the number of entries in the tokenizer's word index
print('Vocabulary Size=>',len(tokenizer.word_index))

## Multi-label Text Classification CNN-8.py
from keras.preprocessing.text import Tokenizer

# Instantiating Tokenizer (no arguments, so all constructor defaults apply)
tokenizer = Tokenizer()
# Creating index for words; only the cleaned training comments are passed in,
# so the index is built from df_train alone
tokenizer.fit_on_texts(df_train['cleaned'])

## Multi-label Text Classification CNN-7.py
# For working with regular expressions
import re

# Function for cleaning text
def cleaner(text):
    """Normalize a raw comment into lowercase words separated by single spaces.

    Args:
        text: Raw comment string.

    Returns:
        The lowercased text in which every run of characters outside
        ``a-z`` has been replaced by one space, with leading and trailing
        spaces removed.
    """
    # Lowercasing text
    text = text.lower()
    # Keeping only words: each run of non a-z characters becomes one space
    text = re.sub("[^a-z]+", " ", text)
    # Removing extra spaces: the substitution above already collapses inner
    # runs, so only a leading/trailing space can remain
    return text.strip()

## Multi-label Text Classification CNN-6.py
# Show five randomly sampled raw comments, numbered from 1;
# repr() keeps escape sequences such as newlines visible
sampled_comments = df_train['comment_text'].sample(5).values
for number, comment in enumerate(sampled_comments, start=1):
    print('Comment ', number, '=>', repr(comment))

## Multi-label Text Classification CNN-5.py
# For every label column, print the share of each value as a percentage
for label in [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]:
    shares = df_train[label].value_counts(normalize=True)
    print(shares * 100)

## Multi-label Text Classification CNN-4.py
# Loading Test set from test.csv, resolved relative to the current working directory
df_test=pd.read_csv('./test.csv')
# Print the shape of the loaded frame
print('Shape=>',df_test.shape)
# NOTE(review): the value of head() is discarded when this runs as a plain
# script; presumably it was the last expression of a notebook cell - confirm
df_test.head()
	# NOTE(review): this tab-indented section appears to repeat the snippets
	# listed earlier in the file. The bodies of the `for` loops and of
	# `cleaner` are nested under their headers so the structure is valid.
	vocabulary=len(tokenizer.word_index)+1
	print('Vocabulary Size=>',vocabulary)
	from keras.preprocessing.sequence import pad_sequences

	# Padding with zero
	train_seq=pad_sequences(train_seq,maxlen=100,padding='post')
	test_seq=pad_sequences(test_seq,maxlen=100,padding='post')
	import matplotlib.pyplot as plt

	comment_word_count = []
	# Populate the lists with length of comments
	for i in df_train['cleaned']:
		comment_word_count.append(len(i.split()))

	# Create a dataframe with length of comments
	length_df = pd.DataFrame({'Comment Length':comment_word_count})
	# Converting word sequence to integer sequence
	train_seq = tokenizer.texts_to_sequences(df_train['cleaned'])
	test_seq = tokenizer.texts_to_sequences(df_test['cleaned'])
	from keras.preprocessing.text import Tokenizer

	# Instantiating Tokenizer
	tokenizer = Tokenizer()
	# Creating index for words
	tokenizer.fit_on_texts(df_train['cleaned'])
	# For working with regular expressions
	import re

	# Function for cleaning text
	def cleaner(text):
		"""Return `text` lowercased, with each run of non a-z characters
		replaced by one space and leading/trailing spaces removed."""
		# Lowercasing text
		text=text.lower()
		# Keeping only words
		text=re.sub("[^a-z]+"," ",text)
		# Removing extra spaces: inner runs are already collapsed by the
		# substitution above, so only the ends need trimming
		return text.strip()
	# Printing sample comments
	for i,v in enumerate(df_train['comment_text'].sample(5).values):
		print('Comment ',i+1,'=>',repr(v))
	# Printing class distribution in percentage
	for i in ['toxic','severe_toxic','obscene','threat','insult','identity_hate']:
		print(df_train[i].value_counts(normalize=True)*100)
	# Loading Test set
	df_test=pd.read_csv('./test.csv')
	print('Shape=>',df_test.shape)
	df_test.head()