# amankharwal/language classification Secret

## language classification
# Hand-crafted text features for language classification.
#
# Reads the string column df['text'] and adds one numeric column per feature
# to `df` in place.  `df`, `permutations` and `collections` are expected to be
# in scope already.

# Define a list of commonly found punctuations
punc = ('!', ",", "\'", ";", "\"", ".", "-", "?")
vowels = ['a', 'e', 'i', 'o', 'u']
# Define a list of double consecutive vowels which are typically found in Dutch and Afrikaans languages
same_consecutive_vowels = ['aa', 'ee', 'ii', 'oo', 'uu']
# Every ordered pair of two *different* vowels ('ae', 'ea', 'ai', ...)
consecutive_vowels = [''.join(p) for p in permutations(vowels, 2)]
dutch_combos = ['ij']


def _words_containing(text, fragments):
    """Count the whitespace-separated words of *text* that contain any of *fragments*.

    Matching is a case-sensitive substring test; a word is counted once no
    matter how many fragments it contains.
    """
    return sum(any(frag in word for frag in fragments) for word in text.split())


def _is_ascii(text):
    """Return 1.0 if *text* consists only of ASCII characters, else 0.0."""
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        return 0.0
    return 1.0


# Create a pre-defined set of features based on the "text" column in order to allow us to characterize the string
_text = df['text']

# --- size ---------------------------------------------------------------
df['word_count'] = _text.apply(lambda x: len(x.split()))
# Only the space character is stripped; tabs and newlines still count.
df['character_count'] = _text.apply(lambda x: len(x.replace(" ", "")))
# The +1 keeps the denominator non-zero for empty strings.
df['word_density'] = df['word_count'] / (df['character_count'] + 1)

# --- individual characters ------------------------------------------------
df['punc_count'] = _text.apply(lambda x: sum(1 for ch in x if ch in punc))
df['v_char_count'] = _text.apply(lambda x: sum(1 for ch in x if ch.casefold() == 'v'))
df['w_char_count'] = _text.apply(lambda x: sum(1 for ch in x if ch.casefold() == 'w'))

# --- vowel / digraph patterns (all are counts of *words*, not occurrences) --
df['ij_char_count'] = _text.apply(lambda x: _words_containing(x, dutch_combos))
df['num_double_consec_vowels'] = _text.apply(lambda x: _words_containing(x, same_consecutive_vowels))
df['num_consec_vowels'] = _text.apply(lambda x: _words_containing(x, consecutive_vowels))
# Despite the name: number of words holding at least one lowercase vowel.
df['num_vowels'] = _text.apply(lambda x: _words_containing(x, vowels))
# NaN for rows without any word (0 / 0).
df['vowel_density'] = df['num_vowels'] / df['word_count']

# --- capitalisation -------------------------------------------------------
df['capitals'] = _text.apply(lambda x: sum(1 for ch in x if ch.isupper()))
# Column-wise division: a row with character_count == 0 becomes NaN instead of
# raising ZeroDivisionError, like the other ratio features in this script.
df['caps_vs_length'] = df['capitals'] / df['character_count']

# --- punctuation ------------------------------------------------------------
df['num_exclamation_marks'] = _text.apply(lambda x: x.count('!'))
df['num_question_marks'] = _text.apply(lambda x: x.count('?'))
# Every entry of `punc` is a distinct single character, so this equals 'punc_count'.
df['num_punctuation'] = _text.apply(lambda x: sum(x.count(mark) for mark in punc))

# --- vocabulary -------------------------------------------------------------
df['num_unique_words'] = _text.apply(lambda x: len(set(x.split())))
# Number of distinct words that occur more than once.
df['num_repeated_words'] = _text.apply(
    lambda x: sum(1 for times in collections.Counter(x.split()).values() if times > 1)
)
# NaN for rows without any word (0 / 0).
df['words_vs_unique'] = df['num_unique_words'] / df['word_count']

# --- encoding ---------------------------------------------------------------
# 1.0 when the text is pure ASCII, 0.0 otherwise.  Assigned as a whole column:
# element-wise writes through df['encode_ascii'].iloc[i] are chained
# assignment, which pandas does not guarantee to propagate back to `df`.
df['encode_ascii'] = _text.apply(_is_ascii)