brenoimatos/lyrics_nlp_cleaning.py

## lyrics_nlp_cleaning.py
# Show the rows whose 'lyrics' value is missing (NaN).
print(df_raw.loc[df_raw['lyrics'].isna()])

# Drop every row that contains a NaN in any column, then print the
# per-column NaN counts of what is left.
df_valid = df_raw.dropna(axis=0, how='any')
print(df_valid.isna().sum())

# Function used to clean the text column of the dataframe.
def cleaning_text(text):
    """Normalise a raw lyrics string.

    Steps, in order:
      1. lowercase the text and delete typographic apostrophes (U+2019);
      2. pass the text through ``unidecode.unidecode``;
      3. remove ``[...]`` and ``(...)`` spans together with their contents
         (a span must be non-empty and must not cross a line break);
      4. replace every ``string.punctuation`` character except the straight
         apostrophe with a space;
      5. collapse each whitespace run to one space and trim both ends.

    Args:
        text: Raw lyrics as a ``str``.

    Returns:
        The cleaned string.
    """
    # Character class of all punctuation except "'"; straight apostrophes
    # therefore survive in the output.
    # NOTE(review): curly apostrophes are deleted below while straight ones
    # are kept, so "don’t" becomes "dont" but "don't" stays as is -- confirm
    # that this asymmetry is intended.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation.replace("'", '')))

    text = text.lower().replace('’', '')
    text = unidecode.unidecode(text)
    # Raw strings keep the backslashes literal; the non-raw forms were
    # invalid escape sequences (SyntaxWarning on Python 3.12+).
    text = re.sub(r'\[(.+?)\]', '', text)  # drop text between []
    text = re.sub(r'\((.+?)\)', '', text)  # drop text between ()
    text = punctuation.sub(' ', text)  # punctuation -> space
    text = re.sub(r'\s+', ' ', text).strip()  # squeeze whitespace runs

    return text

# Group the lyrics by artist and apply the cleaning function.
# Songs are joined with a newline rather than '': with an empty separator the
# last word of one song is glued to the first word of the next, producing a
# bogus token. cleaning_text collapses the newline to a single space, and
# because its bracket patterns do not cross line breaks, a stray '[' or '('
# in one song cannot swallow text from the following one.
df = df_valid.groupby('artist')['lyrics'].agg('\n'.join).reset_index()
df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
# Preview of the first three rows (only displayed in an interactive session).
df.head(3)
	# Show the rows whose 'lyrics' value is missing (NaN).
	print(df_raw.loc[df_raw['lyrics'].isna()])

	# Drop every row that contains a NaN in any column, then print the
	# per-column NaN counts of what is left.
	df_valid = df_raw.dropna(axis=0, how='any')
	print(df_valid.isna().sum())

	# Function used to clean the text column of the dataframe.
	def cleaning_text(text):
		"""Normalise a raw lyrics string.

		Steps, in order:
		  1. lowercase the text and delete typographic apostrophes (U+2019);
		  2. pass the text through ``unidecode.unidecode``;
		  3. remove ``[...]`` and ``(...)`` spans together with their
		     contents (a span must be non-empty and must not cross a line
		     break);
		  4. replace every ``string.punctuation`` character except the
		     straight apostrophe with a space;
		  5. collapse each whitespace run to one space and trim both ends.

		Args:
		    text: Raw lyrics as a ``str``.

		Returns:
		    The cleaned string.
		"""
		# Character class of all punctuation except "'"; straight apostrophes
		# therefore survive in the output.
		punctuation = re.compile('[%s]' % re.escape(string.punctuation.replace("'", '')))

		text = text.lower().replace('’', '')
		text = unidecode.unidecode(text)
		# Raw strings keep the backslashes literal; the non-raw forms were
		# invalid escape sequences (SyntaxWarning on Python 3.12+).
		text = re.sub(r'\[(.+?)\]', '', text)  # drop text between []
		text = re.sub(r'\((.+?)\)', '', text)  # drop text between ()
		text = punctuation.sub(' ', text)  # punctuation -> space
		text = re.sub(r'\s+', ' ', text).strip()  # squeeze whitespace runs

		return text

	# Group the lyrics by artist and apply the cleaning function.
	# Songs are joined with a newline rather than '': with an empty separator
	# the last word of one song is glued to the first word of the next,
	# producing a bogus token.
	df = df_valid.groupby('artist')['lyrics'].agg('\n'.join).reset_index()
	df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
	# Preview of the first three rows (only displayed in an interactive session).
	df.head(3)