Skip to content

Instantly share code, notes, and snippets.

@brenoimatos
Last active January 7, 2021 21:35
Show Gist options
  • Save brenoimatos/7a3448854a3d59416cf17e81094a724a to your computer and use it in GitHub Desktop.
Save brenoimatos/7a3448854a3d59416cf17e81094a724a to your computer and use it in GitHub Desktop.
# Show the rows whose 'lyrics' value is missing (NaN), if any.
# isna() already returns a boolean mask, so no comparison with True is needed.
print(df_raw[df_raw['lyrics'].isna()])
# Drop the incomplete rows. NOTE: dropna() without arguments removes every row
# that has a NaN in ANY column, not only in 'lyrics'.
df_valid = df_raw.dropna()
# Sanity check: print the per-column count of missing values that remain.
print(df_valid.isna().sum())
# Text-cleaning helper applied to the lyrics column of the dataframe.

# Patterns are compiled once at import time instead of on every call, and are
# written as raw strings so that `\[`, `\(` and `\s` are not treated as
# (invalid) string escape sequences.
# Matches every ASCII punctuation character except the apostrophe.
_PUNCTUATION_RE = re.compile(
    '[%s]' % re.escape(string.punctuation.replace("'", ''))
)
# Non-greedy, at least one character, and `.` does not match a newline, so a
# bracketed span is only removed when it opens and closes on the same line.
_SQUARE_BRACKETS_RE = re.compile(r'\[(.+?)\]')
_PARENTHESES_RE = re.compile(r'\((.+?)\)')
_WHITESPACE_RE = re.compile(r'\s+')


def cleaning_text(text):
    """Return a normalized copy of *text*.

    Steps, in order:
      1. Lowercase the text and delete the typographic apostrophe (U+2019).
      2. Pass the text through ``unidecode.unidecode``.
      3. Remove non-empty single-line spans enclosed in ``[...]`` or ``(...)``.
      4. Replace every ASCII punctuation character except ``'`` with a space.
      5. Collapse runs of whitespace into a single space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower().replace('’', '')
    text = unidecode.unidecode(text)
    text = _SQUARE_BRACKETS_RE.sub('', text)  # drop text between []
    text = _PARENTHESES_RE.sub('', text)  # drop text between ()
    text = _PUNCTUATION_RE.sub(' ', text)  # punctuation becomes a space
    # Collapse any run of whitespace (including newlines) into one space.
    return _WHITESPACE_RE.sub(' ', text).strip()
# Group the lyrics by artist and apply the cleaning function.
# Songs are joined with a newline instead of the empty string so that the last
# word of one song does not fuse with the first word of the next one;
# cleaning_text collapses all whitespace afterwards, so the separator leaves
# no trace in 'lyrics_clean' beyond keeping the words apart.
df = df_valid.groupby('artist')['lyrics'].agg('\n'.join).reset_index()
# Pass the function directly; wrapping it in a lambda adds nothing.
df['lyrics_clean'] = df['lyrics'].apply(cleaning_text)
# Preview the first three artists (displayed when run as a notebook cell).
df.head(3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment