Skip to content

Instantly share code, notes, and snippets.

View brenoimatos's full-sized avatar

Breno Matos brenoimatos

View GitHub Profile
import scrapy
class LyricsSpider(scrapy.Spider):
    """Spider settings for artist pages on www.letras.mus.br.

    Class attributes:
        name: the spider's name, 'rap'.
        artists: artist slugs obtained by splitting a space-separated string.
        urls: one artist-page URL per distinct slug, in first-seen order.

    NOTE(review): no request or parse callbacks are visible in this excerpt;
    presumably they are defined elsewhere - confirm.
    """

    name = 'rap'
    urls = []
    artists = '3030 1kilo adl-mcs afro-x azzy baco-exu-do-blues bin bk black-alien bnegao c4bal cacife-clandestino cartel-mcs choice chris cone-crew-diretoria coruja-bc1 costa-gold criolo cynthia-luz dalsin delacruz de-leve dfideliz diomedes-chinaski djonga don-l drik-barbosa dudu-mc emicida fabio-brazza faccao-central felp22 flora-matos froid gaab gabriel-pensador gloria-groove haikaiss hungria-hip-hop jaya-luuck je-santiago kamau karol-conka kayua kiaz l7nnon luccas-carlos makalister-renton mano-brown marcelo-d2 matue mc-hariel-sp mc-marechal mc-orochi mr-thug mv-bill nabrisa-tonett negra-li nill nill ogi oriente pele-milflows projota quinto-andar racionais-mcs rael rappin-hood rashid ret rincon-sapiencia sabotage sant shawlin sidoka slim-rimografia speed-freaks tulio-dek ucl xama'.split()
    # The slug string lists 'nill' twice. dict.fromkeys drops repeated slugs
    # while keeping their original order, so each artist URL appears once.
    for artist in dict.fromkeys(artists):
        urls.append(f'https://www.letras.mus.br/{artist}/')
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
import unidecode
import re
import string
import matplotlib.ticker as mtick
import os
# Show the rows whose 'lyrics' value is missing (NaN)
print(df_raw.loc[df_raw['lyrics'].isna()])
# Drop every row that contains a NaN, then print the remaining NaN count
# per column
df_valid = df_raw.dropna()
print(df_valid.isna().sum())
# Criando a função para limpar o dataframe
def cleaning_text(text):
# Split each 'lyrics_clean' string on whitespace into a list of tokens
df['tokens'] = df['lyrics_clean'].str.split()
# Make 'artist' the index, modifying the frame in place
df.set_index('artist', inplace=True)
# Download NLTK's stopword corpus and load its Portuguese word list
nltk.download('stopwords')
stopwords_nltk = nltk.corpus.stopwords.words('portuguese')
# Extra words to treat as stopwords in addition to the NLTK list
update = [
    "tá", 'pra', 'tô', 'cê', 'pro', 'então', "meu", "em",
    "você", "de", "ao", "os", 'vou', 'vai', 'vem', 'mim',
    'uns', 'sei', 'quero', 'ser', 'ver', 'aqui', 'faz',
]
# Combine both lists: NLTK entries first, then the extra words
stopwords_raw = list(stopwords_nltk)
stopwords_raw.extend(update)
# Per-row vocabulary statistics derived from the 'tokens' column.
# 'Unique' is the number of distinct tokens; 'Total' is the number of tokens.
df['Unique'] = df['tokens'].apply(lambda x: len(set(x)))
df['Total'] = df['tokens'].apply(len)
# 'Relative' is the distinct-to-total ratio. Dividing the two columns, rather
# than evaluating len(set(x))/len(x) per row, yields NaN for a row with an
# empty token list instead of raising ZeroDivisionError, and reuses the
# counts computed above.
df['Relative'] = df['Unique'] / df['Total']
# Path of the current working directory
path = os.getcwd()
# Name of the folder where the PNG files are meant to be saved
# NOTE(review): no save step is visible in this excerpt - confirm.
wordcloud_folder = 'wordcloud'
# Generate one word cloud per index entry from that row's 'wordcloud' text
for artist in df.index:
    wordcloud = WordCloud(
        background_color="black",
        max_words=500,
        width=1600,
        height=800,
    ).generate(df.loc[artist]['wordcloud'])
# adjust_text function imported from https://github.com/Phlya/adjustText/blob/master/adjustText/__init__.py
# 'Unique' goes on the x axis and 'Relative' on the y axis
x, y = df['Unique'], df['Relative']
# Index values; presumably used to label the points - confirm
annot = df.index
fig, ax = plt.subplots(figsize=(12, 15))
ax.scatter(x, y)
# Starts empty; nothing in this excerpt fills it
texts = []