This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
folium.Choropleth( | |
geo_data=geojson_counties, | |
name='choropleth', | |
data=df_casos, | |
columns=['CCAA', 'Casos'], | |
key_on='feature.properties.texto', #'features.properties.comunidade_autonoma' | |
fill_color='YlGn', | |
fill_opacity=0.5, | |
line_opacity=0.2, | |
legend_name='Número de Casoso de la COVID-19' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar plot of the most frequent tri-grams.
# Reads ``top3_df`` (built elsewhere) and plots its "Tri-gram" column on the
# x axis against its "Freq" column on the y axis.
import seaborn as sns

sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
# Tilt the category labels by 45 degrees. The trailing semicolon keeps the
# notebook from echoing the list returned by set_xticklabels.
j.set_xticklabels(j.get_xticklabels(), rotation=45);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar plot of the most frequent bi-grams.
# Reads ``top2_df`` (built elsewhere) and plots its "Bi-gram" column on the
# x axis against its "Freq" column on the y axis.
import seaborn as sns

sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
# Tilt the category labels by 45 degrees. The trailing semicolon keeps the
# notebook from echoing the list returned by set_xticklabels.
h.set_xticklabels(h.get_xticklabels(), rotation=45);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the ``n`` most frequent tri-grams found in ``corpus``.

    Args:
        corpus: Iterable of text documents handed to ``CountVectorizer``.
        n: How many tri-grams to return. ``None`` returns every tri-gram
            kept in the vectorizer's vocabulary.

    Returns:
        A list of ``(trigram, count)`` tuples sorted by descending count.
    """
    # ``stop_words`` is read from module scope. Terms present in more than
    # 80% of the documents are ignored and the vocabulary is capped at 2000.
    vec1 = CountVectorizer(
        max_df=0.8,
        stop_words=stop_words,
        ngram_range=(3, 3),
        max_features=2000,
    )
    # Fit and transform in one pass so the corpus is only iterated once.
    bag_of_words = vec1.fit_transform(corpus)
    # Summing over the document axis gives one total per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing with ``None`` keeps the whole list.
    return words_freq[:n]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Most frequent bi-grams (two-word phrases)
def get_top_n2_words(corpus, n=None):
    """Return the ``n`` most frequent bi-grams found in ``corpus``.

    Args:
        corpus: Iterable of text documents handed to ``CountVectorizer``.
        n: How many bi-grams to return. ``None`` returns every bi-gram
            kept in the vectorizer's vocabulary.

    Returns:
        A list of ``(bigram, count)`` tuples sorted by descending count.
    """
    # ``stop_words`` is read from module scope. Terms present in more than
    # 80% of the documents are ignored and the vocabulary is capped at 20000.
    vec1 = CountVectorizer(
        max_df=0.8,
        stop_words=stop_words,
        max_features=20000,
        ngram_range=(2, 2),
    )
    # Fit and transform in one pass so the corpus is only iterated once.
    bag_of_words = vec1.fit_transform(corpus)
    # Summing over the document axis gives one total per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing with ``None`` keeps the whole list.
    return words_freq[:n]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the 20 most frequent words to a DataFrame so they can be charted.
top_words = get_top_n_words(corpus, n=20)
# Naming the columns in the constructor also works when ``top_words`` is
# empty; assigning ``.columns`` afterwards raises a length mismatch then.
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

# Bar plot of the most common words after filtering.
import seaborn as sns

sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#We create a cloud of words to get an idea of the most common | |
from os import path | |
from PIL import Image | |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
wordcloud = WordCloud( | |
background_color='white', | |
stopwords=stop_words, | |
max_words=100, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
corpus = []
for i in range(0, len(df_tve)):
    # Positional access matches the positional range above, so this also
    # works when the frame's index is not 0..len-1.
    # Lowercase first: the letter filter below lists only lowercase accented
    # letters, so accented capitals would otherwise be turned into spaces.
    text = df_tve['content'].iloc[i].lower()
    # Remove tags ("<...>") while the angle brackets are still present;
    # once punctuation is stripped this pattern can no longer match.
    text = re.sub(r"</?.*?>", " ", text)
    # Replace everything that is not a (Spanish) letter with a space.
    text = re.sub('[^a-zñáéíóúü]', ' ', text)
    # NOTE(review): the visible snippet ends here; nothing shown appends
    # ``text`` to ``corpus`` -- confirm against the full notebook cell.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Normalización: Stemming & Lematización | |
#Import NLP libraries | |
import re | |
import nltk | |
#nltk.download('stopwords') | |
from nltk.corpus import stopwords | |
from nltk.stem.porter import PorterStemmer | |
from nltk.tokenize import RegexpTokenizer | |
#nltk.download('wordnet') | |
from nltk.stem.wordnet import WordNetLemmatizer |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Trend of the mean news duration per year.
# Group ``df_tve`` by the year of its ``date`` column and average
# ``duration``; the result is a frame indexed by year with a single
# "mean" column.
df_plot_duration = df_tve.groupby(df_tve['date'].dt.year)['duration'].agg(['mean'])
plt.plot(df_plot_duration.index, df_plot_duration['mean'])
plt.ylabel('Duración media en min')
plt.xlabel('Años')
# Restrict the x axis to 2014-2019.
plt.xlim(2014, 2019)
# The trailing semicolon keeps the notebook from echoing the return value.
plt.xticks(rotation='vertical');