Jose Luis Fernández Nuevo JLFDataScience

  • FGCSIC
@JLFDataScience
JLFDataScience / folium_choroplet_map.py
Last active May 19, 2020 10:11
Generate choropleth map in Folium
folium.Choropleth(
    geo_data=geojson_counties,
    name='choropleth',
    data=df_casos,
    columns=['CCAA', 'Casos'],
    key_on='feature.properties.texto',  # alternative key: 'feature.properties.comunidade_autonoma'
    fill_color='YlGn',
    fill_opacity=0.5,
    line_opacity=0.2,
    legend_name='Number of COVID-19 cases'
).add_to(m)  # the gist preview is truncated; m is the target folium.Map (see sketch below)
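For context, a minimal sketch of the setup this call assumes; the map object m, the centre coordinates, and the output filename are illustrative, not part of the gist:

import folium

#Hypothetical base map centred on Spain (coordinates assumed)
m = folium.Map(location=[40.4, -3.7], zoom_start=6)

#...the folium.Choropleth(...).add_to(m) call above goes here...

#Allow the 'choropleth' layer to be toggled by name, then save the result
folium.LayerControl().add_to(m)
m.save('covid_map.html')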
#Barplot of most frequent Tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
#Barplot of most frequent Bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df);
h.set_xticklabels(h.get_xticklabels(), rotation=45);
#Most frequently occurring Tri-grams
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words, ngram_range=(3, 3),
                           max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=20000,
                           ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
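For context, a sketch of how the top2_df and top3_df consumed by the barplots above can be built from these helpers; the n=20 cut-off mirrors the unigram block below and is an assumption:

import pandas as pd

#Build the DataFrames the Bi-/Tri-gram barplots above expect
top3_df = pd.DataFrame(get_top_n3_words(corpus, n=20), columns=["Tri-gram", "Freq"])
top2_df = pd.DataFrame(get_top_n2_words(corpus, n=20), columns=["Bi-gram", "Freq"])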
#Convert the most frequent words to a DataFrame to generate a bar chart
top_words = get_top_n_words(corpus, n=20)  # unigram helper, sketched below
top_df = pd.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]
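The unigram helper get_top_n_words called above is not shown in this listing; a sketch consistent with the Bi-/Tri-gram versions, assumed rather than the author's exact code:

#Most frequently occurring single words (assumed reconstruction)
def get_top_n_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words,
                           ngram_range=(1, 1)).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]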
#Barplot of the most common words after filtering
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
@JLFDataScience
JLFDataScience / World_Cloud.py
Created March 11, 2020 19:45
We create a word cloud to get an idea of the most common words
#We create a word cloud to get an idea of the most common words
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100
).generate(' '.join(corpus))  # the preview is truncated; feeding the joined corpus is assumed
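A minimal sketch of rendering the cloud with the matplotlib import above; the figure size is an assumption:

#Display the generated word cloud
plt.figure(figsize=(13, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()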
@JLFDataScience
JLFDataScience / Create_corpus.py
Created March 11, 2020 19:41
Create corpus and lemmatization
corpus = []
for i in range(0, len(df_tve)):
    #Remove punctuation, keeping Spanish accented characters
    text = re.sub('[^a-zA-ZÑñáéíóú]', ' ', df_tve['content'][i])
    #Convert to lowercase
    text = text.lower()
    #Remove HTML tags (">", "<")
    text = re.sub("</?.*?>", " <> ", text)
    #The preview is truncated here; see the lemmatization sketch below
@JLFDataScience
JLFDataScience / Create_stopwords_self.py
Created March 11, 2020 19:36
Add our own words to the stopword list
#Normalization: Stemming & Lemmatization
#Import NLP libraries
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
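The preview ends before the custom stop words the title describes; a sketch of the usual pattern, where the Spanish base list and the added words are assumptions, not the author's actual list:

#Start from NLTK's Spanish stop words (assumed, given the Spanish-language corpus)
stop_words = set(stopwords.words('spanish'))
#Extend with our own domain-specific words (placeholders)
new_words = ['palabra1', 'palabra2']
stop_words = stop_words.union(new_words)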
@JLFDataScience
JLFDataScience / trend_duration_news.py
Created March 11, 2020 19:21
Trend in the duration of news items about aging
#If we look at the trend in the duration of news items about aging
df_plot_duration = df_tve.groupby(df_tve['date'].dt.year)['duration'].agg(['mean'])
plt.plot(df_plot_duration.index, df_plot_duration['mean']);
plt.ylabel('Average duration in min')
plt.xlabel('Year')
plt.xlim(2014, 2019)
plt.xticks(rotation='vertical');