Jose Luis Fernández Nuevo JLFDataScience

  • FGCSIC
@JLFDataScience
JLFDataScience / folium_choroplet_map.py
Last active May 19, 2020 10:11
Generate choropleth map in Folium
folium.Choropleth(
    geo_data=geojson_counties,
    name='choropleth',
    data=df_casos,
    columns=['CCAA', 'Casos'],
    key_on='feature.properties.texto',  # alternative key: 'feature.properties.comunidade_autonoma'
    fill_color='YlGn',
    fill_opacity=0.5,
    line_opacity=0.2,
    legend_name='Number of COVID-19 cases'
).add_to(m)  # the gist preview is truncated; m is the target folium.Map (see sketch below)
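For context, a minimal sketch of the setup this call assumes; the map object m, the centre coordinates, and the output filename are illustrative, not part of the gist:

import folium

#Hypothetical base map centred on Spain (coordinates assumed)
m = folium.Map(location=[40.4, -3.7], zoom_start=6)

#...the folium.Choropleth(...).add_to(m) call above goes here...

#Allow the 'choropleth' layer to be toggled by name, then save the result
folium.LayerControl().add_to(m)
m.save('covid_map.html')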
#Barplot of most frequent Tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45);
#Barplot of most frequent Bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df);
h.set_xticklabels(h.get_xticklabels(), rotation=45);
#Most frequently occurring Tri-grams
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words, ngram_range=(3, 3),
                           max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=20000,
                           ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
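For context, a sketch of how the top2_df and top3_df consumed by the barplots above can be built from these helpers; the n=20 cut-off mirrors the unigram block below and is an assumption:

import pandas as pd

#Build the DataFrames the Bi-/Tri-gram barplots above expect
top3_df = pd.DataFrame(get_top_n3_words(corpus, n=20), columns=["Tri-gram", "Freq"])
top2_df = pd.DataFrame(get_top_n2_words(corpus, n=20), columns=["Bi-gram", "Freq"])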
#Convert the most frequent words to a DataFrame to generate a bar chart
top_words = get_top_n_words(corpus, n=20)  # unigram helper, sketched below
top_df = pd.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]
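The unigram helper get_top_n_words called above is not shown in this listing; a sketch consistent with the Bi-/Tri-gram versions, assumed rather than the author's exact code:

#Most frequently occurring single words (assumed reconstruction)
def get_top_n_words(corpus, n=None):
    vec1 = CountVectorizer(max_df=0.8, stop_words=stop_words,
                           ngram_range=(1, 1)).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in
                  vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]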
#Barplot of the most common words after filtering
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
@JLFDataScience
JLFDataScience / World_Cloud.py
Created March 11, 2020 19:45
We create a word cloud to get an idea of the most common words
#We create a word cloud to get an idea of the most common words
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100
).generate(' '.join(corpus))  # the preview is truncated; feeding the joined corpus is assumed
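A minimal sketch of rendering the cloud with the matplotlib import above; the figure size is an assumption:

#Display the generated word cloud
plt.figure(figsize=(13, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()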
@JLFDataScience
JLFDataScience / Create_corpus.py
Created March 11, 2020 19:41
Create corpus and lemmatization
corpus = []
for i in range(0, len(df_tve)):
    #Remove punctuation, keeping Spanish accented characters
    text = re.sub('[^a-zA-ZÑñáéíóú]', ' ', df_tve['content'][i])
    #Convert to lowercase
    text = text.lower()
    #Remove HTML tags (">", "<")
    text = re.sub("</?.*?>", " <> ", text)
    #The preview is truncated here; see the lemmatization sketch below
@JLFDataScience
JLFDataScience / Create_stopwords_self.py
Created March 11, 2020 19:36
Add our own words to the stopword list
#Normalization: Stemming & Lemmatization
#Import NLP libraries
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
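The preview ends before the custom stop words the title describes; a sketch of the usual pattern, where the Spanish base list and the added words are assumptions, not the author's actual list:

#Start from NLTK's Spanish stop words (assumed, given the Spanish-language corpus)
stop_words = set(stopwords.words('spanish'))
#Extend with our own domain-specific words (placeholders)
new_words = ['palabra1', 'palabra2']
stop_words = stop_words.union(new_words)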
@JLFDataScience
JLFDataScience / trend_duration_news.py
Created March 11, 2020 19:21
Trend in the duration of news items about aging
#If we look at the trend in the duration of news items about aging
df_plot_duration = df_tve.groupby(df_tve['date'].dt.year)['duration'].agg(['mean'])
plt.plot(df_plot_duration.index, df_plot_duration['mean']);
plt.ylabel('Average duration in min')
plt.xlabel('Year')
plt.xlim(2014, 2019)
plt.xticks(rotation='vertical');