conormm / read_stats_data.py
Sample output: a pandas DataFrame of per-article readability stats (print truncated).

stat                         article 0                                           article 1
text                         four us states are suing the trump administrat...  sean spicer, the press secretary for the trump...
title                        Four states sue Trump administration over 'un-...  Sean Spicer holds first Trump administration p...
paper                        https://www.theguardian.com                         https://www.theguardian.com
automated_readability_index  58.698156                                           18.558549
coleman_liau_index           15.299608                                           15.390357
flesch_kincaid_grade_level   46.986883                                           15.867059
flesch_readability_ease      -45.888247                                          31.768431
gunning_fog_index            51.064935                                           17.921569
n_chars                      4108                                                466
n_polysyllable_words         136                                                 14
n_sents                      7                                                   3
n_syllables                  1284                                                147
n_unique_words               365                                                 66
n_words                      770                                                 (truncated)
smog_index                   28.309659                                           (truncated)
import pandas as pd
import textacy as tcy

# get_articles, clean_text and preprocess_articles are helper
# functions defined elsewhere in the original post
articles = get_articles(the_guardian, breitbart, title_topic="Trump")
articles["text"] = articles.text.map(clean_text)
articles["text"] = preprocess_articles(articles.text)

def get_readability_stats(parsed_articles):
    """Compute textacy readability stats for each parsed article."""
    stats_list = []
    for ix, article in enumerate(parsed_articles):
        doc = tcy.Doc(article)
        readability_stats = tcy.text_stats.readability_stats(doc)
        stats_list.append(readability_stats)
    return pd.DataFrame(stats_list)
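A minimal usage sketch, assuming articles.text holds the cleaned article strings and that the title and paper columns match the output table above:

stats = get_readability_stats(articles.text)
stats["text"] = articles.text.values
stats["title"] = articles.title.values   # assumed column names,
stats["paper"] = articles.paper.values   # matching the table above
print(stats.head())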
library(readxl)
library(dplyr)   # for %>% and the verbs used below
library(janitor)
library(ggraph)
library(igraph)

# Eurovision voting data, 1975-2016
df <- readxl::read_excel("/eurovision_song_contest_1975_2016.xlsx") %>%
  clean_names()

# reorder: put columns 5 and 6 first
df <- df[, c(5, 6, 1:4, 7)]
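The ggraph/igraph imports suggest a voting-network plot; here is a minimal sketch, assuming clean_names() yields from_country, to_country and points columns:

edges <- df %>%
  group_by(from_country, to_country) %>%               # assumed column names
  summarise(total_points = sum(points, na.rm = TRUE))

g <- graph_from_data_frame(edges)

ggraph(g, layout = "fr") +
  geom_edge_link(aes(alpha = total_points)) +
  geom_node_text(aes(label = name), size = 2) +
  theme_void()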
conormm / read_data.py: Extracting word vectors from spaCy
import numpy as np
import spacy
from sklearn.decomposition import PCA

nlp = spacy.load("en")

# a mix of animal words plus some context words to compare against
animals = "dog cat hamster lion tiger elephant cheetah monkey gorilla antelope rabbit mouse rat zoo home pet fluffy wild domesticated"
animal_tokens = nlp(animals)

# stack each token's embedding into an (n_tokens, vector_dim) matrix
animal_vectors = np.vstack([word.vector for word in animal_tokens if word.has_vector])
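PCA is imported above but never used in the snippet; a minimal sketch of the likely continuation, assuming the vectors are meant to be projected to 2-D for inspection:

# project the word vectors down to two dimensions
pca = PCA(n_components=2)
animal_vecs_2d = pca.fit_transform(animal_vectors)

# print each word's 2-D coordinates
words = [w.text for w in animal_tokens if w.has_vector]
for word, (x, y) in zip(words, animal_vecs_2d):
    print(f"{word}: ({x:.2f}, {y:.2f})")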
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)
library(broom)
library(lubridate)

# available at https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data
temp_loc <- "/Users/conormcdonald/Downloads/GlobalLandTemperatures/GlobalLandTemperaturesByCountry.csv"

# read the country-level temperature series and standardise the column names
temp <- read_csv(temp_loc) %>% clean_names()
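A minimal sketch of what the loaded libraries support, assuming the Kaggle file's standard columns (dt, average_temperature, country after clean_names()):

temp %>%
  mutate(year = year(as_date(dt))) %>%
  filter(country == "Ireland") %>%       # any country in the file works here
  group_by(year) %>%
  summarise(avg_temp = mean(average_temperature, na.rm = TRUE)) %>%
  ggplot(aes(year, avg_temp)) +
  geom_line(alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_minimal()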
# summary statistics for imdb_score, reshaped long for display
imdb_score_summary <- df %>%
  summarise_each(funs(min, max, mean, median, sd), imdb_score) %>%
  gather(stat, value)

# histogram of scores, with the summary table drawn inside the plot
df %>%
  ggplot(aes(imdb_score)) +
  geom_histogram(bins = 50, colour = "black", fill = "red", alpha = 0.4) +
  theme_minimal() +
  annotation_custom(tableGrob(imdb_score_summary, rows = NULL),
                    xmin = 2.5)
library(tidyverse)
library(here)
library(broom)
library(corrr)
library(forcats)
library(stringr)
library(lubridate)
library(gridExtra)

df <- read_csv("movie_metadata.csv")

# count missing values in each column
df %>% map_df(~sum(is.na(.))) %>% glimpse()
Observations: 1
Variables: 28
$ color <int> 19
$ director_name <int> 104
$ num_critic_for_reviews <int> 50
$ duration <int> 15
$ director_facebook_likes <int> 104
$ actor_3_facebook_likes <int> 23
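One possible follow-up, given the missing-value counts above (an assumption; the printed output is truncated at six of the 28 columns):

# drop rows missing the fields used in the plots below
df <- df %>% drop_na(imdb_score, title_year)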
# num_df holds the numeric columns of df (built earlier in the original post);
# replace the raw release year with the film's age
num_df <- num_df %>%
  mutate(years_since_release = 2017 - title_year) %>%
  select(-title_year)
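corrr is loaded above; a minimal sketch of how num_df might feed it, assuming num_df still contains imdb_score:

num_df %>%
  correlate() %>%          # pairwise correlations between numeric features
  focus(imdb_score) %>%    # keep only each feature's correlation with imdb_score
  arrange(desc(imdb_score))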
# mean IMDb rating by release year, with a 95% interval
df %>%
  group_by(title_year) %>%
  summarise(mean_rating = mean(imdb_score),
            upper_rating = quantile(imdb_score, 0.975),
            lower_rating = quantile(imdb_score, 0.025)) %>%
  ggplot(aes(title_year, mean_rating)) +
  geom_line(colour = "dodgerblue") +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red", alpha = 0.6, se = FALSE) +
  geom_errorbar(aes(ymin = lower_rating, ymax = upper_rating))
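broom is also loaded; a hedged sketch of inspecting the linear trend that geom_smooth(method = "lm") draws:

ratings_by_year <- df %>%
  group_by(title_year) %>%
  summarise(mean_rating = mean(imdb_score))

# slope estimate: average change in mean rating per release year
lm(mean_rating ~ title_year, data = ratings_by_year) %>%
  tidy()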