conormm / read_stats_data.py
Sample output: a pandas DataFrame of per-article readability stats (print truncated).

stat                         article 0                                           article 1
text                         four us states are suing the trump administrat...  sean spicer, the press secretary for the trump...
title                        Four states sue Trump administration over 'un-...  Sean Spicer holds first Trump administration p...
paper                        https://www.theguardian.com                         https://www.theguardian.com
automated_readability_index  58.698156                                           18.558549
coleman_liau_index           15.299608                                           15.390357
flesch_kincaid_grade_level   46.986883                                           15.867059
flesch_readability_ease      -45.888247                                          31.768431
gunning_fog_index            51.064935                                           17.921569
n_chars                      4108                                                466
n_polysyllable_words         136                                                 14
n_sents                      7                                                   3
n_syllables                  1284                                                147
n_unique_words               365                                                 66
n_words                      770                                                 (truncated)
smog_index                   28.309659                                           (truncated)
import pandas as pd
import textacy as tcy

# get_articles, clean_text and preprocess_articles are helper
# functions defined elsewhere in the original post
articles = get_articles(the_guardian, breitbart, title_topic="Trump")
articles["text"] = articles.text.map(clean_text)
articles["text"] = preprocess_articles(articles.text)

def get_readability_stats(parsed_articles):
    """Compute textacy readability stats for each parsed article."""
    stats_list = []
    for ix, article in enumerate(parsed_articles):
        doc = tcy.Doc(article)
        readability_stats = tcy.text_stats.readability_stats(doc)
        stats_list.append(readability_stats)
    return pd.DataFrame(stats_list)
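A minimal usage sketch, assuming articles.text holds the cleaned article strings and that the title and paper columns match the output table above:

stats = get_readability_stats(articles.text)
stats["text"] = articles.text.values
stats["title"] = articles.title.values   # assumed column names,
stats["paper"] = articles.paper.values   # matching the table above
print(stats.head())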
library(readxl)
library(dplyr)   # for %>% and the verbs used below
library(janitor)
library(ggraph)
library(igraph)

# Eurovision voting data, 1975-2016
df <- readxl::read_excel("/eurovision_song_contest_1975_2016.xlsx") %>%
  clean_names()

# reorder: put columns 5 and 6 first
df <- df[, c(5, 6, 1:4, 7)]
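The ggraph/igraph imports suggest a voting-network plot; here is a minimal sketch, assuming clean_names() yields from_country, to_country and points columns:

edges <- df %>%
  group_by(from_country, to_country) %>%               # assumed column names
  summarise(total_points = sum(points, na.rm = TRUE))

g <- graph_from_data_frame(edges)

ggraph(g, layout = "fr") +
  geom_edge_link(aes(alpha = total_points)) +
  geom_node_text(aes(label = name), size = 2) +
  theme_void()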
conormm / read_data.py: Extracting word vectors from spaCy
import numpy as np
import spacy
from sklearn.decomposition import PCA

nlp = spacy.load("en")

# a mix of animal words plus some context words to compare against
animals = "dog cat hamster lion tiger elephant cheetah monkey gorilla antelope rabbit mouse rat zoo home pet fluffy wild domesticated"
animal_tokens = nlp(animals)

# stack each token's embedding into an (n_tokens, vector_dim) matrix
animal_vectors = np.vstack([word.vector for word in animal_tokens if word.has_vector])
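PCA is imported above but never used in the snippet; a minimal sketch of the likely continuation, assuming the vectors are meant to be projected to 2-D for inspection:

# project the word vectors down to two dimensions
pca = PCA(n_components=2)
animal_vecs_2d = pca.fit_transform(animal_vectors)

# print each word's 2-D coordinates
words = [w.text for w in animal_tokens if w.has_vector]
for word, (x, y) in zip(words, animal_vecs_2d):
    print(f"{word}: ({x:.2f}, {y:.2f})")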
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)
library(broom)
library(lubridate)

# available at https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data
temp_loc <- "/Users/conormcdonald/Downloads/GlobalLandTemperatures/GlobalLandTemperaturesByCountry.csv"

# read the country-level temperature series and standardise the column names
temp <- read_csv(temp_loc) %>% clean_names()
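A minimal sketch of what the loaded libraries support, assuming the Kaggle file's standard columns (dt, average_temperature, country after clean_names()):

temp %>%
  mutate(year = year(as_date(dt))) %>%
  filter(country == "Ireland") %>%       # any country in the file works here
  group_by(year) %>%
  summarise(avg_temp = mean(average_temperature, na.rm = TRUE)) %>%
  ggplot(aes(year, avg_temp)) +
  geom_line(alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_minimal()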
# summary statistics for imdb_score, reshaped long for display
imdb_score_summary <- df %>%
  summarise_each(funs(min, max, mean, median, sd), imdb_score) %>%
  gather(stat, value)

# histogram of scores, with the summary table drawn inside the plot
df %>%
  ggplot(aes(imdb_score)) +
  geom_histogram(bins = 50, colour = "black", fill = "red", alpha = 0.4) +
  theme_minimal() +
  annotation_custom(tableGrob(imdb_score_summary, rows = NULL),
                    xmin = 2.5)
library(tidyverse)
library(here)
library(broom)
library(corrr)
library(forcats)
library(stringr)
library(lubridate)
library(gridExtra)

df <- read_csv("movie_metadata.csv")

# count missing values in each column
df %>% map_df(~sum(is.na(.))) %>% glimpse()
Observations: 1
Variables: 28
$ color <int> 19
$ director_name <int> 104
$ num_critic_for_reviews <int> 50
$ duration <int> 15
$ director_facebook_likes <int> 104
$ actor_3_facebook_likes <int> 23
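One possible follow-up, given the missing-value counts above (an assumption; the printed output is truncated at six of the 28 columns):

# drop rows missing the fields used in the plots below
df <- df %>% drop_na(imdb_score, title_year)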
# num_df holds the numeric columns of df (built earlier in the original post);
# replace the raw release year with the film's age
num_df <- num_df %>%
  mutate(years_since_release = 2017 - title_year) %>%
  select(-title_year)
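corrr is loaded above; a minimal sketch of how num_df might feed it, assuming num_df still contains imdb_score:

num_df %>%
  correlate() %>%          # pairwise correlations between numeric features
  focus(imdb_score) %>%    # keep only each feature's correlation with imdb_score
  arrange(desc(imdb_score))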
# mean IMDb rating by release year, with a 95% interval
df %>%
  group_by(title_year) %>%
  summarise(mean_rating = mean(imdb_score),
            upper_rating = quantile(imdb_score, 0.975),
            lower_rating = quantile(imdb_score, 0.025)) %>%
  ggplot(aes(title_year, mean_rating)) +
  geom_line(colour = "dodgerblue") +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red", alpha = 0.6, se = FALSE) +
  geom_errorbar(aes(ymin = lower_rating, ymax = upper_rating))
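broom is also loaded; a hedged sketch of inspecting the linear trend that geom_smooth(method = "lm") draws:

ratings_by_year <- df %>%
  group_by(title_year) %>%
  summarise(mean_rating = mean(imdb_score))

# slope estimate: average change in mean rating per release year
lm(mean_rating ~ title_year, data = ratings_by_year) %>%
  tidy()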