
conormm / r-to-python-data-wrangling-basics.md
Last active March 26, 2024 11:44
R to Python: Data wrangling with dplyr and pandas

R to Python data wrangling snippets

The dplyr package in R makes data wrangling significantly easier. The beauty of dplyr is that, by design, the options available are limited. Specifically, a small set of key verbs forms the core of the package. Using these verbs you can solve a wide range of data problems quickly and effectively. Whilst transitioning to Python I have greatly missed the ease with which I can think through and solve problems using dplyr in R. The purpose of this document is to demonstrate how to execute the key dplyr verbs when manipulating data in Python with the pandas package.

dplyr is organised around six key verbs, each of which has a close pandas equivalent (see the sketch below):

- filter(): keep rows matching a condition
- select(): pick columns by name
- arrange(): reorder rows
- mutate(): create new columns from existing ones
- summarise(): reduce groups of rows to summary values
- group_by(): split the data into groups for the verbs above
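As a minimal sketch of that mapping (the DataFrame `df` and its columns `a` and `b` here are hypothetical, not from the original gist):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]})

df[df.a > 2]               # filter(): keep rows where a > 2
df[["a"]]                  # select(): pick columns by name
df.sort_values("a")        # arrange(): reorder rows
df.assign(a_sq=df.a ** 2)  # mutate(): add a derived column
df.agg({"a": "mean"})      # summarise(): reduce to summary values
df.groupby("b").a.mean()   # group_by() + summarise()
```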

The snippet below appears to be from a separate, truncated gist preview: it pulls articles from two news sites and computes readability statistics with textacy.

```python
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import newspaper
import pandas as pd
import textacy as tcy

the_guardian = "https://www.theguardian.com"
breitbart = "http://www.breitbart.com"

# get_articles, clean_text and preprocess_articles are helpers defined
# elsewhere in the gist (not shown in this preview)
articles = get_articles(the_guardian, breitbart, title_topic="Trump")
articles["text"] = articles.text.map(clean_text)
articles["text"] = preprocess_articles(articles.text)

def get_readability_stats(parsed_articles):
    stats_list = []
    for ix, article in enumerate(parsed_articles):
        doc = tcy.Doc(article)
        readability_stats = tcy.text_stats.readability_stats(doc)
        # the loop body is cut off in the preview; collecting and returning
        # the stats is an assumed completion
        stats_list.append(readability_stats)
    return stats_list
```
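A hedged usage sketch of the function above, assuming the truncated pipeline has produced `articles`; building a DataFrame from the results is my assumption, though it is consistent with the tabular output shown next:

```python
# Each readability_stats result is a dict of metrics, so a list of them
# stacks cleanly into a DataFrame with one row per article
stats = get_readability_stats(articles.text)
stats_df = pd.DataFrame(stats)
print(stats_df.head())
```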
conormm / read_stats_data.py
Last active April 23, 2017 16:56
read_stats_data
The resulting data (the preview's flattened DataFrame, reconstructed below; the second row is cut off before its last two columns):

| column | article 0 | article 1 |
| --- | --- | --- |
| text | four us states are suing the trump administrat... | sean spicer, the press secretary for the trump... |
| title | Four states sue Trump administration over 'un-... | Sean Spicer holds first Trump administration p... |
| paper | https://www.theguardian.com | https://www.theguardian.com |
| automated_readability_index | 58.698156 | 18.558549 |
| coleman_liau_index | 15.299608 | 15.390357 |
| flesch_kincaid_grade_level | 46.986883 | 15.867059 |
| flesch_readability_ease | -45.888247 | 31.768431 |
| gunning_fog_index | 51.064935 | 17.921569 |
| n_chars | 4108 | 466 |
| n_polysyllable_words | 136 | 14 |
| n_sents | 7 | 3 |
| n_syllables | 1284 | 147 |
| n_unique_words | 365 | 66 |
| n_words | 770 | |
| smog_index | 28.309659 | |
```r
library(readxl)
library(janitor)
library(ggraph)
library(igraph)
library(dplyr)  # assumed: supplies the pipe and verbs used later in the gist

# Eurovision voting data, 1975-2016
df <- readxl::read_excel("/eurovision_song_contest_1975_2016.xlsx") %>%
  clean_names()

# Reorder columns, moving columns 5 and 6 to the front
df <- df[, c(5, 6, 1:4, 7)]
```
conormm / read_data.py
Created July 11, 2017 10:39
Extracting word vectors from spaCy
```python
import numpy as np
import spacy
from sklearn.decomposition import PCA

# "en" was the model shortcut in early spaCy releases; newer versions use
# e.g. "en_core_web_sm"
nlp = spacy.load("en")

animals = "dog cat hamster lion tiger elephant cheetah monkey gorilla antelope rabbit mouse rat zoo home pet fluffy wild domesticated"
animal_tokens = nlp(animals)

# Stack each token's embedding into an (n_tokens, vector_dim) matrix
animal_vectors = np.vstack([word.vector for word in animal_tokens if word.has_vector])
```
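The gist imports PCA but the preview stops before using it. A minimal sketch of the likely next step, projecting the vectors to two dimensions and labelling the points; the plotting code is my assumption, not the gist's:

```python
import matplotlib.pyplot as plt

# Project the word vectors onto their first two principal components
pca = PCA(n_components=2)
coords = pca.fit_transform(animal_vectors)

# Scatter the words in the reduced space, labelled by token text
words = [w.text for w in animal_tokens if w.has_vector]
plt.scatter(coords[:, 0], coords[:, 1])
for word, (x, y) in zip(words, coords):
    plt.annotate(word, (x, y))
plt.show()
```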
```r
library(readr)
library(dplyr)
library(ggplot2)
library(janitor)
library(broom)
library(lubridate)

# Data available at https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data
temp_loc <- "/Users/conormcdonald/Downloads/GlobalLandTemperatures/GlobalLandTemperaturesByCountry.csv"
```
```r
library(tidyverse)
library(here)
library(broom)
library(corrr)
library(forcats)
library(stringr)
library(lubridate)
library(gridExtra)

df <- read_csv("movie_metadata.csv")

# Summary statistics of imdb_score, gathered to long form for the table grob
# (summarise_each()/funs() were current dplyr at the time; later versions use across())
imdb_score_summary <- df %>%
  summarise_each(funs(min, max, mean, median, sd), imdb_score) %>%
  gather(stat, value)

# Histogram of imdb_score with the summary table overlaid
df %>%
  ggplot(aes(imdb_score)) +
  geom_histogram(bins = 50, colour = "black", fill = "red", alpha = 0.4) +
  theme_minimal() +
  annotation_custom(tableGrob(imdb_score_summary, rows = NULL),
                    xmin = 2.5)  # call truncated in the gist preview; closed minimally here

# Count missing values per column
df %>% map_df(~sum(is.na(.))) %>% glimpse()
```
```
Observations: 1
Variables: 28
$ color                   <int> 19
$ director_name           <int> 104
$ num_critic_for_reviews  <int> 50
$ duration                <int> 15
$ director_facebook_likes <int> 104
$ actor_3_facebook_likes  <int> 23
```
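In keeping with the dplyr-to-pandas theme of these gists, the same per-column missing-value count is a one-liner in pandas (a sketch, assuming the same `movie_metadata.csv` as above):

```python
import pandas as pd

df = pd.read_csv("movie_metadata.csv")

# Count missing values per column, mirroring map_df(~sum(is.na(.)))
print(df.isna().sum())
```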