Jake Hofman jhofman

## compare_figure_md5sums.R
#
# file: compare_figure_md5sums.R
#
# description: minimal example to show that md5sums of two files
# containing the same figure are different when saved as pdfs but not
# pngs due to the "created at" timestamp embedded in the pdf metadata
#
# usage: Rscript compare_figure_md5sums.R
#
# requirements: tidyverse

## filter_by_group_id.R
library(tidyverse)
library(digest)

# Build a reproducible dummy data frame with 1,000,000 rows whose group ids
# are md5 hashes of integers drawn (with replacement) from 1 to 10,000.
# NOTE(review): this comment previously said 100,000 groups, but the code
# hashes 1:1e4 (10,000 integers) -- confirm which was intended.
set.seed(42)
# Vectorize() lets the single-object digest() call be mapped over a vector
md5 <- Vectorize(function(x) digest(x, algo = "md5"))
df <- data.frame(
  group_id = sample(md5(1:1e4), 1e6, replace = TRUE),
  val = sample(1:100, 1e6, replace = TRUE)
)

## scrapple_x.py
#!/usr/bin/env python
#
# file: scrapple_x.py
#
# description: checks apple.com for iphone x in-store pickup availability
#
# usage: ./scrapple.py [zip] [att|verizon|sprint|tmobile] [64|256] [grey|silver]
#
#  or in a crontab:
#  */5 * * * * /path/to/scrapple.py 12345 tmobile 64 grey && mailx -s "iphone x" 2125551212@vtext.com

## changing_ggplot_legends.R
library(tidyverse)
library(forcats)

# Baseline plot: displacement against mpg, coloured by number of cylinders
## The default legend title is ugly; maybe we should remove it and
## modify the labels
ggplot(
  data = mtcars,
  mapping = aes(x = mpg, y = disp, colour = as.factor(cyl))
) +
  geom_point()

# Approach 1: Modify the plot

## Makefile
# Default target: build the neighborhood populations CSV.
# `all` does not name a real file, so declare it phony.
.PHONY: all
all: pediacities_nyc_neighborhood_populations.csv

# Run the extraction script whenever the downloaded JSON or the script itself
# changes. The script is a prerequisite in this directory, so invoke it via ./
# rather than relying on the current directory being on PATH.
pediacities_nyc_neighborhood_populations.csv: pediacities_nyc_neighborhoods.json extract_neighborhood_populations.sh
	./extract_neighborhood_populations.sh

# Run the download script whenever it changes.
pediacities_nyc_neighborhoods.json: download_neighborhood_pages.sh
	./download_neighborhood_pages.sh

## dplyr_filter_ungroup.R
library(dplyr)

# Build a dummy data frame with 1,000,000 rows whose group ids are sampled
# (with replacement) from the integers 1 to 100,000, then group by group_id.
df <- data.frame(
  group_id = sample(1:1e5, 1e6, replace = TRUE),
  val = sample(1:100, 1e6, replace = TRUE)
) %>%
  group_by(group_id)

# Time the naive approach: filter rows with a value of 1 while df is
# still grouped
system.time(df %>% filter(val == 1))

## scrape_income_dist.sh
#!/bin/bash
#
# Scrape income distribution data from whatsmypercent.com
#
# Output is in incomes.csv (percentile,income)
#

# start at $100 / year
income=100

## track_changes.sh
#!/bin/bash
#
# script to show word-style "track changes" from a previous git revision
#

if [ $# -lt 2 ]
then
    echo "usage: $0 <rev> <maintex>"
    echo "  rev is the prefix of a git revision hash"
    echo "  see 'git log' for revision hashes"

## bookmark_starred.py
#!/usr/bin/env python
#
# file: bookmark_starred.py
#
# description: mirrors starred github repos to delicious
#
# usage: bookmark_starred.py GITHUB_USER DELICIOUS_USER
#
# requirements: requests

## scrapple.py
#!/usr/bin/env python
#
# file: scrapple.py
#
# description: checks apple.com for iphone 5s in-store pickup availability
#
# usage: ./scrapple.py [zip] [att|verizon|sprint] [16|32|64] [grey|silver|gold]
#
#  or in a crontab:
#  */5 * * * * /path/to/scrapple.py 10012 verizon 32 grey && mailx -s 5s 2125551212@vtext.com
	#
	# file: compare_figure_md5sums.R
	#
	# description: minimal example to show that md5sums of two files
	# containing the same figure are different when saved as pdfs but not
	# pngs due to the "created at" timestamp embedded in the pdf metadata
	#
	# usage: Rscript compare_figure_md5sums.R
	#
	# requirements: tidyverse
	library(tidyverse)
	library(digest)

	# Build a reproducible dummy data frame with 1,000,000 rows whose group ids
	# are md5 hashes of integers drawn (with replacement) from 1 to 10,000.
	# NOTE(review): this comment previously said 100,000 groups, but the code
	# hashes 1:1e4 (10,000 integers) -- confirm which was intended.
	set.seed(42)
	# Vectorize() lets the single-object digest() call be mapped over a vector
	md5 <- Vectorize(function(x) digest(x, algo = "md5"))
	df <- data.frame(
	  group_id = sample(md5(1:1e4), 1e6, replace = TRUE),
	  val = sample(1:100, 1e6, replace = TRUE)
	)
	#!/usr/bin/env python
	#
	# file: scrapple_x.py
	#
	# description: checks apple.com for iphone x in-store pickup availability
	#
	# usage: ./scrapple.py [zip] [att|verizon|sprint|tmobile] [64|256] [grey|silver]
	#
	# or in a crontab:
	# */5 * * * * /path/to/scrapple.py 12345 tmobile 64 grey && mailx -s "iphone x" 2125551212@vtext.com
	library(tidyverse)
	library(forcats)

	# Baseline plot: displacement against mpg, coloured by number of cylinders
	## The default legend title is ugly; maybe we should remove it and
	## modify the labels
	ggplot(
	  data = mtcars,
	  mapping = aes(x = mpg, y = disp, colour = as.factor(cyl))
	) +
	  geom_point()

	# Approach 1: Modify the plot
	# Default target: build the neighborhood populations CSV.
	# `all` does not name a real file, so declare it phony.
	.PHONY: all
	all: pediacities_nyc_neighborhood_populations.csv

	# Run the extraction script whenever the downloaded JSON or the script
	# itself changes. The script is a prerequisite in this directory, so invoke
	# it via ./ rather than relying on the current directory being on PATH.
	pediacities_nyc_neighborhood_populations.csv: pediacities_nyc_neighborhoods.json extract_neighborhood_populations.sh
		./extract_neighborhood_populations.sh

	# Run the download script whenever it changes.
	pediacities_nyc_neighborhoods.json: download_neighborhood_pages.sh
		./download_neighborhood_pages.sh
	library(dplyr)

	# Build a dummy data frame with 1,000,000 rows whose group ids are sampled
	# (with replacement) from the integers 1 to 100,000, then group by group_id.
	df <- data.frame(
	  group_id = sample(1:1e5, 1e6, replace = TRUE),
	  val = sample(1:100, 1e6, replace = TRUE)
	) %>%
	  group_by(group_id)

	# Time the naive approach: filter rows with a value of 1 while df is
	# still grouped
	system.time(df %>% filter(val == 1))
	#!/bin/bash
	#
	# Scrape income distribution data from whatsmypercent.com
	#
	# Output is in incomes.csv (percentile,income)
	#

	# start at $100 / year
	income=100
	#!/bin/bash
	#
	# script to show word-style "track changes" from a previous git revision
	#

	if [ $# -lt 2 ]
	then
	echo "usage: $0 <rev> <maintex>"
	echo " rev is the prefix of a git revision hash"
	echo " see 'git log' for revision hashes"
	#!/usr/bin/env python
	#
	# file: bookmark_starred.py
	#
	# description: mirrors starred github repos to delicious
	#
	# usage: bookmark_starred.py GITHUB_USER DELICIOUS_USER
	#
	# requirements: requests
	#!/usr/bin/env python
	#
	# file: scrapple.py
	#
	# description: checks apple.com for iphone 5s in-store pickup availability
	#
	# usage: ./scrapple.py [zip] [att|verizon|sprint] [16|32|64] [grey|silver|gold]
	#
	# or in a crontab:
	# */5 * * * * /path/to/scrapple.py 10012 verizon 32 grey && mailx -s 5s 2125551212@vtext.com