arvi1000/pie_extract.R

## pie_extract.R

library(png)
library(tidyverse)

# 1) read in data ----

to_matrix <- function(png_file) {

  # read PNG to raster  (3d array of x * y * color channel)
  the_png <- readPNG(png_file)

  # extract the rgb values from color channels 1 to 3 (4 is alpha)
  out_mat <- sapply(1:3, function(x) round(as.vector(the_png[,,x])*255))
  return(out_mat)
}

pie1 <- to_matrix('~/Desktop/pie1.png')
pie2 <- to_matrix('~/Desktop/pie2.png')

# 2) clustering and data extraction ----

# whoa we have 25k+ unique colors in here, likely due to artifacts from image
# compression
length(unique(pie1))

# let's use clustering to group similar colors. since we know exactly how many
# target colors we want, k-means is perfect for this.

# 11 colors from the 1st pie & 9 from the 2nd (includes +1 for the white
# background). we initialize with a large number of centroids
# to get a more stable result
km_fit1 <- kmeans(pie1, centers = 11, nstart = 100)
km_fit2 <- kmeans(pie2, centers = 9, nstart = 90)

# let's extract the key info (color values and pixel counts)
km_to_df <- function(km, label_col) {

  data.frame(counts = table(km$cluster),
             hex_value = rgb(km$centers, maxColorValue = 255),
             label = label_col,
             stringsAsFactors = F)

}

plot_dat <- rbind(km_to_df(km_fit1, 'pie 1'),
                  km_to_df(km_fit2, 'pie 2'))

# drop white, which is just the background
plot_dat <- filter(plot_dat, hex_value != '#FEFEFE')

# 3) plot our results for sanity check ----

# named vector so we can plot each hex value in the actual color
clr_scale <- unique(plot_dat$hex_value)
names(clr_scale) <- clr_scale

# let's take a look! plot for one pie, dropping the background white
do_plot <- function(which_pie) {

  plot_dat %>%
    filter(label==which_pie) %>%
    ggplot(aes(x=reorder(hex_value, counts.Freq),
               y=counts.Freq,
               fill=hex_value)) +
    geom_col() +
    scale_fill_manual(values=clr_scale) +
    theme_light() +
    theme(legend.position = 'none') +
    labs(title = paste0('Pixel Frequency by Color Cluster (', which_pie, ')'),
         y = 'count', x = 'cluster centroid')

}
do_plot('pie 1')
do_plot('pie 2')

# 4) data reassembly ----

# let's add the $/hr labels back by looking up the values in the original pie
# for each color (ordered by frequency)

# it's actually pretty hard to look up the dollar values from the colors,
# bc of the crazy leader lines!

pie1_pay <- c(15,12,10,13,16,13.50,14,15.50,16.50,14.50)
pie2_pay <- c(18,20,17,19,17.50,24,26,22)
plot_dat <- plot_dat %>%
  arrange(label, -counts.Freq) %>%
  mutate(wage = c(pie1_pay, pie2_pay))

# now we can finally make our better chart! let's get the percent at each wage,
# and percent making up to that wage

plot_dat <- plot_dat %>%
  arrange(wage) %>%
  mutate(pct = counts.Freq / sum(plot_dat$counts.Freq),
         pct_up_to = cumsum(pct))

# a bar plot is a safe starter bet

ggplot(plot_dat, aes(x=wage, y=100*pct)) +
  geom_col(fill='dodgerblue') +
  labs(title='Bike Shop Mechanic Hourly Wage',
       subtitle = 'based on survey by @ProBicycleMech, n=200',
       y = '% of Respondents', x = '$/hr') +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# but i like a cumulative distribution. maybe less accessible, but
# imo the most informative in this case, esp w some labels

labeler <- function(x, y) {
  paste0(round(100*x), '% make less than $', y, '/hr')
}

# the final plot:

ggplot(plot_dat,
       aes(x=wage, y=100*pct_up_to)) +
  geom_point(color='dodgerblue') +
  geom_line(color='dodgerblue') +
  geom_text(data = filter(plot_dat, wage %in% c(15, 20)),
            aes(label = labeler(pct_up_to, wage)),
            vjust = 2, hjust=0) +
  labs(title='Bike Shop Mechanic Hourly Wage',
       subtitle = 'based on survey by @ProBicycleMech, n=200',
       y = '% of respondents making this much or less ', x = '$/hr') +
  theme_minimal() +
  theme(panel.grid.minor.y = element_blank())

	library(png)
	library(tidyverse)

	# 1) read in data ----

	to_matrix <- function(png_file) {

	# read PNG to raster (3d array of x * y * color channel)
	the_png <- readPNG(png_file)

	# extract the rgb values from color channels 1 to 3 (4 is alpha)
	out_mat <- sapply(1:3, function(x) round(as.vector(the_png[,,x])*255))
	return(out_mat)
	}

	pie1 <- to_matrix('~/Desktop/pie1.png')
	pie2 <- to_matrix('~/Desktop/pie2.png')

	# 2) clustering and data extraction ----

	# whoa we have 25k+ unique colors in here, likely due to artifacts from image
	# compression
	length(unique(pie1))

	# let's use clustering to group similar colors. since we know exactly how many
	# target colors we want, k-means is perfect for this.

	# 11 colors from the 1st pie & 9 from the 2nd (includes +1 for the white
	# background). we initialize with a large number of centroids
	# to get a more stable result
	km_fit1 <- kmeans(pie1, centers = 11, nstart = 100)
	km_fit2 <- kmeans(pie2, centers = 9, nstart = 90)

	# let's extract the key info (color values and pixel counts)
	km_to_df <- function(km, label_col) {

	data.frame(counts = table(km$cluster),
	hex_value = rgb(km$centers, maxColorValue = 255),
	label = label_col,
	stringsAsFactors = F)

	}

	plot_dat <- rbind(km_to_df(km_fit1, 'pie 1'),
	km_to_df(km_fit2, 'pie 2'))

	# drop white, which is just the background
	plot_dat <- filter(plot_dat, hex_value != '#FEFEFE')

	# 3) plot our results for sanity check ----

	# named vector so we can plot each hex value in the actual color
	clr_scale <- unique(plot_dat$hex_value)
	names(clr_scale) <- clr_scale

	# let's take a look! plot for one pie, dropping the background white
	do_plot <- function(which_pie) {

	plot_dat %>%
	filter(label==which_pie) %>%
	ggplot(aes(x=reorder(hex_value, counts.Freq),
	y=counts.Freq,
	fill=hex_value)) +
	geom_col() +
	scale_fill_manual(values=clr_scale) +
	theme_light() +
	theme(legend.position = 'none') +
	labs(title = paste0('Pixel Frequency by Color Cluster (', which_pie, ')'),
	y = 'count', x = 'cluster centroid')

	}
	do_plot('pie 1')
	do_plot('pie 2')

	# 4) data reassembly ----

	# let's add the $/hr labels back by looking up the values in the original pie
	# for each color (ordered by frequency)

	# it's actually pretty hard to look up the dollar values from the colors,
	# bc of the crazy leader lines!

	pie1_pay <- c(15,12,10,13,16,13.50,14,15.50,16.50,14.50)
	pie2_pay <- c(18,20,17,19,17.50,24,26,22)
	plot_dat <- plot_dat %>%
	arrange(label, -counts.Freq) %>%
	mutate(wage = c(pie1_pay, pie2_pay))

	# now we can finally make our better chart! let's get the percent at each wage,
	# and percent making up to that wage

	plot_dat <- plot_dat %>%
	arrange(wage) %>%
	mutate(pct = counts.Freq / sum(plot_dat$counts.Freq),
	pct_up_to = cumsum(pct))

	# a bar plot is a safe starter bet

	ggplot(plot_dat, aes(x=wage, y=100*pct)) +
	geom_col(fill='dodgerblue') +
	labs(title='Bike Shop Mechanic Hourly Wage',
	subtitle = 'based on survey by @ProBicycleMech, n=200',
	y = '% of Respondents', x = '$/hr') +
	theme_minimal() +
	theme(panel.grid.minor = element_blank())

	# but i like a cumulative distribution. maybe less accessible, but
	# imo the most informative in this case, esp w some labels

	labeler <- function(x, y) {
	paste0(round(100*x), '% make less than $', y, '/hr')
	}

	# the final plot:

	ggplot(plot_dat,
	aes(x=wage, y=100*pct_up_to)) +
	geom_point(color='dodgerblue') +
	geom_line(color='dodgerblue') +
	geom_text(data = filter(plot_dat, wage %in% c(15, 20)),
	aes(label = labeler(pct_up_to, wage)),
	vjust = 2, hjust=0) +
	labs(title='Bike Shop Mechanic Hourly Wage',
	subtitle = 'based on survey by @ProBicycleMech, n=200',
	y = '% of respondents making this much or less ', x = '$/hr') +
	theme_minimal() +
	theme(panel.grid.minor.y = element_blank())