Jas Sohi j450h1

j450h1 / active-issues.sql
Created April 28, 2017 07:37 — forked from alysonla/active-issues.sql
Queries that power the open source section of the 2016 Octoverse report https://octoverse.github.com/
-- Active issues
-- Count of total active issues in the specified time frame
-- Source: githubarchive public data set via Google BigQuery http://githubarchive.org/
SELECT
  COUNT(DISTINCT JSON_EXTRACT_SCALAR(events.payload, '$.issue.id')) AS events_issue_count
FROM (
  SELECT * FROM TABLE_DATE_RANGE([githubarchive:day.], TIMESTAMP('2015-09-01'), TIMESTAMP('2016-08-31'))
) AS events
-- 10,723,492 active issues
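The query's distinct-count logic can be mimicked locally in plain Python on a few hypothetical event payloads (the data below is made up; the real query runs over the githubarchive BigQuery tables):

```python
import json

# Hypothetical event rows shaped like githubarchive records (made-up data)
events = [
    {"payload": json.dumps({"issue": {"id": 101}})},
    {"payload": json.dumps({"issue": {"id": 101}})},  # same issue, second event
    {"payload": json.dumps({"issue": {"id": 202}})},
    {"payload": json.dumps({})},                      # non-issue event: no '$.issue.id'
]

# Equivalent of COUNT(DISTINCT JSON_EXTRACT_SCALAR(payload, '$.issue.id')):
# extract the nested id where present, deduplicate, count
issue_ids = {
    json.loads(e["payload"]).get("issue", {}).get("id")
    for e in events
    if json.loads(e["payload"]).get("issue", {}).get("id") is not None
}
print(len(issue_ids))  # 2 distinct active issues in the sample
```

Rows without an `issue.id` drop out of the count, just as `JSON_EXTRACT_SCALAR` returns NULL for them in BigQuery.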
Country,CountryCode,Currency,Code
New Zealand,NZ,New Zealand Dollars,NZD
Cook Islands,CK,New Zealand Dollars,NZD
Niue,NU,New Zealand Dollars,NZD
Pitcairn,PN,New Zealand Dollars,NZD
Tokelau,TK,New Zealand Dollars,NZD
Australia,AU,Australian Dollars,AUD
Christmas Island,CX,Australian Dollars,AUD
Cocos (Keeling) Islands,CC,Australian Dollars,AUD
Heard and Mc Donald Islands,HM,Australian Dollars,AUD
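One way to use this lookup table is to map ISO country codes to currency codes; a minimal Python sketch, with a few of the rows above inlined so it is self-contained:

```python
import csv
import io

# A few rows from the table above, inlined as CSV text
csv_text = """Country,CountryCode,Currency,Code
New Zealand,NZ,New Zealand Dollars,NZD
Australia,AU,Australian Dollars,AUD
Christmas Island,CX,Australian Dollars,AUD
"""

# Build a CountryCode -> currency Code lookup
currency_by_country = {
    row["CountryCode"]: row["Code"]
    for row in csv.DictReader(io.StringIO(csv_text))
}
print(currency_by_country["NZ"])  # NZD
print(currency_by_country["CX"])  # AUD
```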
@j450h1
j450h1 / bb8_with_ggplot2.R
Created January 31, 2018 05:32 — forked from pvictor/bb8_with_ggplot2.R
A Star Wars BB-8 with ggplot2!
# BB-8 --------------------------------------------------------------------
# Inspired by Brian Hough in http://codepen.io/bhough/pen/wawrPL
# Packages ----------------------------------------------------------------
library("dplyr")
library("ggplot2")
@j450h1
j450h1 / gist:fdfb1b8b236dbd68829d0b03804af9db
Created April 10, 2020 07:05 — forked from rich-iannone/gist:1da1ae7a7203958a0c5b1bd1d4b24017
This gt code allows you to make a summary table based on the `pizzaplace` dataset.
library(tidyverse)
library(paletteer)
library(gt)
pizzaplace %>%
  mutate(type = case_when(
    type == "chicken" ~ "chicken (pizzas with chicken as a major ingredient)",
    type == "classic" ~ "classic (classical pizzas)",
    type == "supreme" ~ "supreme (pizzas that try a little harder)",
    type == "veggie" ~ "veggie (pizzas without any meats whatsoever)"
  ))
root
|-- artist: string (nullable = true)
|-- auth: string (nullable = true)
|-- firstName: string (nullable = true)
|-- gender: string (nullable = true)
|-- itemInSession: long (nullable = true)
|-- lastName: string (nullable = true)
|-- length: double (nullable = true)
|-- level: string (nullable = true)
|-- location: string (nullable = true)
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| artist| auth|firstName|gender|itemInSession|lastName| length|level| location|method| page| registration|sessionId| song|status| ts| userAgent|userId|
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| Martha Tilston|Logged In| Colin| M| 50| Freeman|277.89016| paid| Bakersfield, CA| PUT| NextSong|1538173362000| 29| Rockpools| 200|1538352117000|Mozilla/5.0 (Wind...| 30|
| Five Iron Frenzy|Logged In| Micah| M| 79| Long|236.09424| free|Boston-Cambridge-...| PUT|
from pyspark.sql.functions import col, lit, when

def topn_values(df, column_name, n):
    """
    Return the n most frequent values of a column as a pandas DataFrame
    """
    return df.groupby(column_name).count().sort('count', ascending=False).limit(n).toPandas()

def get_churned_users(df):
    """
    Find the users that cancelled so we can identify who churned.
    Return an updated dataframe with an additional column identifying them as such
    """
    cancelled_ids = df.filter('page == "Cancellation Confirmation"').select("userId").distinct()
    # Convert to a Python list so it can be used in the filter below
    cancelled_ids = cancelled_ids.toPandas()['userId'].tolist()
    # 1 when a user churned and 0 when they did not
    df = df.withColumn("Churn", when(col("userId").isin(cancelled_ids), lit('1')).otherwise(lit('0')))
    return df
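The churn-flag logic above can be checked without a Spark cluster; a plain-Python sketch over made-up event rows:

```python
# Made-up event rows mirroring the dataframe's userId and page columns
rows = [
    {"userId": "30", "page": "NextSong"},
    {"userId": "30", "page": "Cancellation Confirmation"},
    {"userId": "9",  "page": "NextSong"},
]

# Same two steps as get_churned_users: collect cancelled ids, then flag every row
cancelled_ids = {r["userId"] for r in rows if r["page"] == "Cancellation Confirmation"}
for r in rows:
    r["Churn"] = "1" if r["userId"] in cancelled_ids else "0"

print([r["Churn"] for r in rows])  # ['1', '1', '0']
```

Note that every row for a churned user gets the flag, not just the cancellation event itself.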
from pyspark.sql.functions import col, when, max as sparkMax

def aggregate_to_user_level(df):
    """
    Aggregate the selected features to the user level
    """
    exprs = [
        sparkMax(col('churn')).alias('churn'),
        sparkMax(col('Gender')).alias('gender'),
        sparkMax(col('level')).alias('subscription_level'),
        sparkMax(col('device_type')).alias('device_type'),
        sparkMax(when(col("page") == 'Upgrade', 1).otherwise(0)).alias('page_upgraded'),
    ]
    return df.groupby('userId').agg(*exprs)
from pyspark.sql import Window
from pyspark.sql.functions import col, round as sparkRound, sum as sparkSum

user_count = user_df.groupby('churn').count()
user_count = user_count.withColumn('percent', col('count') / sparkSum('count').over(Window.partitionBy()))
# Multiply by 100 and round to two decimal places
user_count = user_count.withColumn("percent", sparkRound(user_count["percent"] * 100, 2))
user_count.orderBy('percent', ascending=False).show()
+-----+-----+-------+
|churn|count|percent|
+-----+-----+-------+
| 0| 173| 76.89|
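The percent column is plain arithmetic and can be reproduced without Spark; a sketch with hypothetical counts (only the churn=0 row survives in the snippet above, so the churned count here is an assumption for illustration):

```python
# Hypothetical churn counts: 173 non-churned users comes from the output above,
# 52 churned users is made up for this example
counts = {"0": 173, "1": 52}
total = sum(counts.values())

# Same as count / sum(count) over the whole table, times 100, rounded to 2 places
percent = {churn: round(c / total * 100, 2) for churn, c in counts.items()}
print(percent["0"])  # 76.89
```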