Gists by Durga Gadiraju (dgadiraju)
hive-window-aggregations-employees.sql
SELECT employee_id,
       department_id,
       salary,
       sum(salary) OVER (PARTITION BY department_id) AS department_salary_expense,
       round((salary / sum(salary) OVER (PARTITION BY department_id)) * 100, 2) AS pct_salary
FROM employees
ORDER BY department_id;
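For comparison, the same per-department salary breakdown can be written with the PySpark DataFrame API; a minimal sketch, assuming spark is an active SparkSession and employees is a DataFrame with the columns used below.

from pyspark.sql import functions as F
from pyspark.sql.window import Window

dept = Window.partitionBy('department_id')
employees. \
    select('employee_id', 'department_id', 'salary'). \
    withColumn('department_salary_expense', F.sum('salary').over(dept)). \
    withColumn('pct_salary',
               F.round(F.col('salary') / F.col('department_salary_expense') * 100, 2)). \
    orderBy('department_id'). \
    show()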
01_nyse_create_table_quickstart.sql
CREATE DATABASE nyse;
CREATE USER 'nyse_user' IDENTIFIED BY 'itversity';
-- full access to the nyse database
GRANT ALL ON nyse.* TO nyse_user;
-- FILE is needed for LOAD DATA INFILE / SELECT ... INTO OUTFILE;
-- SUPER allows administrative statements such as SET GLOBAL
GRANT FILE ON *.* TO nyse_user;
GRANT SUPER ON *.* TO nyse_user;
FLUSH PRIVILEGES;
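A quick way to confirm the grants took effect is to connect as the new account; a sketch using the mysql-connector-python package, where the host is an assumption.

import mysql.connector

conn = mysql.connector.connect(host='localhost',  # assumed host
                               user='nyse_user',
                               password='itversity',
                               database='nyse')
cur = conn.cursor()
cur.execute('SELECT CURRENT_USER()')
print(cur.fetchone())
conn.close()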
spark-dataframes-getting-started.java
package spark2demo;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.sum;
import static org.apache.spark.sql.functions.round;

public class GettingStarted {
    public static void main(String[] args) {
        // master("local") runs Spark on your system; the method body is cut off in this gist preview
        SparkSession spark = SparkSession.builder().master("local").appName("GettingStarted").getOrCreate();
        spark.stop();
    }
}
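The static imports of sum and round suggest the truncated body aggregates a DataFrame; a sketch of that pattern in PySpark, where the input path and column names are hypothetical.

from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, round

spark = SparkSession.builder.master('local').appName('GettingStarted').getOrCreate()
orderItems = spark.read.json('/path/to/retail_db_json/order_items')  # hypothetical path
orderItems. \
    groupBy('order_item_order_id'). \
    agg(round(sum('order_item_subtotal'), 2).alias('order_revenue')). \
    show()
spark.stop()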
pdc-01-create-hive-tables.sql
CREATE TABLE orders (
  order_id INT,
  order_date STRING,
  order_customer_id INT,
  order_status STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 's3://itversitydata/retail_db/orders';

CREATE TABLE order_items (
  order_item_id INT,
  -- columns below assumed from the standard retail_db schema (gist preview truncated here)
  order_item_order_id INT,
  order_item_product_id INT,
  order_item_quantity INT,
  order_item_subtotal FLOAT,
  order_item_product_price FLOAT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 's3://itversitydata/retail_db/order_items';
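Once created, these tables are queryable from any Hive-aware client; a sketch using a Hive-enabled SparkSession, where the app name is an assumption.

from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName('pdc-hive-demo'). \
    enableHiveSupport(). \
    getOrCreate()
spark.sql('SELECT order_status, count(1) AS order_count '
          'FROM orders GROUP BY order_status'). \
    show()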
pyspark-dataframes-01-application.properties
[dev]
executionMode = local
input.base.dir = /Users/itversity/Research/data/retail_db
output.base.dir = /Users/itversity/Research/data/bootcamp/pyspark

[prod]
executionMode = yarn-client
input.base.dir = /public/retail_db
output.base.dir = /user/training/bootcamp/pyspark
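Files in this [section] key = value layout parse directly with Python's standard-library configparser; a minimal sketch of how an application might pick an environment at runtime.

import configparser

props = configparser.ConfigParser()
props.read('application.properties')  # the file shown above

env = 'dev'  # or 'prod'
print(props.get(env, 'executionMode'))   # local
print(props.get(env, 'input.base.dir'))  # /Users/itversity/Research/data/retail_db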
spark-dataframes-ranking-01-read-data.py
# employeesPath = '/Users/itversity/Research/data/hr_db/employees/part-00000'
employeesPath = '/mnt/c/data/hr_db/employees/part-00000'
employees = spark. \
    read. \
    format('csv'). \
    option('sep', '\t'). \
    schema('''employee_id INT,
        first_name STRING,
        last_name STRING,
        email STRING,
        phone_number STRING,
        hire_date STRING,
        job_id STRING,
        salary FLOAT,
        commission_pct FLOAT,
        manager_id INT,
        department_id INT'''). \
    load(employeesPath)  # columns after last_name assumed from the standard HR schema; preview truncated
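The gist preview stops at the read; a sketch of the ranking step the file name points to, ranking earners within each department with dense_rank (the top-3 cutoff is an assumption).

from pyspark.sql import functions as F
from pyspark.sql.window import Window

spec = Window.partitionBy('department_id').orderBy(F.col('salary').desc())
employees. \
    withColumn('rnk', F.dense_rank().over(spec)). \
    filter('rnk <= 3'). \
    orderBy('department_id', 'rnk'). \
    show()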
spark-dataframes-windowing-01-read-data.py
# employeesPath = '/Users/itversity/Research/data/hr_db/employees/part-00000'
employeesPath = '/mnt/c/data/hr_db/employees/part-00000'
employees = spark. \
    read. \
    format('csv'). \
    option('sep', '\t'). \
    schema('''employee_id INT,
        first_name STRING,
        last_name STRING,
        email STRING,
        phone_number STRING,
        hire_date STRING,
        job_id STRING,
        salary FLOAT,
        commission_pct FLOAT,
        manager_id INT,
        department_id INT'''). \
    load(employeesPath)  # columns after last_name assumed from the standard HR schema; preview truncated
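A sketch of a typical windowing follow-up this gist presumably builds toward: comparing each salary with the next-lower one in the same department via lead() (the specific function is an assumption, since the preview stops at the read).

from pyspark.sql import functions as F
from pyspark.sql.window import Window

spec = Window.partitionBy('department_id').orderBy(F.col('salary').desc())
employees. \
    withColumn('next_lower_salary', F.lead('salary').over(spec)). \
    select('employee_id', 'department_id', 'salary', 'next_lower_salary'). \
    show()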
pyspark-dataframe-operations-window-functions-example.py
orderItems = spark. \
    read. \
    json('/Users/itversity/Research/data/retail_db_json/order_items')

from pyspark.sql.window import *
from pyspark.sql.functions import *

# keep shuffle partitions low for this small dataset
spark.conf.set('spark.sql.shuffle.partitions', '2')

# spec = Window.partitionBy('order_item_order_id')
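Picking up the commented-out spec: a sketch of the window aggregation this gist presumably builds toward, totaling revenue per order (column names follow the retail_db_json order_items layout; Window, sum, and round come from the wildcard imports above).

spec = Window.partitionBy('order_item_order_id')
orderItems. \
    withColumn('order_revenue', round(sum('order_item_subtotal').over(spec), 2)). \
    select('order_item_order_id', 'order_item_subtotal', 'order_revenue'). \
    show()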
pyspark-01-rdd-wordcount.py
# RDD word count: tokenize, map each word to (word, 1), then reduce by key
data = sc.textFile('/public/randomtextwriter/part-m-00000')
wc = data. \
    flatMap(lambda line: line.split(' ')). \
    map(lambda word: (word, 1)). \
    reduceByKey(lambda x, y: x + y)
# write each (word, count) pair as a comma-separated line
wc. \
    map(lambda rec: rec[0] + ',' + str(rec[1])). \
    saveAsTextFile('/user/training/core/wordcount')
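The same count can be expressed with the DataFrame API; a sketch assuming spark is an active SparkSession (the output path is hypothetical).

from pyspark.sql.functions import explode, split

spark.read.text('/public/randomtextwriter/part-m-00000'). \
    select(explode(split('value', ' ')).alias('word')). \
    groupBy('word'). \
    count(). \
    write. \
    csv('/user/training/core/wordcount_df')  # hypothetical output path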