Gists by Durga Gadiraju (dgadiraju)
01_nyse_create_table_quickstart.sql
CREATE DATABASE nyse;
CREATE USER 'nyse_user' IDENTIFIED BY 'itversity';
GRANT ALL ON nyse.* TO nyse_user;
GRANT FILE ON *.* TO nyse_user;
GRANT SUPER ON *.* TO nyse_user;
FLUSH PRIVILEGES;
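A quick way to sanity-check the grants above from Python; a minimal sketch, assuming mysql-connector-python is installed and MySQL is running on localhost:

import mysql.connector  # pip install mysql-connector-python

conn = mysql.connector.connect(
    host='localhost',       # assumption: local MySQL instance
    user='nyse_user',
    password='itversity',
    database='nyse'
)
cursor = conn.cursor()
cursor.execute('SHOW GRANTS')   # lists the privileges granted above
for (grant,) in cursor:
    print(grant)
conn.close()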
spark-dataframes-getting-started.java
package spark2demo;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.apache.spark.sql.functions.sum;
import static org.apache.spark.sql.functions.round;
public class GettingStarted {
    public static void main(String[] args) {
        // Assumed continuation (the preview is truncated): build a session; "local" on your system, "yarn" on a cluster.
        SparkSession spark = SparkSession.builder()
            .master("local")
            .appName("GettingStarted")
            .getOrCreate();
        spark.stop();
    }
}
pdc-01-create-hive-tables.sql
CREATE TABLE orders (
  order_id INT,
  order_date STRING,
  order_customer_id INT,
  order_status STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 's3://itversitydata/retail_db/orders';

-- columns after order_item_id are assumed from the standard retail_db layout (the preview is truncated)
CREATE TABLE order_items (
  order_item_id INT,
  order_item_order_id INT,
  order_item_product_id INT,
  order_item_quantity INT,
  order_item_subtotal FLOAT,
  order_item_product_price FLOAT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 's3://itversitydata/retail_db/order_items';
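With both tables in place, a join is the usual next step; a sketch, assuming a pyspark session with Hive support enabled (the query and variable name are illustrative, not from the gist):

revenue_per_status = spark.sql('''
    SELECT o.order_status, round(sum(oi.order_item_subtotal), 2) AS revenue
    FROM orders o JOIN order_items oi
      ON o.order_id = oi.order_item_order_id
    GROUP BY o.order_status
''')
revenue_per_status.show()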
pyspark-dataframes-01-application.properties
[dev]
executionMode = local
input.base.dir = /Users/itversity/Research/data/retail_db
output.base.dir = /Users/itversity/Research/data/bootcamp/pyspark
[prod]
executionMode = yarn-client
input.base.dir = /public/retail_db
output.base.dir = /user/training/bootcamp/pyspark
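The application would typically read these sections with Python's configparser; a minimal sketch, assuming the file name matches the gist title and the environment key is chosen by the caller:

from configparser import ConfigParser

config = ConfigParser()
config.read('application.properties')   # assumption: file name as in the gist title
env = 'dev'                             # or 'prod', typically taken from sys.argv
execution_mode = config.get(env, 'executionMode')
input_base_dir = config.get(env, 'input.base.dir')
output_base_dir = config.get(env, 'output.base.dir')
print(execution_mode, input_base_dir, output_base_dir)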
spark-dataframes-ranking-01-read-data.py
# employeesPath = '/Users/itversity/Research/data/hr_db/employees/part-00000'
employeesPath = '/mnt/c/data/hr_db/employees/part-00000'
employees = spark. \
    read. \
    format('csv'). \
    option('sep', '\t'). \
    schema('''employee_id INT,
        first_name STRING,
        last_name STRING,
        email STRING,
        phone_number STRING,
        hire_date STRING,
        job_id STRING,
        salary FLOAT,
        commission_pct STRING,
        manager_id STRING,
        department_id STRING'''). \
    load(employeesPath)  # columns after last_name are assumed from the standard hr_db layout (preview truncated)
spark-dataframes-windowing-01-read-data.py
(The preview is identical to spark-dataframes-ranking-01-read-data.py above: the same employeesPath and the same tab-separated csv read with the employees schema.)
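Both read scripts feed the ranking and windowing exercises; a sketch of the ranking step over the employees DataFrame built above (the window spec is my assumption, not from the gist):

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# rank employees by salary within each department
spec = Window.partitionBy('department_id').orderBy(col('salary').desc())
employees. \
    withColumn('salary_rank', rank().over(spec)). \
    orderBy('department_id', 'salary_rank'). \
    show()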
pyspark-dataframe-operations-window-functions-example.py
orderItems = spark. \
read. \
json('/Users/itversity/Research/data/retail_db_json/order_items')
from pyspark.sql.window import *
from pyspark.sql.functions import *
spark.conf.set('spark.sql.shuffle.partitions', '2')
# spec = Window.partitionBy('order_item_order_id')
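The preview cuts off at the window spec; an assumed continuation computing per-order revenue with a windowed aggregate (imports repeated here so the sketch is self-contained):

from pyspark.sql.window import Window
from pyspark.sql.functions import sum, round

spec = Window.partitionBy('order_item_order_id')
orderItems. \
    withColumn('order_revenue', round(sum('order_item_subtotal').over(spec), 2)). \
    show()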
pyspark-01-rdd-wordcount.py
data = sc.textFile('/public/randomtextwriter/part-m-00000')
wc = data. \
flatMap(lambda line: line.split(' ')). \
map(lambda word: (word, 1)). \
reduceByKey(lambda x, y: x + y)
wc. \
map(lambda rec: rec[0] + ',' + str(rec[1])). \
saveAsTextFile('/user/training/core/wordcount')
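A follow-on sketch for inspecting the result: pull the ten most frequent words from the wc RDD above (the sort step is an addition for illustration, not part of the gist):

top10 = wc. \
    sortBy(lambda rec: rec[1], ascending=False). \
    take(10)
for word, count in top10:
    print(word, count)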
javascript-fileio-and-db.js
const mongoose = require('mongoose');
const fs = require('fs');
const mongoUrl = 'mongodb://localhost:27017/retail';
mongoose.connect(mongoUrl, { useNewUrlParser: true });
const OrderSchema = mongoose
  .Schema({
    order_id: { type: Number, required: true, unique: true },
    order_date: { type: String, required: true },
    // remaining fields assumed to mirror the retail_db orders table (preview truncated)
    order_customer_id: { type: Number, required: true },
    order_status: { type: String, required: true }
  });
const Order = mongoose.model('Order', OrderSchema);