Skip to content

Instantly share code, notes, and snippets.

View snehamehrin's full-sized avatar

Sneha Mehrin snehamehrin

  • League
  • Toronto
View GitHub Profile
<!DOCTYPE html>
<html>
<body>
<div class="headerContainer">
<div style="background-color:white;margin: 1rem;padding: 0.5rem;width:10%" id="heroku">
<div style="float:left;width: 24%;">
<svg class="slds-align_absolute-center" width="27" style="fill: #9E7CC1;padding-top: 4px;margin-left: 0px;" height="30" viewBox="0 0 27 30" xmlns="http://www.w3.org/2000/svg">
<title>heroku-logo</title>
<path d="M3 0C1.13 0 0 1.11 0 2.903v24.194C0 28.883 1.13 30 3 30h21c1.863 0 3-1.11 3-2.903V2.903C26.994 1.11 25.863 0 24 0H3zm21.042 2c.508.006.958.448.958.929V27.07c0 .487-.45.929-.958.929H2.958C2.45 28 2 27.558 2 27.071V2.93c0-.488.45-.93.958-.93h21.084zM20 25h-2.781v-8.506c0-.774-.237-1.048-.468-1.208-1.396-.959-5.414-.042-7.834.916L7 17.012 7.006 5h2.816v7.917a20.99 20.99 0 0 1 1.882-.482c2.988-.643 5.184-.47 6.616.505.787.536 1.68 1.59 1.68 3.554V25zm-6-15h3.293A16.109 16.109 0 0 0 20 5h-3.287c-.49 1.188-1.385 3.188-2.713 5zM7 25v-7l3 3.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
-- Daily count of unique questions for the current month, side by side with
-- the same calendar month one year earlier.

-- Current year: restrict to the current month, then count unique question
-- ids per day of month.
q = load "stack_overflow_dataset";
q = filter q by date('creation_date_Year', 'creation_date_Month', 'creation_date_Day') in ["current month".."current month"];
result = group q by 'creation_date_Day';
result = foreach result generate q.'creation_date_Day' as 'creation_date_Day', unique(q.'questionid') as 'A';

-- Previous year: both bounds of the range name the same month, so the window
-- mirrors the current-month filter above shifted back by one year.
u = load "stack_overflow_dataset";
u = filter u by date('creation_date_Year', 'creation_date_Month', 'creation_date_Day') in ["current month - 1 year".."current month - 1 year"];
-- Group by day before aggregating, exactly as the current-year branch does;
-- unique() needs a grouped stream to produce one row per day.
u = group u by 'creation_date_Day';
u = foreach u generate u.'creation_date_Day' as 'creation_date_Day', unique(u.'questionid') as 'B';

-- Left cogroup keeps every day present in the current-year stream; days with
-- no previous-year match get 0 through coalesce().
final_result = cogroup result by 'creation_date_Day' left, u by 'creation_date_Day';
final_result = foreach final_result generate result.'creation_date_Day' as 'Day', sum(result.'A') as 'Current_Year', coalesce(sum(u.'B'), 0) as 'Previous_Year';
import boto3
import setup
def run_jobs():
client = boto3.client('emr', region_name='us-east-1')
response = client.list_clusters(
ClusterStates=['WAITING'
],
# Copy the Redshift JDBC driver jar, the Execute.sh script and the Spark job
# to /home/hadoop/ on the remote EC2 host, authenticating with bigdata.pem.
# The host name must use plain ASCII hyphens (ec2-3-88-110-90); typographic
# dashes pasted from formatted text do not resolve.
scp -i bigdata.pem RedshiftJDBC42-no-awssdk-1.2.20.1043.jar hadoop@ec2-3-88-110-90.compute-1.amazonaws.com:/home/hadoop/
scp -i bigdata.pem Execute.sh hadoop@ec2-3-88-110-90.compute-1.amazonaws.com:/home/hadoop/
scp -i bigdata.pem stack-processing.py hadoop@ec2-3-88-110-90.compute-1.amazonaws.com:/home/hadoop/
# Submit stack-processing.py with the Redshift JDBC driver jar plus the
# spark-avro and spark-redshift packages. The trailing backslash joins the two
# lines into one command; without it the shell would run `spark-submit` with
# no application and then try to execute `--packages` as a separate command.
spark-submit --jars RedshiftJDBC42-no-awssdk-1.2.20.1043.jar \
  --packages org.apache.spark:spark-avro_2.11:2.4.3,com.databricks:spark-redshift_2.11:2.0.1 stack-processing.py
#Import All Functions
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
# Set the s3n access key id and secret key on the Hadoop configuration of the
# active SparkContext (`sc`, created outside this snippet). Both values are
# blank as shown here and must be filled in before the job can reach S3.
_hadoop_conf = sc._jsc.hadoopConfiguration()
_hadoop_conf.set("fs.s3n.awsAccessKeyId", "")
_hadoop_conf.set("fs.s3n.awsSecretAccessKey", "")
# Append the rows of df_duplicates (built outside this snippet) to the
# Redshift table "stackoverflow" through the spark-redshift data source.
# NOTE(review): the password in the JDBC URL is blank as shown here.
_redshift_writer = (
    df_duplicates.write.format("com.databricks.spark.redshift")
    .option("url", "jdbc:redshift://redshift-cluster-1.c9lgtyzxfycf.us-east-1.redshift.amazonaws.com:5439/dev?user=awsuser&password=")
    .option("dbtable", "stackoverflow")
    # Reuse the S3 credentials configured on the Spark side for the connector.
    .option("forward_spark_s3_credentials", "true")
    # S3 location handed to the connector as its temporary directory.
    .option("tempdir", "s3n://stack-overflow-bucket")
)
_redshift_writer.mode("append").save()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.