-- Per-day distinct question counts for the current month ('A'), left-joined on
-- 'creation_date_Day' with distinct question counts from the prior-year window ('B').

-- Current-period stream: distinct 'questionid' per day for the current month.
q = load "stack_overflow_dataset";
q = filter q by date('creation_date_Year', 'creation_date_Month', 'creation_date_Day') in ["current month".."current month"];
result = group q by 'creation_date_Day';
result = foreach result generate q.'creation_date_Day' as 'creation_date_Day', unique(q.'questionid') as 'A';

-- Prior-period stream: distinct 'questionid' per day for the prior-year window.
u = load "stack_overflow_dataset";
-- NOTE(review): this range starts at "1 year ago" and ends at "current month - 1 year";
-- confirm this is the intended comparison window for the current-month stream.
u = filter u by date('creation_date_Year', 'creation_date_Month', 'creation_date_Day') in ["1 year ago".."current month - 1 year"];
-- unique() is an aggregate, so the stream has to be grouped by day before the
-- projection, exactly as the current-period stream is grouped above.
u = group u by 'creation_date_Day';
u = foreach u generate u.'creation_date_Day' as 'creation_date_Day', unique(u.'questionid') as 'B';

-- Left outer cogroup keeps every current-period day; days with no prior-period
-- rows get 0 for 'Previous_Year' through coalesce().
final_result = cogroup result by 'creation_date_Day' left, u by 'creation_date_Day';
final_result = foreach final_result generate result.'creation_date_Day' as 'Day', sum(result.'A') as 'Current_Year', coalesce(sum(u.'B'), 0) as 'Previous_Year';
import boto3
import setup
def run_jobs():
client = boto3.client('emr', region_name='us-east-1')
response = client.list_clusters(
# Copy files over SSH as user "hadoop", authenticating with the bigdata.pem key.
# NOTE(review): the jar file name, the host name and the source/destination paths
# are cut off in these commands; complete them before running.
# Host names use ASCII hyphens (the previous en dashes are not valid in a host name).
scp -i bigdata.pem RedshiftJDBC42-no-awssdk- hadoop@ec2-3-88-110-
scp -i bigdata.pem hadoop@ec2-3-88-110-
scp -i bigdata.pem hadoop@ec2-3-88-110-
# Invoke spark-submit with the Redshift JDBC jar plus the spark-avro and
# spark-redshift packages. The trailing backslash keeps --packages on the same
# command instead of running it as a separate one.
# NOTE(review): the jar file name is cut off and no application file is given.
spark-submit --jars RedshiftJDBC42-no-awssdk- \
--packages org.apache.spark:spark-avro_2.11:2.4.3,com.databricks:spark-redshift_2.11:2.0.1
#Import All Functions
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
# Set the s3n access key id and secret access key on the Hadoop configuration
# reached through the Spark context `sc`. Both values are empty strings here.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", "")
hadoop_conf.set("fs.s3n.awsSecretAccessKey", "")
.option("url", "jdbc:redshift://")\
.option("dbtable", "stackoverflow")\
.option("tempdir", "s3n://stack-overflow-bucket")\
