Skip to content

Instantly share code, notes, and snippets.

@snehamehrin
Created August 3, 2020 15:29
Show Gist options
  • Save snehamehrin/1d2a97f037ffd35dbd07b667f210ae32 to your computer and use it in GitHub Desktop.
Save snehamehrin/1d2a97f037ffd35dbd07b667f210ae32 to your computer and use it in GitHub Desktop.
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Initialize the Spark session.
spark = SparkSession \
    .builder \
    .appName('Stack Overflow ML') \
    .getOrCreate()
print('Session created')
sc = spark.sparkContext

# Load the raw firehose dump (one JSON record per line).
# NOTE(review): path has no leading '/' — on Databricks a mount is normally
# '/mnt/...'; confirm this resolves relative to the intended filesystem root.
stack = sc.textFile('mnt/stack-overflow-bucket/stack_firehose_stream', 1)

# Parse each JSON line into a DataFrame with inferred columns.
df = spark.read.json(stack)

# Total number of raw records (includes duplicates).
df.count()

# Count occurrences of each question_id in the RAW data.
# BUG FIX: the original referenced df_duplicates here, three statements before
# it was defined (NameError at runtime); duplicate counting must use df.
# Also dropped the no-op select([col for col in df.columns]), which shadowed
# the imported `col` function.
# NOTE(review): the name df_no_dup is misleading — this holds per-question
# counts, not deduplicated rows; kept for compatibility with later cells.
df_no_dup = df.groupBy('question_id').agg(F.count('question_id').alias('dup_cnt'))
df_no_dup.count()  # number of distinct question_ids

# Drop duplicate questions, keeping one row per question_id.
df_duplicates = df.dropDuplicates(['question_id'])

# Summary statistics (count/mean/stddev/min/max) for answer_count.
df_duplicates.select('answer_count').describe().show()

# Convert epoch-seconds timestamps to dd/MM/yyyy date strings.
df_duplicates = df_duplicates.withColumn("created_date", F.from_unixtime("creation_date", "dd/MM/yyyy"))
df_duplicates = df_duplicates.withColumn("last_activity_date", F.from_unixtime("last_activity_date", "dd/MM/yyyy"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment