# PySpark Twitter-sentiment pipeline — setup chunk (session, schema, data load).
# importing required libraries
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
# initializing spark session
# SparkSession.builder is the recommended entry point since Spark 2.0;
# getOrCreate() reuses an already-running session instead of raising
# "Cannot run multiple SparkContexts" when the script is re-executed
# (e.g. inside a notebook or shell).
spark = SparkSession.builder.appName("PySparkShell").getOrCreate()
# Keep `sc` bound for any later code that expects the raw SparkContext
# (e.g. StreamingContext(sc, ...) from the imports above).
sc = spark.sparkContext
# define the schema
# One record per tweet: an integer id, an integer sentiment label,
# and the raw tweet text. All columns are nullable.
my_schema = tp.StructType([
    tp.StructField('id', tp.IntegerType(), True),
    tp.StructField('label', tp.IntegerType(), True),
    tp.StructField('tweet', tp.StringType(), True),
])
# read the dataset via the fluent DataFrameReader chain: enforce the
# explicit schema and skip the header row of the CSV file.
my_data = (
    spark.read
         .schema(my_schema)
         .option('header', True)
         .csv('twitter_sentiments.csv')
)
# view the data (first 5 rows)
my_data.show(5)
# print the schema of the file
my_data.printSchema()