# importing required libraries
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
# initializing spark session
sc = SparkContext(appName="PySparkShell")
spark = SparkSession(sc)
# define the schema
my_schema = tp.StructType([
    tp.StructField(name='id',    dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='label', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='tweet', dataType=tp.StringType(),  nullable=True)
])
# read the dataset
my_data = spark.read.csv('twitter_sentiments.csv',
                         schema=my_schema,
                         header=True)
# view the data
my_data.show(5)
# print the schema of the file
my_data.printSchema()
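# with the schema defined above, printSchema() should produce output like:
# root
#  |-- id: integer (nullable = true)
#  |-- label: integer (nullable = true)
#  |-- tweet: string (nullable = true)

# NOTE: the ML imports above (RegexTokenizer, StopWordsRemover, Word2Vec,
# Pipeline, LogisticRegression) are not used in this snippet. Below is a
# minimal sketch of how they could be chained into a sentiment pipeline;
# the column names, pattern and vectorSize are illustrative assumptions,
# not the original author's settings.

# split each tweet into word tokens on non-word characters
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# drop common English stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# embed the remaining words into a fixed-size feature vector
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector', vectorSize=100)
# classify the feature vector against the 'label' column
model = LogisticRegression(featuresCol='vector', labelCol='label')

# chain the stages and fit the pipeline on the training data
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])
pipeline_fit = pipeline.fit(my_data)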