nithyadurai87/populations.py

## populations.py
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from matplotlib import pyplot
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import Row
from pyspark.ml.regression import LinearRegression

# Script: load per-year birth counts from CSV, plot the yearly totals, fit a
# linear regression of total vs. year, and plot predictions for future years.

# Years beyond the observed data for which predictions are requested.
FUTURE_YEARS = [2019, 2020, 2021, 2022, 2023]

# Start a local Spark context and wrap it in a session for DataFrame work.
sc = SparkContext('local')
spark = SparkSession(sc)

# Read the CSV treating the first row as the header and inferring column types.
df1 = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('file:///home/shrini/born_babies.csv')
)

print(df1)
print(df1.columns)
print(df1.toPandas())

# Per-year total: sum of the 'male' and 'female' columns, ordered by year.
df2 = (
    df1.groupBy('yr')
    .agg({'male': 'sum', 'female': 'sum'})
    .select(
        col('yr'),
        (col('sum(male)') + col('sum(female)')).alias('populations'),
    )
    .orderBy('yr')
)

# Convert to pandas once and reuse the result for printing and plotting;
# every toPandas() call collects the whole DataFrame to the driver again.
df2_pd = df2.toPandas()
print(df2_pd)

pyplot.plot(df2_pd.yr, df2_pd.populations)
pyplot.xlabel('Year')
pyplot.ylabel('No. of babies')
pyplot.title('Population includes new born male and female babies')
# NOTE(review): the annotation coordinates are hard-coded data coordinates
# (arrow tip at y=0, text at y=1.1) -- confirm they land where intended on
# the plotted range.
pyplot.annotate(
    'local max',
    xy=(2001, .0),
    xytext=(2002, 1.1),
    fontsize=12,
    arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2),
)
pyplot.show()

# Assemble 'yr' into the 'features' vector column and expose the yearly total
# as 'label'; these are the column names LinearRegression is fitted on below.
assembler = VectorAssembler(inputCols=['yr'], outputCol='features')
train = (
    assembler.transform(df2)
    .withColumn('year', col('yr'))
    .withColumn('label', col('populations'))
)
print(train.toPandas())

# Collect the observed points once; they are needed both to build the
# prediction input and to draw the "actual" line.
observed = train.select('yr', 'populations').collect()
observed_years = [row['yr'] for row in observed]
observed_populations = [row['populations'] for row in observed]

# Prediction input: every observed year followed by the future years, with the
# same 'features' column the model was trained on.
i = assembler.transform(
    sc.parallelize(observed_years + FUTURE_YEARS).map(Row('yr')).toDF()
)

# Despite its name, `model` holds the fitted model's predictions as a pandas
# DataFrame (the name is kept unchanged for compatibility).
model = LinearRegression(maxIter=10).fit(train).transform(i).toPandas()
print(model)

# Label both lines so that the legend actually has entries to display.
pyplot.plot(model.yr, model.prediction, label='Predicted')
pyplot.plot(observed_years, observed_populations, label='Actual')
pyplot.legend(loc=4)
pyplot.title('Prediction on future population')
pyplot.show()
	from pyspark.context import SparkContext
	from pyspark.sql.session import SparkSession
	from pyspark.sql.functions import col
	from matplotlib import pyplot
	from pyspark.ml.feature import VectorAssembler
	from pyspark.sql.types import Row
	from pyspark.ml.regression import LinearRegression

	# Script: load per-year birth counts from CSV, plot the yearly totals, fit a
	# linear regression of total vs. year, and plot predictions for future years.
	#
	# NOTE(review): this section is tab-indented although no enclosing statement
	# is visible in the file, and it mirrors a column-0 copy of the same script
	# earlier in the file -- confirm whether it is a pasted duplicate or is
	# meant to live inside a function.

	# Years beyond the observed data for which predictions are requested.
	FUTURE_YEARS = [2019, 2020, 2021, 2022, 2023]

	# Start a local Spark context and wrap it in a session for DataFrame work.
	sc = SparkContext('local')
	spark = SparkSession(sc)

	# Read the CSV treating the first row as the header and inferring column types.
	df1 = (
		spark.read
		.option('header', 'true')
		.option('inferSchema', 'true')
		.csv('file:///home/shrini/born_babies.csv')
	)

	print(df1)
	print(df1.columns)
	print(df1.toPandas())

	# Per-year total: sum of the 'male' and 'female' columns, ordered by year.
	df2 = (
		df1.groupBy('yr')
		.agg({'male': 'sum', 'female': 'sum'})
		.select(
			col('yr'),
			(col('sum(male)') + col('sum(female)')).alias('populations'),
		)
		.orderBy('yr')
	)

	# Convert to pandas once and reuse the result for printing and plotting;
	# every toPandas() call collects the whole DataFrame to the driver again.
	df2_pd = df2.toPandas()
	print(df2_pd)

	pyplot.plot(df2_pd.yr, df2_pd.populations)
	pyplot.xlabel('Year')
	pyplot.ylabel('No. of babies')
	pyplot.title('Population includes new born male and female babies')
	# NOTE(review): the annotation coordinates are hard-coded data coordinates
	# (arrow tip at y=0, text at y=1.1) -- confirm they land where intended on
	# the plotted range.
	pyplot.annotate(
		'local max',
		xy=(2001, .0),
		xytext=(2002, 1.1),
		fontsize=12,
		arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2),
	)
	pyplot.show()

	# Assemble 'yr' into the 'features' vector column and expose the yearly total
	# as 'label'; these are the column names LinearRegression is fitted on below.
	assembler = VectorAssembler(inputCols=['yr'], outputCol='features')
	train = (
		assembler.transform(df2)
		.withColumn('year', col('yr'))
		.withColumn('label', col('populations'))
	)
	print(train.toPandas())

	# Collect the observed points once; they are needed both to build the
	# prediction input and to draw the "actual" line.
	observed = train.select('yr', 'populations').collect()
	observed_years = [row['yr'] for row in observed]
	observed_populations = [row['populations'] for row in observed]

	# Prediction input: every observed year followed by the future years, with the
	# same 'features' column the model was trained on.
	i = assembler.transform(
		sc.parallelize(observed_years + FUTURE_YEARS).map(Row('yr')).toDF()
	)

	# Despite its name, `model` holds the fitted model's predictions as a pandas
	# DataFrame (the name is kept unchanged for compatibility).
	model = LinearRegression(maxIter=10).fit(train).transform(i).toPandas()
	print(model)

	# Label both lines so that the legend actually has entries to display.
	pyplot.plot(model.yr, model.prediction, label='Predicted')
	pyplot.plot(observed_years, observed_populations, label='Actual')
	pyplot.legend(loc=4)
	pyplot.title('Prediction on future population')
	pyplot.show()