Parul Pandey parulnith

## Bar Charts.py
# Bar chart titled "Cities": labels come from the index of `cities`,
# the plotted numbers from its 'values' column.
chart = ctc.Bar("Cities")
chart.set_options(
    labels=[*cities.index],
    x_label='City',
    y_label='Count',
    colors=[
        '#FFF1C5',
        '#F7B7A3',
        '#EA5F89',
        '#9B3192',
        '#57167E',
    ],
)
# Single series, named 'Count'.
chart.add_series('Count', [*cities['values']])

## Doughnut chart.py
# Pie chart with inner_radius=0.5 (the snippet calls this a doughnut chart);
# labels come from the index of `cities`, values from its 'values' column.
chart = ctc.Pie("Top 5 cities by the number of respondents")
chart.set_options(
    labels=[*cities.index],
    inner_radius=0.5,
    colors=[
        '#FFF1C5',
        '#F7B7A3',
        '#EA5F89',
        '#9B3192',
        '#57167E',
        '#47B39C',
        '#00529B',
    ],
)
chart.add_series([*cities['values']])

# Call the load_javascript function the first time a chart is rendered.

## Pie Chart.py
# Pie chart with inner_radius=0; labels come from the index of `gender`,
# values from its 'values' column.
chart = ctc.Pie("Gender of Respondents")
chart.set_options(
    labels=[*gender.index],
    inner_radius=0,
    colors=[
        '#FFF1C1',
        '#F7B7A3',
        '#EA5F89',
    ],
)
chart.add_series([*gender['values']])

# Call the load_javascript function the first time a chart is rendered.

## employee_dataset.py
# Small in-memory employee table: 12 rows, 4 columns
# (Gender, Age, EducationField, MonthlyIncome), in that column order.
df = pd.DataFrame(dict(
    Gender=[
        'Female', 'Male', 'Male', 'Male', 'Male', 'Female',
        'Male', 'Male', 'Male', 'Female', 'Male', 'Female',
    ],
    Age=[
        41, 49, 37, 33, 27, 32,
        59, 30, 38, 36, 35, 29,
    ],
    EducationField=[
        'Life Sciences', 'Engineering', 'Life Sciences', 'Life Sciences',
        'Medical', 'Life Sciences', 'Life Sciences', 'Life Sciences',
        'Engineering', 'Medical', 'Life Sciences', 'Life Sciences',
    ],
    MonthlyIncome=[
        5993, 5130, 2090, 2909, 3468, 3068,
        2670, 2693, 9526, 5237, 2426, 4193,
    ],
))

## species.py
# One subset of `df` per value of the 'species' column.
df_Adelie, df_Gentoo, df_Chinstrap = (
    df[df['species'] == species_name]
    for species_name in ('Adelie', 'Gentoo', 'Chinstrap')
)

# Pair each subset with a colour name, in the same order.
datasets = [df_Adelie, df_Gentoo, df_Chinstrap]
color = ['skyblue', 'red', 'orange']
zip_datasets_color = zip(datasets, color)
for d,c in zip_datasets_color:
    g = sns.lmplot(x = 'culmen_length_mm',
                   y = 'culmen_depth_mm',

## entire_population.py
# lmplot of culmen depth against culmen length over all rows of `df`.
sns.lmplot(x='culmen_length_mm', y='culmen_depth_mm', data=df)

# First element of the pearsonr result, written onto the current axes
# in axes-relative coordinates.
r = stats.pearsonr(df['culmen_length_mm'], df['culmen_depth_mm'])[0]
ax = plt.gca()
ax.text(.03, 1, f'r={r:.3f}', transform=ax.transAxes)

# Display the figure.
plt.show()

## evaluate.py
%matplotlib inline
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import numpy as np


def get_auc(labels, scores):
    """Compute the area under the ROC curve for ``scores`` against ``labels``.

    Passes both arguments to ``roc_curve`` and feeds the resulting false- and
    true-positive rates to ``auc``.

    NOTE(review): no ``return`` is visible, so as shown the function returns
    ``None`` and ``auc_score`` is discarded. The snippet looks truncated;
    confirm against the full source before relying on a return value.
    """
    # `thresholds` is unpacked but not used in the visible body.
    fpr, tpr, thresholds = roc_curve(labels, scores)
    auc_score = auc(fpr, tpr)

## data.py
# Load "diabetes.csv" (path relative to the working directory) into a DataFrame.
df = pd.read_csv("diabetes.csv")
# Preview the first rows; the result is only displayed in an interactive session.
df.head()

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

## timeseries.py
# Aggregate 'ndvi' with rf_agg_mean for each (year, week-of-year) of 'datetime'.
time_series = (
    df.groupBy(
        F.year('datetime').alias('year'),
        F.weekofyear('datetime').alias('week'),
    )
    .agg(rf_agg_mean('ndvi').alias('mean_ndvi'))
)

# Bring the aggregated result into pandas.
ts_pd = time_series.toPandas()

# Order rows by year then week, in place.
ts_pd.sort_values(['year', 'week'], inplace=True)
# Compact yyyy_ww label built from the year and the zero-padded week number.
ts_pd['year_week'] = ts_pd.apply(
    lambda row: f'{row.year:g}_{row.week:02g}',
    axis=1,
)

## spark_dataframe.py
# Project the band/metadata columns plus the extent and CRS derived from 'red',
# then keep rows whose no-data cell count is below 800, i.e. tiles that have
# lots of valid data (presumably 0 is treated as no-data in 'red'; confirm).
(
    df.select(
        'red',
        'nir',
        'datetime',
        'id',
        rf_extent('red').alias('extent'),
        rf_crs('red').alias('crs'),
    )
    .filter(rf_no_data_cells(rf_with_no_data('red', 0)) < 800)
)
	# NOTE(review): this tab-indented region appears to be a whitespace-mangled
	# repeat of the chart snippets earlier in the file - confirm it is needed.
	# Bar chart titled "Cities": labels from the index of `cities`, values from
	# its 'values' column, added as a series named 'Count'.
	chart = ctc.Bar("Cities")

	chart.set_options(
	labels=list(cities.index),
	x_label='City',
	y_label='Count',
	colors=['#FFF1C5','#F7B7A3','#EA5F89','#9B3192','#57167E'],
	)
	chart.add_series('Count',list(cities['values']))
	# Pie chart with inner_radius=0.5, built from the same `cities` data.
	chart = ctc.Pie("Top 5 cities by the number of respondents")

	chart.set_options(
	labels=list(cities.index),
	inner_radius=0.5,
	colors=['#FFF1C5','#F7B7A3','#EA5F89','#9B3192','#57167E','#47B39C','#00529B'],
	)
	chart.add_series(list(cities['values']))

	# Call the load_javascript function the first time a chart is rendered.
	# Pie chart with inner_radius=0: labels from the index of `gender`, values
	# from its 'values' column.
	chart = ctc.Pie("Gender of Respondents")

	chart.set_options(
	labels=list(gender.index),
	inner_radius=0,
	colors=['#FFF1C1','#F7B7A3','#EA5F89'],
	)
	chart.add_series(list(gender['values']))

	# Calling the load_javascript function when rendering chart first time.
	# In-memory employee table with columns Gender, Age, EducationField and
	# MonthlyIncome (12 values each).
	df = pd.DataFrame({
	'Gender' : ['Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male','Male', 'Female','Male', 'Female'],
	'Age' : [41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29],
	'EducationField': ['Life Sciences', 'Engineering', 'Life Sciences', 'Life Sciences', 'Medical', 'Life Sciences', 'Life Sciences', 'Life Sciences', 'Engineering', 'Medical', 'Life Sciences', 'Life Sciences'],
	'MonthlyIncome': [5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193]
	})
	# NOTE(review): the frame built just above has no 'species' column, so these
	# filters presumably target a different `df` (separate snippets pasted
	# together) - confirm before running them in sequence.
	df_Adelie = df[df['species'] == 'Adelie']
	df_Gentoo = df[df['species'] == 'Gentoo']
	df_Chinstrap = df[df['species'] == 'Chinstrap']

	# Pair each species subset with a colour name, in the same order.
	datasets = [df_Adelie,df_Gentoo,df_Chinstrap]
	color = ['skyblue','red','orange']
	zip_datasets_color = zip(datasets, color)
	for d,c in zip_datasets_color:
	g = sns.lmplot(x = 'culmen_length_mm',
	y = 'culmen_depth_mm',
	# lmplot of culmen depth against culmen length over all rows of `df`.
	sns.lmplot(x = 'culmen_length_mm',y = 'culmen_depth_mm', data = df);

	# Compute the correlation coefficient and superimpose it on the plot
	# (first element of the pearsonr result, placed in axes-relative coordinates).
	r = stats.pearsonr(df['culmen_length_mm'], df['culmen_depth_mm'])[0]
	ax = plt.gca()
	ax.text(.03, 1, 'r={:.3f}'.format(r),
	transform=ax.transAxes)

	# Display the plot
	plt.show()
	%matplotlib inline
	from sklearn.metrics import roc_curve, precision_recall_curve, auc
	import matplotlib.pyplot as plt
	import numpy as np



	# Computes the ROC curve for `scores` against `labels` and the area under it.
	# NOTE(review): the body lines sit at the same indent as the `def`, so this
	# copy is not valid Python as written, and no return statement is visible -
	# the snippet looks truncated and whitespace-mangled; confirm against the source.
	def get_auc(labels, scores):
	fpr, tpr, thresholds = roc_curve(labels, scores)
	auc_score = auc(fpr, tpr)
	# Load "diabetes.csv" (path relative to the working directory) into a DataFrame.
	df = pd.read_csv("diabetes.csv")
	# Preview the first rows; the result is only displayed in an interactive session.
	df.head()

	Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
	0 6 148 72 35 0 33.6 0.627 50 1
	1 1 85 66 29 0 26.6 0.351 31 0
	2 8 183 64 0 0 23.3 0.672 32 1
	3 1 89 66 23 94 28.1 0.167 21 0
	4 0 137 40 35 168 43.1 2.288 33 1
	# Aggregate 'ndvi' with rf_agg_mean for each (year, week-of-year) of 'datetime'.
	time_series = df.groupBy(F.year('datetime').alias('year'),
	F.weekofyear('datetime').alias('week')) \
	.agg(rf_agg_mean('ndvi').alias('mean_ndvi'))

	# Bring the aggregated result into pandas.
	ts_pd = time_series.toPandas()

	# Visualizing using matplotlib: order rows by year then week, in place.
	ts_pd.sort_values(['year', 'week'], inplace=True)
	# Create a compact label of year and week number yyyy_ww
	ts_pd['year_week'] = ts_pd.apply(lambda r:'{0:g}_{1:02g}'.format(r.year, r.week), axis=1)
	# Project the band/metadata columns plus extent and CRS derived from 'red',
	# then keep rows whose no-data cell count is below 800.
	df.select('red',
	'nir',
	'datetime',
	'id',
	rf_extent('red').alias('extent'),
	rf_crs('red').alias('crs')) \
	.filter(rf_no_data_cells(rf_with_no_data('red', 0)) < 800)
	# show tiles that have lots of valid data