Youssef Hosni youssefHosni

## data extraction and cleaning
# Keep a hand-picked set of attribute rows, flip the table so those attributes
# become columns, and give them short readable names.

# Row numbers of the attributes to keep. They are shifted down by 2 before
# being used as positional (iloc) indices -- presumably they were read off a
# 1-based sheet that also has a header row; TODO confirm.
important_features_index = [
    9, 14, 17, 20,
    692, 699, 700, 701, 702, 703, 704, 705,
    1368, 1370, 1371,
    1410, 1411,
]
important_features_index = [x - 2 for x in important_features_index]

# Short labels for the selected attributes, listed in the same order as the
# row numbers above (17 entries each).
new_columns_name = [
    'Total population',
    'age 15-24',
    'age 25-34',
    'age 35-44',
    'people with income',
    '20-30 thousand',
    '30-40 thousand',
    '40-50 thousand',
    '50-60 thousand',
    '60-80 thousand',
    '80-90 thousand',
    '100 thousand and more',
    'Total population over 15 for education',
    'High school diploma or equivalent',
    'Postsecondary certificate, diploma or degree',
    'In the labour force',
    'Employed',
]

# Pick the rows by position, index them by attribute name, then transpose so
# that the attributes become the columns.
demographics_data = (
    demographics_data.iloc[important_features_index, :]
    .set_index('Attribute')
    .transpose()
)
demographics_data.columns = new_columns_name

# Move the former column labels out of the index into a 'Neighborhood'
# column, then discard the rows labelled 0, 1 and 2 -- presumably
# non-neighbourhood columns of the source table; TODO confirm.
# No `axis` argument is passed to drop(): `index=` already targets the row
# axis and pandas ignores `axis` whenever `index=` is given.
demographics_data = (
    demographics_data.reset_index(drop=False)
    .rename(columns={'index': 'Neighborhood'})
    .drop(index=[0, 1, 2])
)

## printing the dataset shape and the first ten rows
# Report the frame's shape, then preview its leading rows. When run as a plain
# script the value of the bare expression on the last line is discarded.
print("The shape of the demograhiics dataset is ", demographics_data.shape)
demographics_data.head(n=10)

## Category exploration
# Print how many distinct values the 'Category' column holds, then list them.
print('The number of Categoreis in the data', len(demographics_data['Category'].unique()))
demographics_data['Category'].unique()

## plotting the categories
# Count the non-null entries of every column per category; the 'Topic' count
# is used as the bar height for each category.
df_grouped_categorey = demographics_data.groupby('Category').count()
df_grouped_categorey = df_grouped_categorey.reset_index()
fig = px.bar(data_frame=df_grouped_categorey, x='Category', y='Topic')
fig.show()


## saving the features of each category
def _characteristics_of(category):
    """Return the unique 'Characteristic' values of the rows in *category*."""
    in_category = demographics_df['Category'] == category
    return demographics_df.loc[in_category, 'Characteristic'].unique()


Language_features = _characteristics_of('Language')

Ethnocultural_diversity_features = _characteristics_of('Ethnocultural diversity')

Income_features = _characteristics_of('Income')

Immigration_citizenship_features = _characteristics_of('Immigration and citizenship')

Families_households_marital_status_features = _characteristics_of('Families, households and marital status')

## Transposing
# Remove the 'Category' and 'Topic' columns in place.
demographics_df.drop(columns=['Category', 'Topic'], inplace=True)

# Flip rows and columns.
demographics_df = demographics_df.transpose()

# Use the first transposed row as the column names, then remove that row.
demographics_df = demographics_df.rename(columns=demographics_df.iloc[0])
demographics_df = demographics_df.drop(demographics_df.index[0])

# Expose the row labels as a regular 'Neighborhood' column.
demographics_df = demographics_df.reset_index()
demographics_df = demographics_df.rename(columns={'index': 'Neighborhood'})
demographics_df.head()

## Print the features
# NOTE(review): Aboriginal_Peoples_features is not assigned anywhere in the
# visible code; presumably it is built like the other *_features arrays --
# TODO confirm.
demographics_df[Aboriginal_Peoples_features].head()

## checking missing data in the Aboriginal Peoples features
# Number of missing values per feature column of this category.
demographics_df[Aboriginal_Peoples_features].isnull().sum()

## checking the types of the Aboriginal Peoples features
demographics_df[Aboriginal_Peoples_features].info()

## Convert the Aboriginal Peoples features
# Cast the feature columns to float, going through str first, then show the
# resulting column types.
demographics_df[Aboriginal_Peoples_features] = (
    demographics_df[Aboriginal_Peoples_features]
    .astype(str)
    .astype(float)
)
demographics_df[Aboriginal_Peoples_features].info()
	# Keep a hand-picked set of attribute rows, flip the table so those
	# attributes become columns, and give them short readable names.

	# Row numbers of the attributes to keep. They are shifted down by 2 before
	# being used as positional (iloc) indices -- presumably they were read off
	# a 1-based sheet that also has a header row; TODO confirm.
	important_features_index = [
		9, 14, 17, 20,
		692, 699, 700, 701, 702, 703, 704, 705,
		1368, 1370, 1371,
		1410, 1411,
	]
	important_features_index = [x - 2 for x in important_features_index]

	# Short labels for the selected attributes, listed in the same order as the
	# row numbers above (17 entries each).
	new_columns_name = [
		'Total population',
		'age 15-24',
		'age 25-34',
		'age 35-44',
		'people with income',
		'20-30 thousand',
		'30-40 thousand',
		'40-50 thousand',
		'50-60 thousand',
		'60-80 thousand',
		'80-90 thousand',
		'100 thousand and more',
		'Total population over 15 for education',
		'High school diploma or equivalent',
		'Postsecondary certificate, diploma or degree',
		'In the labour force',
		'Employed',
	]

	# Pick the rows by position, index them by attribute name, then transpose
	# so that the attributes become the columns.
	demographics_data = (
		demographics_data.iloc[important_features_index, :]
		.set_index('Attribute')
		.transpose()
	)
	demographics_data.columns = new_columns_name

	# Move the former column labels out of the index into a 'Neighborhood'
	# column, then discard the rows labelled 0, 1 and 2 -- presumably
	# non-neighbourhood columns of the source table; TODO confirm.
	# No `axis` argument is passed to drop(): `index=` already targets the row
	# axis and pandas ignores `axis` whenever `index=` is given.
	demographics_data = (
		demographics_data.reset_index(drop=False)
		.rename(columns={'index': 'Neighborhood'})
		.drop(index=[0, 1, 2])
	)
	# Report the frame's shape and preview its first ten rows.
	print("The shape of the demograhiics dataset is ", demographics_data.shape)
	demographics_data.head(n=10)

	# Print how many distinct values the 'Category' column holds, then list them.
	print('The number of Categoreis in the data', len(demographics_data['Category'].unique()))
	demographics_data['Category'].unique()

	# Count the non-null entries of every column per category; the 'Topic'
	# count is used as the bar height for each category.
	df_grouped_categorey = demographics_data.groupby('Category').count()
	df_grouped_categorey = df_grouped_categorey.reset_index()
	fig = px.bar(data_frame=df_grouped_categorey, x='Category', y='Topic')
	fig.show()
	def _characteristics_of(category):
		"""Return the unique 'Characteristic' values of the rows in *category*."""
		in_category = demographics_df['Category'] == category
		return demographics_df.loc[in_category, 'Characteristic'].unique()

	Language_features = _characteristics_of('Language')

	Ethnocultural_diversity_features = _characteristics_of('Ethnocultural diversity')

	Income_features = _characteristics_of('Income')

	Immigration_citizenship_features = _characteristics_of('Immigration and citizenship')

	Families_households_marital_status_features = _characteristics_of('Families, households and marital status')
	# Remove the 'Category' and 'Topic' columns in place.
	demographics_df.drop(columns=['Category', 'Topic'], inplace=True)

	# Flip rows and columns.
	demographics_df = demographics_df.transpose()

	# Use the first transposed row as the column names, then remove that row.
	demographics_df = demographics_df.rename(columns=demographics_df.iloc[0])
	demographics_df = demographics_df.drop(demographics_df.index[0])

	# Expose the row labels as a regular 'Neighborhood' column.
	demographics_df = demographics_df.reset_index()
	demographics_df = demographics_df.rename(columns={'index': 'Neighborhood'})
	demographics_df.head()
	# Cast the feature columns to float, going through str first, then show
	# the resulting column types.
	# NOTE(review): Aboriginal_Peoples_features is not assigned anywhere in the
	# visible code -- TODO confirm where it is defined.
	demographics_df[Aboriginal_Peoples_features] = (
		demographics_df[Aboriginal_Peoples_features]
		.astype(str)
		.astype(float)
	)
	demographics_df[Aboriginal_Peoples_features].info()