# vinayakg/demography_analyzer.py

## demography_analyzer.py
import pandas as pd


def calculate_demographic_data(
    print_data=True,
    data_path="boilerplate-demographic-data-analyzer/adult.data.csv",
):
    """Compute summary statistics from the adult demographic CSV file.

    The CSV is expected to provide the columns read below: ``race``, ``sex``,
    ``age``, ``education``, ``salary``, ``hours-per-week``,
    ``native-country`` and ``occupation``.

    Args:
        print_data: When true, print every computed statistic to stdout.
        data_path: Location of the CSV file. Defaults to the path the
            function has always read, so existing callers are unaffected.

    Returns:
        A dict with the keys ``race_count``, ``average_age_men``,
        ``percentage_bachelors``, ``higher_education_rich``,
        ``lower_education_rich``, ``min_work_hours``, ``rich_percentage``,
        ``highest_earning_country``, ``highest_earning_country_percentage``
        and ``top_IN_occupation``. Percentages are on a 0-100 scale and are
        NaN when the group they describe is empty.

    Raises:
        ValueError: Propagated from ``idxmax`` when there is nothing to rank,
            e.g. no row has ``native-country == "India"`` and a ``>50K``
            salary.
    """
    df = pd.read_csv(data_path)

    # Every "rich" statistic below is derived from this one mask.
    is_rich = df["salary"] == ">50K"

    def percent_rich(selection):
        """Return the percentage of rows in *selection* earning >50K."""
        # The mean of a boolean Series is the fraction of True values; for an
        # empty selection it is NaN instead of a ZeroDivisionError.
        return float(is_rich[selection].mean() * 100)

    # How many people of each race are represented in this dataset?
    # A Series with race names as the index labels.
    race_count = df["race"].value_counts()

    # What is the average age of men?
    # No numeric_only flag: it is redundant for a single numeric column and
    # older pandas releases reject it on Series reductions.
    average_age_men = df.loc[df["sex"] == "Male", "age"].mean()

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = float((df["education"] == "Bachelors").mean() * 100)

    # What percentage of people with / without advanced education
    # (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    has_advanced_degree = df["education"].isin(
        ["Bachelors", "Masters", "Doctorate"]
    )
    higher_education_rich = percent_rich(has_advanced_degree)
    lower_education_rich = percent_rich(~has_advanced_degree)

    # What is the minimum number of hours a person works per week?
    min_work_hours = df["hours-per-week"].min()

    # What percentage of the people who work the minimum number of hours per
    # week have a salary of more than 50K?
    rich_percentage = percent_rich(df["hours-per-week"] == min_work_hours)

    # What country has the highest percentage of people that earn >50K, and
    # what is that percentage? Grouping the mask by country yields the share
    # of >50K earners per country in a single pass.
    rich_share_by_country = is_rich.groupby(df["native-country"]).mean() * 100
    highest_earning_country = rich_share_by_country.idxmax()
    highest_earning_country_percentage = rich_share_by_country.max()

    # Identify the most popular occupation for those who earn >50K in India.
    top_IN_occupation = (
        df.loc[is_rich & (df["native-country"] == "India"), "occupation"]
        .value_counts()
        .idxmax()
    )
    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n", race_count)
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(
            f"Percentage with higher education that earn >50K: {higher_education_rich}%"
        )
        print(
            f"Percentage without higher education that earn >50K: {lower_education_rich}%"
        )
        print(f"Min work time: {min_work_hours} hours/week")
        print(
            f"Percentage of rich among those who work fewest hours: {rich_percentage}%"
        )
        print("Country with highest percentage of rich:", highest_earning_country)
        print(
            f"Highest percentage of rich people in country: {highest_earning_country_percentage}%"
        )
        print("Top occupations in India:", top_IN_occupation)

    return {
        "race_count": race_count,
        "average_age_men": average_age_men,
        "percentage_bachelors": percentage_bachelors,
        "higher_education_rich": higher_education_rich,
        "lower_education_rich": lower_education_rich,
        "min_work_hours": min_work_hours,
        "rich_percentage": rich_percentage,
        "highest_earning_country": highest_earning_country,
        "highest_earning_country_percentage": highest_earning_country_percentage,
        "top_IN_occupation": top_IN_occupation,
    }
	import pandas as pd


	def calculate_demographic_data(
	    print_data=True,
	    data_path="boilerplate-demographic-data-analyzer/adult.data.csv",
	):
	    """Compute summary statistics from the adult demographic CSV file.

	    The CSV is expected to provide the columns read below: ``race``,
	    ``sex``, ``age``, ``education``, ``salary``, ``hours-per-week``,
	    ``native-country`` and ``occupation``.

	    Args:
	        print_data: When true, print every computed statistic to stdout.
	        data_path: Location of the CSV file. Defaults to the path the
	            function has always read, so existing callers are unaffected.

	    Returns:
	        A dict with the keys ``race_count``, ``average_age_men``,
	        ``percentage_bachelors``, ``higher_education_rich``,
	        ``lower_education_rich``, ``min_work_hours``, ``rich_percentage``,
	        ``highest_earning_country``,
	        ``highest_earning_country_percentage`` and ``top_IN_occupation``.
	        Percentages are on a 0-100 scale and are NaN when the group they
	        describe is empty.

	    Raises:
	        ValueError: Propagated from ``idxmax`` when there is nothing to
	            rank, e.g. no row has ``native-country == "India"`` and a
	            ``>50K`` salary.
	    """
	    df = pd.read_csv(data_path)

	    # Every "rich" statistic below is derived from this one mask.
	    is_rich = df["salary"] == ">50K"

	    def percent_rich(selection):
	        """Return the percentage of rows in *selection* earning >50K."""
	        # The mean of a boolean Series is the fraction of True values; for
	        # an empty selection it is NaN instead of a ZeroDivisionError.
	        return float(is_rich[selection].mean() * 100)

	    # How many people of each race are represented in this dataset?
	    # A Series with race names as the index labels.
	    race_count = df["race"].value_counts()

	    # What is the average age of men?
	    # No numeric_only flag: it is redundant for a single numeric column and
	    # older pandas releases reject it on Series reductions.
	    average_age_men = df.loc[df["sex"] == "Male", "age"].mean()

	    # What is the percentage of people who have a Bachelor's degree?
	    percentage_bachelors = float((df["education"] == "Bachelors").mean() * 100)

	    # What percentage of people with / without advanced education
	    # (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
	    has_advanced_degree = df["education"].isin(
	        ["Bachelors", "Masters", "Doctorate"]
	    )
	    higher_education_rich = percent_rich(has_advanced_degree)
	    lower_education_rich = percent_rich(~has_advanced_degree)

	    # What is the minimum number of hours a person works per week?
	    min_work_hours = df["hours-per-week"].min()

	    # What percentage of the people who work the minimum number of hours
	    # per week have a salary of more than 50K?
	    rich_percentage = percent_rich(df["hours-per-week"] == min_work_hours)

	    # What country has the highest percentage of people that earn >50K,
	    # and what is that percentage? Grouping the mask by country yields the
	    # share of >50K earners per country in a single pass.
	    rich_share_by_country = is_rich.groupby(df["native-country"]).mean() * 100
	    highest_earning_country = rich_share_by_country.idxmax()
	    highest_earning_country_percentage = rich_share_by_country.max()

	    # Identify the most popular occupation for those who earn >50K in India.
	    top_IN_occupation = (
	        df.loc[is_rich & (df["native-country"] == "India"), "occupation"]
	        .value_counts()
	        .idxmax()
	    )
	    # DO NOT MODIFY BELOW THIS LINE

	    if print_data:
	        print("Number of each race:\n", race_count)
	        print("Average age of men:", average_age_men)
	        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
	        print(
	            f"Percentage with higher education that earn >50K: {higher_education_rich}%"
	        )
	        print(
	            f"Percentage without higher education that earn >50K: {lower_education_rich}%"
	        )
	        print(f"Min work time: {min_work_hours} hours/week")
	        print(
	            f"Percentage of rich among those who work fewest hours: {rich_percentage}%"
	        )
	        print("Country with highest percentage of rich:", highest_earning_country)
	        print(
	            f"Highest percentage of rich people in country: {highest_earning_country_percentage}%"
	        )
	        print("Top occupations in India:", top_IN_occupation)

	    return {
	        "race_count": race_count,
	        "average_age_men": average_age_men,
	        "percentage_bachelors": percentage_bachelors,
	        "higher_education_rich": higher_education_rich,
	        "lower_education_rich": lower_education_rich,
	        "min_work_hours": min_work_hours,
	        "rich_percentage": rich_percentage,
	        "highest_earning_country": highest_earning_country,
	        "highest_earning_country_percentage": highest_earning_country_percentage,
	        "top_IN_occupation": top_IN_occupation,
	    }