andyljones/developer_compensations.py

## developer_compensations.py
# -*- coding: utf-8 -*-
"""
To use this, drop the file

'Full Results - Stack Overflow Developer Survey - 2015.csv'

from

https://drive.google.com/file/d/0Bzd_CzYvUxE5U1NSWnA2SFVKX00/view

into the same directory, then run the script.
"""

import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Multiplier applied to the survey's dollar figures to express them in pounds.
POUNDS_PER_DOLLAR = 0.64

data_path = 'Full Results - Stack Overflow Developer Survey - 2015.csv'
# skiprows=1 discards the first line of the file, so the header is read from
# the second line.
data = pd.read_csv(data_path, skiprows=1)
# Keep only full-time employed respondents based in the United Kingdom.
data = data[
    (data['Country'] == 'United Kingdom')
    & (data['Employment Status'] == 'Employed full-time')
]

occupation_and_compensation = data[['Compensation', 'Occupation']].dropna()

# Each bracket is replaced by one representative dollar figure: the midpoint
# of the closed brackets, 10,000 for the lowest one and 170,000 for the
# open-ended top one. 'Rather not say' becomes NaN so it is dropped below.
# float('nan') is used instead of the deprecated scipy re-export `sp.nan`.
salary_map = {
    'Rather not say': float('nan'),
    'Unemployed': 0,
    'Less than $20,000': 10000,
    '$20,000 - $40,000': 30000,
    '$40,000 - $60,000': 50000,
    '$60,000 - $80,000': 70000,
    '$80,000 - $100,000': 90000,
    '$100,000 - $120,000': 110000,
    '$120,000 - $140,000': 130000,
    '$140,000 - $160,000': 150000,
    'More than $160,000': 170000,
}

# Series.map yields NaN for any answer that is not a key of salary_map, so an
# unexpected bracket is dropped with the other missing values instead of
# turning into None and breaking the multiplication.
occupation_and_compensation['Compensation'] = (
    POUNDS_PER_DOLLAR * occupation_and_compensation['Compensation'].map(salary_map)
)
occupation_and_compensation = occupation_and_compensation.dropna()
compensations = occupation_and_compensation.groupby('Occupation')

def compare_compensations(compensations, min_group_size=25):
    """Draw a horizontal bar chart of mean compensation per occupation.

    Args:
        compensations: A pandas groupby over a frame that has 'Occupation'
            and 'Compensation' columns, grouped by occupation.
        min_group_size: Occupations with this many respondents or fewer are
            left out of the chart. Defaults to 25.

    Returns:
        The matplotlib axes holding the chart, bars sorted by mean
        compensation.
    """
    from matplotlib.ticker import FuncFormatter

    compensations_of_interest = compensations.filter(
        lambda group: len(group) > min_group_size
    )
    # Selecting the column gives a Series, so the plot carries no legend and
    # there is nothing to remove afterwards.
    mean_compensation = (
        compensations_of_interest.groupby('Occupation')['Compensation'].mean()
    )

    ax = mean_compensation.sort_values().plot(kind='barh')
    ax.set_title('Mean compensation by occupation \nn = {:,}'.format(len(compensations_of_interest)))
    # A formatter (rather than fixed tick labels) keeps the thousands
    # separators correct even if the resize below moves the tick positions.
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: '{:,}'.format(int(x))))
    ax.set_xlabel('Mean compensation (GBP)')
    ax.figure.set_size_inches(10, 10)

    return ax

def job_compensation(compensations, name):
    """Plot the cumulative compensation distribution for one occupation.

    The curve is drawn on the current pyplot axes, so repeated calls overlay
    their curves on the same figure.

    Args:
        compensations: A pandas groupby whose groups are occupations and
            whose frames have a 'Compensation' column.
        name: The occupation (group key) to plot.

    Returns:
        The matplotlib axes the curve was drawn on.

    Raises:
        KeyError: If `name` is not one of the groups.
    """
    from matplotlib.ticker import FuncFormatter

    comps_for_job = compensations.get_group(name)['Compensation']
    # Count respondents at each distinct compensation level, order the levels
    # and accumulate: the share of respondents earning that level or less.
    cumulative_pct = (
        100 * comps_for_job.value_counts().sort_index().cumsum()
        / float(len(comps_for_job))
    )

    ax = plt.gca()
    ax.plot(cumulative_pct.index, cumulative_pct.values, label=name)
    ax.set_xlabel('Compensation (GBP)')
    ax.set_ylabel('Cumulative percentage')
    ax.set_title('Cumulative compensation distribution for {}s\nn = {:,}'.format(name.lower(), len(comps_for_job)))
    # A formatter keeps the labels in step with the ticks when the figure is
    # resized; fixed tick labels would not.
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: '{:,}'.format(int(x))))
    ax.figure.set_size_inches(10, 10)

    return ax

compare_compensations(compensations)
# job_compensation draws on the current pyplot axes; open a fresh figure so
# the curve is not drawn on top of the bar chart created above.
plt.figure()
job_compensation(compensations, 'Full-stack web developer')
	# -- coding: utf-8 --
	"""
	To use this, drop the file

	'Full Results - Stack Overflow Developer Survey - 2015.csv'

	from

	https://drive.google.com/file/d/0Bzd_CzYvUxE5U1NSWnA2SFVKX00/view

	into the same directory, then run the script.
	"""

	import scipy as sp
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Multiplier applied to the survey's dollar figures to express them in pounds.
	POUNDS_PER_DOLLAR = 0.64

	data_path = 'Full Results - Stack Overflow Developer Survey - 2015.csv'
	# skiprows=1 discards the first line of the file, so the header is read from
	# the second line.
	data = pd.read_csv(data_path, skiprows=1)
	# Keep only full-time employed respondents based in the United Kingdom.
	data = data[
		(data['Country'] == 'United Kingdom')
		& (data['Employment Status'] == 'Employed full-time')
	]

	occupation_and_compensation = data[['Compensation', 'Occupation']].dropna()

	# Each bracket is replaced by one representative dollar figure: the midpoint
	# of the closed brackets, 10,000 for the lowest one and 170,000 for the
	# open-ended top one. 'Rather not say' becomes NaN so it is dropped below.
	# float('nan') is used instead of the deprecated scipy re-export `sp.nan`.
	salary_map = {
		'Rather not say': float('nan'),
		'Unemployed': 0,
		'Less than $20,000': 10000,
		'$20,000 - $40,000': 30000,
		'$40,000 - $60,000': 50000,
		'$60,000 - $80,000': 70000,
		'$80,000 - $100,000': 90000,
		'$100,000 - $120,000': 110000,
		'$120,000 - $140,000': 130000,
		'$140,000 - $160,000': 150000,
		'More than $160,000': 170000,
	}

	# Series.map yields NaN for any answer that is not a key of salary_map, so an
	# unexpected bracket is dropped with the other missing values instead of
	# turning into None and breaking the multiplication.
	occupation_and_compensation['Compensation'] = (
		POUNDS_PER_DOLLAR * occupation_and_compensation['Compensation'].map(salary_map)
	)
	occupation_and_compensation = occupation_and_compensation.dropna()
	compensations = occupation_and_compensation.groupby('Occupation')

	def compare_compensations(compensations, min_group_size=25):
		"""Draw a horizontal bar chart of mean compensation per occupation.

		Args:
			compensations: A pandas groupby over a frame that has 'Occupation'
				and 'Compensation' columns, grouped by occupation.
			min_group_size: Occupations with this many respondents or fewer
				are left out of the chart. Defaults to 25.

		Returns:
			The matplotlib axes holding the chart, bars sorted by mean
			compensation.
		"""
		from matplotlib.ticker import FuncFormatter

		compensations_of_interest = compensations.filter(
			lambda group: len(group) > min_group_size
		)
		# Selecting the column gives a Series, so the plot carries no legend
		# and there is nothing to remove afterwards.
		mean_compensation = (
			compensations_of_interest.groupby('Occupation')['Compensation'].mean()
		)

		ax = mean_compensation.sort_values().plot(kind='barh')
		ax.set_title('Mean compensation by occupation \nn = {:,}'.format(len(compensations_of_interest)))
		# A formatter (rather than fixed tick labels) keeps the thousands
		# separators correct even if the resize below moves the tick positions.
		ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: '{:,}'.format(int(x))))
		ax.set_xlabel('Mean compensation (GBP)')
		ax.figure.set_size_inches(10, 10)

		return ax

	def job_compensation(compensations, name):
		"""Plot the cumulative compensation distribution for one occupation.

		The curve is drawn on the current pyplot axes, so repeated calls
		overlay their curves on the same figure.

		Args:
			compensations: A pandas groupby whose groups are occupations and
				whose frames have a 'Compensation' column.
			name: The occupation (group key) to plot.

		Returns:
			The matplotlib axes the curve was drawn on.

		Raises:
			KeyError: If `name` is not one of the groups.
		"""
		from matplotlib.ticker import FuncFormatter

		comps_for_job = compensations.get_group(name)['Compensation']
		# Count respondents at each distinct compensation level, order the
		# levels and accumulate: the share earning that level or less.
		cumulative_pct = (
			100 * comps_for_job.value_counts().sort_index().cumsum()
			/ float(len(comps_for_job))
		)

		ax = plt.gca()
		ax.plot(cumulative_pct.index, cumulative_pct.values, label=name)
		ax.set_xlabel('Compensation (GBP)')
		ax.set_ylabel('Cumulative percentage')
		ax.set_title('Cumulative compensation distribution for {}s\nn = {:,}'.format(name.lower(), len(comps_for_job)))
		# A formatter keeps the labels in step with the ticks when the figure
		# is resized; fixed tick labels would not.
		ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: '{:,}'.format(int(x))))
		ax.figure.set_size_inches(10, 10)

		return ax

	compare_compensations(compensations)
	# job_compensation draws on the current pyplot axes; open a fresh figure so
	# the curve is not drawn on top of the bar chart created above.
	plt.figure()
	job_compensation(compensations, 'Full-stack web developer')