# ubershmekel/h1b.py

## h1b.py
"""
Analyze USA h1b salaries

data from http://www.foreignlaborcert.doleta.gov/quarterlydata.cfm

specifically:
http://www.foreignlaborcert.doleta.gov/pdf/quarter_2_2012/PW_FY2012_Q2.csv
"""

import math
import re
import csv
import os
from collections import namedtuple
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

# Number of entries kept by the Counter.most_common() calls in this file
# (the most frequent job titles in tag_cloud, the most frequent pay values
# in anomalies).
TOPN = 20


def xkcd_colors(cache=[]):
    """
    Return the hex colour codes from 'rgb.txt' that saturated() rejects.

    http://blog.xkcd.com/2010/05/03/color-survey-results/
    http://xkcd.com/color/rgb.txt

    The mutable default argument is used on purpose as a memo: the file is
    only read while the shared list is still empty, and the very same list
    object is returned on every call.
    """
    if cache:
        return cache
    with open('rgb.txt') as rgb_file:
        hex_codes = re.findall(r'#([0-9a-f]+)', rgb_file.read())
    # Build the filtered list completely before touching the shared cache.
    cache.extend([code for code in hex_codes if not saturated(code)])
    return cache

def saturated(color):
    """
    Tell whether the channel sum of a 6-digit hex colour exceeds 510.

    `color` is 'rrggbb' without a leading '#'. The three channels can sum to
    at most 255 * 3 = 765; anything above 510 counts as saturated.
    """
    channels = (color[0:2], color[2:4], color[4:6])
    return sum([int(channel, 16) for channel in channels]) > 510


def load(filename='PW_FY2012_Q2.csv'):
    """
    Read the prevailing-wage CSV and populate the module globals.

    Sets `all_jobs` to a list of Row(pay, state, field, job) namedtuples
    (text fields stripped and lower-cased) and `tech_jobs` to the sorted
    subset whose "field - job" text contains 'program', 'software' or
    'computer'. Rows whose wage is not a number, or is below 10000, are
    skipped.

    filename: path of the CSV file; defaults to the file name the script
        has always used.

    Returns the tuple (all_jobs, tech_jobs) as a convenience; callers that
    ignore the return value keep working through the globals.
    """
    global tech_jobs, all_jobs
    Row = namedtuple('Row', 'pay, state, field, job')
    tech_pattern = re.compile(r'(program|software|computer)')

    all_jobs = []
    # The with-block closes the file; the original left the handle open.
    with open(filename) as fhand:
        reader = csv.reader(fhand)
        header = next(reader)
        wage_i = header.index('PWD_WAGE_RATE')
        field_i = header.index('PWD_SOC_TITLE')
        job_i = header.index('PW_JOB_TITLE')
        # PRIMARY_WORKSITE_STATE or EMPLOYER_STATE ?
        state_i = header.index('PRIMARY_WORKSITE_STATE')

        for row in reader:
            field = row[field_i]
            job = row[job_i]
            state = row[state_i]
            try:
                pay = float(row[wage_i])
            except ValueError:
                # not a number
                continue
            if pay < 10000:
                # under min wage, treated as a data error
                continue
            all_jobs.append(Row(pay,
                                state.strip().lower(),
                                field.strip().lower(),
                                job.strip().lower()))

    # Single-argument print() gives the same output on Python 2 and 3.
    print('%s %d' % ('len jobs', len(all_jobs)))

    # Rows are tuples, so sorting orders them by pay first.
    tech_jobs = sorted(
        row for row in all_jobs
        if tech_pattern.search(row.field + ' - ' + row.job)
    )
    print('%s %d' % ('len tech jobs', len(tech_jobs)))
    return all_jobs, tech_jobs

#data = np.array(new_data)
#wages = data[:,1].astype(float)

def print_ps(wages):
    """
    Print the wage that the top 75, 50, 25, 10, 5, 1 and 0.1 percent exceed.

    wages: sequence of yearly wages. It is sorted here, so the caller does
        not have to pass it in ascending order.

    An empty sequence prints a short notice instead of raising IndexError.
    """
    if len(wages) == 0:
        print('no wages to summarize')
        return
    ordered = sorted(wages)
    count = len(ordered)
    for p in (75, 50, 25, 10, 5, 1, 0.1):
        # Index of the wage that p percent of the entries lie above.
        p_i = int(count * (100 - p) / 100.0)
        min_wage = ordered[p_i]
        # Single-argument print() behaves the same on Python 2 and 3.
        print('%s%% earn more than $%0.0fK' % (p, min_wage / 1000.0))

def state_ps(state=None):
    """
    Print the number of tech jobs and their wage percentiles.

    state: when None, every entry of the module-level `tech_jobs` list is
        used; otherwise only entries whose state equals `state` after
        stripping and lower-casing both sides. The state label is printed
        first in that case.
    """
    if state is None:
        wages = [job.pay for job in tech_jobs]
    else:
        # Single-argument print() gives the same output on Python 2 and 3.
        print('%s %s' % (state, 'state'))
        wanted = state.strip().lower()
        wages = [job.pay for job in tech_jobs
                 if job.state.strip().lower() == wanted]
    print(len(wages))
    print_ps(wages)

def print_percentiles():
    """
    Print tech-wage percentiles for all states together, then for
    'new york', 'florida' and 'california' individually.
    """
    # None selects the unfiltered report, exactly like calling state_ps().
    for state in (None, 'new york', 'florida', 'california'):
        state_ps(state)

#print '-------'
#for i in tech_jobs[int(len(tech_jobs) * .999):]:
#    print i
# Named palettes, each a tuple of five (r, g, b) triples. In the visible part
# of this file they are only mentioned in a commented-out line.
COLOR_SCHEMES = {
    'oldschool': (
        (59, 76, 76), (125, 140, 116), (217, 175, 95), (127, 92, 70), (51, 36, 35),
    ),
    'citrus': (
        (34, 51, 49), (70, 102, 66), (153, 142, 61), (229, 156, 44), (255, 116, 37),
    ),
    'goldfish': (
        (229, 106, 0), (204, 199, 148), (153, 145, 124), (88, 89, 86), (48, 49, 51),
    ),
    'audacity': (
        (181, 40, 65), (255, 192, 81), (255, 137, 57), (232, 95, 77), (89, 0, 81),
    ),
}

import random
# Default CSS written into the <style> element of the static tag cloud.
DEFAULT_STYLE = '''
    .tag_item {
        text-decoration: none;
        font-weight: bold;
        white-space: nowrap;
    }
    .tag_item:hover {
        background-color: #eee;
    }
    '''
def html_tag_cloud(name_sizes, fn, max_size=100, min_size=5, css_class="tag_item", style=DEFAULT_STYLE, count_fmt="{}"):
    """
    Write a static HTML tag cloud to the file `fn`.

    name_sizes: sequence of (name, number) pairs; the number drives the
        font size and is shown as the link title.
    fn: output file path (overwritten).
    max_size: font size in px given to the largest number; the others are
        scaled proportionally and truncated to int.
    min_size: accepted for compatibility, currently unused.
    css_class: class attribute put on every link.
    style: CSS text written into a leading <style> element.
    count_fmt: str.format template applied to each number for the title.

    Each tag gets a random colour from xkcd_colors(). An empty `name_sizes`
    produces a file holding only the <style> element.
    """
    droplet_template = '''<a href="" class="{css_class}" title="{count}" style="font-size: {size}px; color: #{color};">{text}</a> '''
    with open(fn, 'w') as fhand:
        fhand.write('<style>%s</style>' % style)
        if len(name_sizes) == 0:
            return
        # Scale in float: an in-place multiply of an integer array by a
        # float is rejected by NumPy's casting rules.
        raw = np.array([size for _name, size in name_sizes], dtype=float)
        peak = raw.max()
        scale = 1.0 * max_size / peak if peak else 0.0
        sizes = (raw * scale).astype(int)
        palette = xkcd_colors()
        for (name, count), size in zip(name_sizes, sizes):
            # Names come from the data file, so escape the HTML metacharacters.
            text = name.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            line = droplet_template.format(size=size, text=text, count=count_fmt.format(count), color=random.choice(palette), css_class=css_class)
            fhand.write(line)

# HTML page template for the d3 word cloud. It is filled with the % operator
# and takes two values in order: the tooltip suffix and the comma-separated
# JavaScript word objects. Any literal percent sign added here must be doubled.
d3_fmt = '''
<body>
<style>
    svg {
        cursor:default;
    }
</style>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="d3.layout.cloud.js"></script>

<div id="all_frequencies"></div>
<div id="all_wages"></div>


<script>
    var fill = d3.scale.category20();
    var width = 600;
    var height = 600;

    var tooltip = d3.select("body")
        .append("div")
        .style("position", "absolute")
        .style("z-index", "10")
        .style("background-color", "#fff")
        .style("border", "1px solid #000")
        .style("padding", "5px")
        .style("visibility", "hidden")
        .text("a simple tooltip");

    var suffix = "%s";
    var words = [
        %s
      ];
    var normalize_size = 90.0 / d3.max(words, function(d){return d.o_size});
    var cloud = d3.layout.cloud()
      .words(
        words
      )
      .size([width, height])
      .timeInterval(10)
      .font("Impact")
      .fontSize(function(d) { return d.o_size * normalize_size; })
      .rotate(function(d) { return ~~(Math.random() * 5) * 10 - 20; })
      .padding(1)
      .on("end", draw)
      .start();

    function draw(words) {
      d3.select("body").append("svg")
        .attr("width", width)
        .attr("height", height)
      .append("g")
        .attr("transform", "translate(" + (width/2) + "," + (height/2) + ")")
      .selectAll("text")
        .data(words)
      .enter().append("text")
        .style("font-size", function(d) { return d.size + "px"; })
        .style("font-family", "Impact")
        .style("fill", function(d, i) { return fill(i); })
        .attr("text-anchor", "middle")
        .attr("transform", function(d) {
          return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
        })
        .text(function(d) { return d.text; })
        .on("mouseover", function(d){
            d3.select(this).style("opacity", 0.7);
            tooltip.text(d.o_size + " - " + d.text + " " + suffix);
            //tooltip.text(d.tip);
            tooltip.style("visibility", "visible");
            })
        .on("mousemove", function(){return tooltip.style("top", (d3.event.pageY-10)+"px").style("left",(d3.event.pageX+10)+"px");})
        .on("mouseout", function(){
            d3.select(this).style("opacity", 1.0);
            tooltip.style("visibility", "hidden");
            });
    }
</script>
</body>
'''


def _js_escape(text):
    """Escape text for use inside a double-quoted JavaScript string literal."""
    return (text.replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\r', '\\r')
                .replace('\n', '\\n')
                .replace('</', '<\\/'))


def d3_tag_cloud(name_sizes, fn, max_size=90, suffix='usd'):
    """
    Write a d3 word-cloud HTML page to the file `fn`.

    name_sizes: sequence of (name, number) pairs; the number is emitted as
        `o_size` and scaled to a font size by the script in the page.
    fn: output file path (overwritten).
    max_size: accepted for compatibility, currently unused; the template
        hard-codes 90 as the largest font size.
    suffix: unit text appended to the tooltip.

    Names, tooltips and the suffix are escaped so that quotes or
    backslashes in the data cannot break the generated script.
    """
    word_fmt = '{text: "%s", size: 1, o_size: %f, tip: "%s"}'
    words = [
        word_fmt % (_js_escape(name), size, _js_escape("%s %s" % (size, suffix)))
        for name, size in name_sizes
    ]
    html = d3_fmt % (_js_escape(suffix), ','.join(words))
    with open(fn, 'w') as fhand:
        fhand.write(html)


def tag_cloud(jobs, set_name='job'):
    """
    Write two d3 word clouds for the TOPN most common job titles in `jobs`.

    jobs: iterable of rows exposing `.job` and `.pay`.
    set_name: prefix of the output files; '<set_name>_frequencies.html'
        is sized by how often each title occurs, '<set_name>_wages.html'
        by the median pay of each of those titles.

    The numbers 1 and 2 are printed as progress markers.
    """
    job_counts = Counter([row.job for row in jobs])
    job_frequencies = job_counts.most_common(TOPN)

    # ignore rare jobs
    # Single-argument print() behaves the same on Python 2 and 3.
    print(1)
    pop_jobs = set(name for name, _count in job_frequencies)
    job_pays = {}
    for row in jobs:
        if row.job in pop_jobs:
            job_pays.setdefault(row.job, []).append(row.pay)

    print(2)
    job_meds = sorted(
        [(name, np.median(wages)) for name, wages in job_pays.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )

    d3_tag_cloud(job_frequencies, set_name + '_frequencies.html', suffix='')
    d3_tag_cloud(job_meds, set_name + '_wages.html')
    # html_tag_cloud(...) can be called with the same two lists to get
    # static-HTML clouds instead (pass count_fmt='${}' for the wages).


def anomalies(jobs=None, topn=None):
    """
    Return the most frequent pay values as (pay, count) pairs.

    jobs: iterable of rows exposing `.pay`; defaults to the module-level
        `all_jobs` list filled by load().
    topn: how many pairs to return; defaults to TOPN.

    The original body referenced an undefined name `h1b` and threw its
    result away; the pairs are now returned to the caller.
    """
    if jobs is None:
        jobs = all_jobs
    if topn is None:
        topn = TOPN
    pay_freqs = Counter(row.pay for row in jobs)
    return pay_freqs.most_common(topn)

def main():
    """
    Load the CSV, print the wage percentiles and write the tag-cloud pages
    for all jobs ('all') and for the tech subset ('tech').
    """
    load()
    print_percentiles()
    # all_jobs and tech_jobs are module globals set by load() above.
    for jobs, label in ((all_jobs, 'all'), (tech_jobs, 'tech')):
        tag_cloud(jobs, label)


if __name__ == '__main__':
    main()

'''
trunc = 1.0 / 200
bottom = wages[len(wages) * trunc]
top = wages[-len(wages) * trunc]
#plt.hist(wages, range=[bottom, top])
span = top - bottom
n_bins = 100
bins = np.zeros(n_bins)
for w in wages:
    percentile_i = int(n_bins * (w - bottom) / span)
    if percentile_i < 0:
        percentile_i = 0
    elif percentile_i >= n_bins:
        percentile_i = n_bins - 1
    bins[percentile_i] += 1

plt.bar([i * span / n_bins + bottom for i in range(n_bins)], bins, width=span/n_bins)
'''
	"""
	Analyze USA h1b salaries

	data from http://www.foreignlaborcert.doleta.gov/quarterlydata.cfm

	specifically:
	http://www.foreignlaborcert.doleta.gov/pdf/quarter_2_2012/PW_FY2012_Q2.csv
	"""

	import math
	import re
	import csv
	import os
	from collections import namedtuple
	from collections import Counter

	import numpy as np
	import matplotlib.pyplot as plt

	# Number of entries kept by the Counter.most_common() calls in this file
	# (the most frequent job titles in tag_cloud, the most frequent pay values
	# in anomalies).
	TOPN = 20


	def xkcd_colors(cache=[]):
	    """
	    Return the hex colour codes from 'rgb.txt' that saturated() rejects.

	    http://blog.xkcd.com/2010/05/03/color-survey-results/
	    http://xkcd.com/color/rgb.txt

	    The mutable default argument is used on purpose as a memo: the file is
	    only read while the shared list is still empty, and the very same list
	    object is returned on every call.
	    """
	    if cache:
	        return cache
	    with open('rgb.txt') as rgb_file:
	        hex_codes = re.findall(r'#([0-9a-f]+)', rgb_file.read())
	    # Build the filtered list completely before touching the shared cache.
	    cache.extend([code for code in hex_codes if not saturated(code)])
	    return cache

	def saturated(color):
	    """
	    Tell whether the channel sum of a 6-digit hex colour exceeds 510.

	    `color` is 'rrggbb' without a leading '#'. The three channels can sum to
	    at most 255 * 3 = 765; anything above 510 counts as saturated.
	    """
	    channels = (color[0:2], color[2:4], color[4:6])
	    return sum([int(channel, 16) for channel in channels]) > 510


	def load(filename='PW_FY2012_Q2.csv'):
	    """
	    Read the prevailing-wage CSV and populate the module globals.

	    Sets `all_jobs` to a list of Row(pay, state, field, job) namedtuples
	    (text fields stripped and lower-cased) and `tech_jobs` to the sorted
	    subset whose "field - job" text contains 'program', 'software' or
	    'computer'. Rows whose wage is not a number, or is below 10000, are
	    skipped.

	    filename: path of the CSV file; defaults to the file name the script
	        has always used.

	    Returns the tuple (all_jobs, tech_jobs) as a convenience; callers that
	    ignore the return value keep working through the globals.
	    """
	    global tech_jobs, all_jobs
	    Row = namedtuple('Row', 'pay, state, field, job')
	    # Plain '|' alternation; a backslash-escaped pipe would only match a
	    # literal '|' character and select nothing.
	    tech_pattern = re.compile(r'(program|software|computer)')

	    all_jobs = []
	    # The with-block closes the file; the original left the handle open.
	    with open(filename) as fhand:
	        reader = csv.reader(fhand)
	        header = next(reader)
	        wage_i = header.index('PWD_WAGE_RATE')
	        field_i = header.index('PWD_SOC_TITLE')
	        job_i = header.index('PW_JOB_TITLE')
	        # PRIMARY_WORKSITE_STATE or EMPLOYER_STATE ?
	        state_i = header.index('PRIMARY_WORKSITE_STATE')

	        for row in reader:
	            field = row[field_i]
	            job = row[job_i]
	            state = row[state_i]
	            try:
	                pay = float(row[wage_i])
	            except ValueError:
	                # not a number
	                continue
	            if pay < 10000:
	                # under min wage, treated as a data error
	                continue
	            all_jobs.append(Row(pay,
	                                state.strip().lower(),
	                                field.strip().lower(),
	                                job.strip().lower()))

	    # Single-argument print() gives the same output on Python 2 and 3.
	    print('%s %d' % ('len jobs', len(all_jobs)))

	    # Rows are tuples, so sorting orders them by pay first.
	    tech_jobs = sorted(
	        row for row in all_jobs
	        if tech_pattern.search(row.field + ' - ' + row.job)
	    )
	    print('%s %d' % ('len tech jobs', len(tech_jobs)))
	    return all_jobs, tech_jobs

	#data = np.array(new_data)
	#wages = data[:,1].astype(float)

	def print_ps(wages):
	    """
	    Print the wage that the top 75, 50, 25, 10, 5, 1 and 0.1 percent exceed.

	    wages: sequence of yearly wages. It is sorted here, so the caller does
	        not have to pass it in ascending order.

	    An empty sequence prints a short notice instead of raising IndexError.
	    """
	    if len(wages) == 0:
	        print('no wages to summarize')
	        return
	    ordered = sorted(wages)
	    count = len(ordered)
	    for p in (75, 50, 25, 10, 5, 1, 0.1):
	        # Index of the wage that p percent of the entries lie above.
	        p_i = int(count * (100 - p) / 100.0)
	        min_wage = ordered[p_i]
	        # Single-argument print() behaves the same on Python 2 and 3.
	        print('%s%% earn more than $%0.0fK' % (p, min_wage / 1000.0))

	def state_ps(state=None):
	    """
	    Print the number of tech jobs and their wage percentiles.

	    state: when None, every entry of the module-level `tech_jobs` list is
	        used; otherwise only entries whose state equals `state` after
	        stripping and lower-casing both sides. The state label is printed
	        first in that case.
	    """
	    if state is None:
	        wages = [job.pay for job in tech_jobs]
	    else:
	        # Single-argument print() gives the same output on Python 2 and 3.
	        print('%s %s' % (state, 'state'))
	        wanted = state.strip().lower()
	        wages = [job.pay for job in tech_jobs
	                 if job.state.strip().lower() == wanted]
	    print(len(wages))
	    print_ps(wages)

	def print_percentiles():
	    """
	    Print tech-wage percentiles for all states together, then for
	    'new york', 'florida' and 'california' individually.
	    """
	    # None selects the unfiltered report, exactly like calling state_ps().
	    for state in (None, 'new york', 'florida', 'california'):
	        state_ps(state)

	#print '-------'
	#for i in tech_jobs[int(len(tech_jobs) * .999):]:
	# print i
	# Named palettes, each a tuple of five (r, g, b) triples. In the visible part
	# of this file they are only mentioned in a commented-out line.
	COLOR_SCHEMES = {
	    'oldschool': (
	        (59, 76, 76), (125, 140, 116), (217, 175, 95), (127, 92, 70), (51, 36, 35),
	    ),
	    'citrus': (
	        (34, 51, 49), (70, 102, 66), (153, 142, 61), (229, 156, 44), (255, 116, 37),
	    ),
	    'goldfish': (
	        (229, 106, 0), (204, 199, 148), (153, 145, 124), (88, 89, 86), (48, 49, 51),
	    ),
	    'audacity': (
	        (181, 40, 65), (255, 192, 81), (255, 137, 57), (232, 95, 77), (89, 0, 81),
	    ),
	}

	import random
	# Default CSS written into the <style> element of the static tag cloud.
	DEFAULT_STYLE = '''
	    .tag_item {
	        text-decoration: none;
	        font-weight: bold;
	        white-space: nowrap;
	    }
	    .tag_item:hover {
	        background-color: #eee;
	    }
	    '''
	def html_tag_cloud(name_sizes, fn, max_size=100, min_size=5, css_class="tag_item", style=DEFAULT_STYLE, count_fmt="{}"):
	    """
	    Write a static HTML tag cloud to the file `fn`.

	    name_sizes: sequence of (name, number) pairs; the number drives the
	        font size and is shown as the link title.
	    fn: output file path (overwritten).
	    max_size: font size in px given to the largest number; the others are
	        scaled proportionally and truncated to int.
	    min_size: accepted for compatibility, currently unused.
	    css_class: class attribute put on every link.
	    style: CSS text written into a leading <style> element.
	    count_fmt: str.format template applied to each number for the title.

	    Each tag gets a random colour from xkcd_colors(). An empty `name_sizes`
	    produces a file holding only the <style> element.
	    """
	    droplet_template = '''<a href="" class="{css_class}" title="{count}" style="font-size: {size}px; color: #{color};">{text}</a> '''
	    with open(fn, 'w') as fhand:
	        fhand.write('<style>%s</style>' % style)
	        if len(name_sizes) == 0:
	            return
	        # Scale in float: an in-place multiply of an integer array by a
	        # float is rejected by NumPy's casting rules.
	        raw = np.array([size for _name, size in name_sizes], dtype=float)
	        peak = raw.max()
	        scale = 1.0 * max_size / peak if peak else 0.0
	        sizes = (raw * scale).astype(int)
	        palette = xkcd_colors()
	        for (name, count), size in zip(name_sizes, sizes):
	            # Names come from the data file, so escape the HTML metacharacters.
	            text = name.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
	            line = droplet_template.format(size=size, text=text, count=count_fmt.format(count), color=random.choice(palette), css_class=css_class)
	            fhand.write(line)

	# HTML page template for the d3 word cloud. It is filled with the % operator
	# and takes two values in order: the tooltip suffix and the comma-separated
	# JavaScript word objects. Any literal percent sign added here must be doubled.
	d3_fmt = '''
	<body>
	<style>
	    svg {
	        cursor:default;
	    }
	</style>
	<script src="http://d3js.org/d3.v3.min.js"></script>
	<script src="d3.layout.cloud.js"></script>

	<div id="all_frequencies"></div>
	<div id="all_wages"></div>


	<script>
	    var fill = d3.scale.category20();
	    var width = 600;
	    var height = 600;

	    var tooltip = d3.select("body")
	        .append("div")
	        .style("position", "absolute")
	        .style("z-index", "10")
	        .style("background-color", "#fff")
	        .style("border", "1px solid #000")
	        .style("padding", "5px")
	        .style("visibility", "hidden")
	        .text("a simple tooltip");

	    var suffix = "%s";
	    var words = [
	        %s
	      ];
	    var normalize_size = 90.0 / d3.max(words, function(d){return d.o_size});
	    var cloud = d3.layout.cloud()
	      .words(
	        words
	      )
	      .size([width, height])
	      .timeInterval(10)
	      .font("Impact")
	      .fontSize(function(d) { return d.o_size * normalize_size; })
	      .rotate(function(d) { return ~~(Math.random() * 5) * 10 - 20; })
	      .padding(1)
	      .on("end", draw)
	      .start();

	    function draw(words) {
	      d3.select("body").append("svg")
	        .attr("width", width)
	        .attr("height", height)
	      .append("g")
	        .attr("transform", "translate(" + (width/2) + "," + (height/2) + ")")
	      .selectAll("text")
	        .data(words)
	      .enter().append("text")
	        .style("font-size", function(d) { return d.size + "px"; })
	        .style("font-family", "Impact")
	        .style("fill", function(d, i) { return fill(i); })
	        .attr("text-anchor", "middle")
	        .attr("transform", function(d) {
	          return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
	        })
	        .text(function(d) { return d.text; })
	        .on("mouseover", function(d){
	            d3.select(this).style("opacity", 0.7);
	            tooltip.text(d.o_size + " - " + d.text + " " + suffix);
	            //tooltip.text(d.tip);
	            tooltip.style("visibility", "visible");
	            })
	        .on("mousemove", function(){return tooltip.style("top", (d3.event.pageY-10)+"px").style("left",(d3.event.pageX+10)+"px");})
	        .on("mouseout", function(){
	            d3.select(this).style("opacity", 1.0);
	            tooltip.style("visibility", "hidden");
	            });
	    }
	</script>
	</body>
	'''


	def _js_escape(text):
	    """Escape text for use inside a double-quoted JavaScript string literal."""
	    return (text.replace('\\', '\\\\')
	                .replace('"', '\\"')
	                .replace('\r', '\\r')
	                .replace('\n', '\\n')
	                .replace('</', '<\\/'))


	def d3_tag_cloud(name_sizes, fn, max_size=90, suffix='usd'):
	    """
	    Write a d3 word-cloud HTML page to the file `fn`.

	    name_sizes: sequence of (name, number) pairs; the number is emitted as
	        `o_size` and scaled to a font size by the script in the page.
	    fn: output file path (overwritten).
	    max_size: accepted for compatibility, currently unused; the template
	        hard-codes 90 as the largest font size.
	    suffix: unit text appended to the tooltip.

	    Names, tooltips and the suffix are escaped so that quotes or
	    backslashes in the data cannot break the generated script.
	    """
	    word_fmt = '{text: "%s", size: 1, o_size: %f, tip: "%s"}'
	    words = [
	        word_fmt % (_js_escape(name), size, _js_escape("%s %s" % (size, suffix)))
	        for name, size in name_sizes
	    ]
	    html = d3_fmt % (_js_escape(suffix), ','.join(words))
	    with open(fn, 'w') as fhand:
	        fhand.write(html)


	def tag_cloud(jobs, set_name='job'):
	    """
	    Write two d3 word clouds for the TOPN most common job titles in `jobs`.

	    jobs: iterable of rows exposing `.job` and `.pay`.
	    set_name: prefix of the output files; '<set_name>_frequencies.html'
	        is sized by how often each title occurs, '<set_name>_wages.html'
	        by the median pay of each of those titles.

	    The numbers 1 and 2 are printed as progress markers.
	    """
	    job_counts = Counter([row.job for row in jobs])
	    job_frequencies = job_counts.most_common(TOPN)

	    # ignore rare jobs
	    # Single-argument print() behaves the same on Python 2 and 3.
	    print(1)
	    pop_jobs = set(name for name, _count in job_frequencies)
	    job_pays = {}
	    for row in jobs:
	        if row.job in pop_jobs:
	            job_pays.setdefault(row.job, []).append(row.pay)

	    print(2)
	    job_meds = sorted(
	        [(name, np.median(wages)) for name, wages in job_pays.items()],
	        key=lambda pair: pair[1],
	        reverse=True,
	    )

	    d3_tag_cloud(job_frequencies, set_name + '_frequencies.html', suffix='')
	    d3_tag_cloud(job_meds, set_name + '_wages.html')
	    # html_tag_cloud(...) can be called with the same two lists to get
	    # static-HTML clouds instead (pass count_fmt='${}' for the wages).


	def anomalies(jobs=None, topn=None):
	    """
	    Return the most frequent pay values as (pay, count) pairs.

	    jobs: iterable of rows exposing `.pay`; defaults to the module-level
	        `all_jobs` list filled by load().
	    topn: how many pairs to return; defaults to TOPN.

	    The original body referenced an undefined name `h1b` and threw its
	    result away; the pairs are now returned to the caller.
	    """
	    if jobs is None:
	        jobs = all_jobs
	    if topn is None:
	        topn = TOPN
	    pay_freqs = Counter(row.pay for row in jobs)
	    return pay_freqs.most_common(topn)

	def main():
	    """
	    Load the CSV, print the wage percentiles and write the tag-cloud pages
	    for all jobs ('all') and for the tech subset ('tech').
	    """
	    load()
	    print_percentiles()
	    # all_jobs and tech_jobs are module globals set by load() above.
	    for jobs, label in ((all_jobs, 'all'), (tech_jobs, 'tech')):
	        tag_cloud(jobs, label)


	if __name__ == '__main__':
	    main()

	'''
	trunc = 1.0 / 200
	bottom = wages[len(wages) * trunc]
	top = wages[-len(wages) * trunc]
	#plt.hist(wages, range=[bottom, top])
	span = top - bottom
	n_bins = 100
	bins = np.zeros(n_bins)
	for w in wages:
	percentile_i = int(n_bins * (w - bottom) / span)
	if percentile_i < 0:
	percentile_i = 0
	elif percentile_i >= n_bins:
	percentile_i = n_bins - 1
	bins[percentile_i] += 1

	plt.bar([i * span / n_bins + bottom for i in range(n_bins)], bins, width=span/n_bins)
	'''