Skip to content

Instantly share code, notes, and snippets.

@ubershmekel
Created January 27, 2013 20:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ubershmekel/4650376 to your computer and use it in GitHub Desktop.
Save ubershmekel/4650376 to your computer and use it in GitHub Desktop.
h1b wages analysis
"""
Analyze USA h1b salaries
data from http://www.foreignlaborcert.doleta.gov/quarterlydata.cfm
specifically:
http://www.foreignlaborcert.doleta.gov/pdf/quarter_2_2012/PW_FY2012_Q2.csv
"""
import csv
import json
import math
import os
import re
from collections import Counter
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
# Number of most-common entries requested by tag_cloud() and anomalies().
TOPN = 20
# Module-level memo used by xkcd_colors() when the caller supplies no cache.
_XKCD_COLOR_CACHE = []


def xkcd_colors(cache=None):
    """
    Return the xkcd survey colours that saturated() does not reject.

    http://blog.xkcd.com/2010/05/03/color-survey-results/
    http://xkcd.com/color/rgb.txt

    The colours are read from ``rgb.txt`` in the current working directory
    and returned as 6-digit lower-case hex strings without the leading '#'.

    cache -- optional list used as the memo.  When it is non-empty it is
             returned as-is; when empty it is filled from the file.  When
             omitted, a private module-level list is used, so the file is
             parsed at most once per process (as long as it yields colours).
    """
    if cache is None:
        cache = _XKCD_COLOR_CACHE
    if cache:
        return cache
    with open('rgb.txt') as fhand:
        text = fhand.read()
    # Exactly six hex digits: shorter runs would make saturated() fail on an
    # empty channel and longer runs would be silently truncated.
    colors = re.findall(r'#([0-9a-f]{6})(?![0-9a-f])', text)
    cache.extend(color for color in colors if not saturated(color))
    return cache
def saturated(color, threshold=510):
    """
    Tell whether a hex colour's channel sum exceeds ``threshold``.

    color     -- 'rrggbb' hex string; one leading '#' (or more) is ignored.
    threshold -- largest r + g + b sum still considered acceptable.  The
                 maximum possible sum is 255 * 3 = 765; the default is 510.

    Returns True when r + g + b > threshold, otherwise False.
    Raises ValueError when a channel is not valid hexadecimal.
    """
    color = color.lstrip('#')
    r, g, b = (int(color[start:start + 2], 16) for start in (0, 2, 4))
    return r + g + b > threshold
# Substrings that mark a "SOC title - job title" pair as a tech job.
_TECH_JOB_RE = re.compile(r'(program|software|computer)')


def load(path='PW_FY2012_Q2.csv'):
    """
    Read the prevailing-wage CSV and populate the module globals.

    path -- CSV file to read; defaults to the FY2012 Q2 file name in the
            current working directory.

    Sets two globals:
      all_jobs  -- list of Row(pay, state, field, job) namedtuples, one per
                   CSV row whose wage parses as a number >= 10000.  The
                   three text fields are stripped and lower-cased.
      tech_jobs -- the rows of all_jobs whose field or job title contains
                   'program', 'software' or 'computer', sorted ascending
                   (tuple order, so primarily by pay).

    Prints the length of both lists.  Raises ValueError when a required
    column is missing from the header.
    """
    global tech_jobs, all_jobs
    Row = namedtuple('Row', 'pay, state, field, job')
    all_jobs = []
    # The with-block guarantees the file handle is closed.
    with open(path) as fhand:
        reader = csv.reader(fhand)
        header = next(reader)
        wage_i = header.index('PWD_WAGE_RATE')
        field_i = header.index('PWD_SOC_TITLE')
        job_i = header.index('PW_JOB_TITLE')
        # PRIMARY_WORKSITE_STATE or EMPLOYER_STATE ?
        state_i = header.index('PRIMARY_WORKSITE_STATE')
        for row in reader:
            field = row[field_i]
            job = row[job_i]
            state = row[state_i]
            try:
                pay = float(row[wage_i])
            except ValueError:
                # not a number
                continue
            # Written as "not >=" so that NaN is rejected too.
            if not pay >= 10000:
                # under min wage, a bug
                continue
            all_jobs.append(Row(pay, state.strip().lower(),
                                field.strip().lower(), job.strip().lower()))
    print('len jobs %d' % len(all_jobs))
    tech_jobs = sorted(
        row for row in all_jobs
        if _TECH_JOB_RE.search(row.field + ' - ' + row.job)
    )
    print('len tech jobs %d' % len(tech_jobs))
#data = np.array(new_data)
#wages = data[:,1].astype(float)
def print_ps(wages, percentiles=(75, 50, 25, 10, 5, 1, 0.1)):
    """
    Print, for each percentile p, the wage that p% of entries exceed.

    wages       -- sequence of yearly wages, expected in ascending order
                   (state_ps() passes pays taken from the sorted tech_jobs).
    percentiles -- percentages to report; defaults to the original list.

    Nothing is printed for an empty sequence.
    """
    count = len(wages)
    # len() instead of truthiness so array-like inputs work as well.
    if count == 0:
        return
    for p in percentiles:
        p_i = int(count * (100 - p) / 100.0)
        # Clamp so p <= 0 or p > 100 cannot index outside the sequence.
        p_i = min(max(p_i, 0), count - 1)
        min_wage = wages[p_i]
        # 1000.0 keeps the division exact for integer wages on Python 2.
        print('%s%% earn more than $%0.0fK' % (p, min_wage / 1000.0))
def state_ps(state=None):
    """
    Print the tech-job wage percentiles, optionally for a single state.

    state -- state name to filter on, or None for every row in tech_jobs.
             It is stripped and lower-cased before comparison, matching the
             normalisation load() applies to each row, so 'New York' and
             'new york' select the same rows.

    Prints the state (when given), the number of matching wages, and then
    the percentile lines produced by print_ps().
    """
    if state is None:
        wages = [row.pay for row in tech_jobs]
    else:
        wanted = state.strip().lower()
        print('%s state' % state)
        wages = [row.pay for row in tech_jobs
                 if row.state.lower().strip() == wanted]
    print(len(wages))
    print_ps(wages)
def print_percentiles():
    """Print the percentile report for all tech jobs, then for three states."""
    # None selects every state; the rest are reported in this fixed order.
    for region in (None, 'new york', 'florida', 'california'):
        state_ps(region)
#print '-------'
#for i in tech_jobs[int(len(tech_jobs) * .999):]:
# print i
# Named palettes, each a tuple of five (r, g, b) triples.
# Only referenced from commented-out code in html_tag_cloud().
COLOR_SCHEMES = {
'oldschool': ((59,76,76), (125,140,116), (217,175,95), (127,92,70), (51,36,35)),
'citrus': ((34,51,49), (70,102,66), (153,142,61), (229,156,44), (255,116,37)),
'goldfish': ((229,106,0), (204,199,148), (153,145,124), (88,89,86), (48,49,51)),
'audacity': ((181,40,65), (255,192,81), (255,137,57), (232,95,77), (89,0,81)),
}
import random
# Stylesheet written at the top of every page produced by html_tag_cloud().
DEFAULT_STYLE = '''
.tag_item {
text-decoration: none;
font-weight: bold;
white-space: nowrap;
}
.tag_item:hover {
background-color: #eee;
}
'''


def html_tag_cloud(name_sizes, fn, max_size=100, min_size=5, css_class="tag_item", style=DEFAULT_STYLE, count_fmt="{}", colors=None):
    """
    Write a plain-HTML tag cloud to the file ``fn``.

    name_sizes -- iterable of (name, size) pairs; size is any number.
    fn         -- output path; the file is overwritten.
    max_size   -- font size in px given to the largest entry.
    min_size   -- smallest font size in px; scaled sizes below it are raised
                  to this value so no tag is written at 0px.
    css_class  -- class attribute put on every tag.
    style      -- CSS text emitted inside a <style> element.
    count_fmt  -- str.format() template for the title (tooltip) text.
    colors     -- optional sequence of hex colours without '#'; when omitted
                  the colours come from xkcd_colors().

    Each tag gets a colour picked at random from the palette.
    NOTE(review): names are written into the HTML without escaping.
    """
    droplet_template = '''<a href="" class="{css_class}" title="{count}" style="font-size: {size}px; color: #{color};">{text}</a> '''
    name_sizes = list(name_sizes)
    # Work in floats: scaling an integer array in place by a float is
    # rejected by NumPy's casting rules.
    sizes = np.array([size for _, size in name_sizes], dtype=float)
    if sizes.size:
        peak = sizes.max()
        if peak > 0:
            sizes *= 1.0 * max_size / peak
        sizes = np.maximum(sizes, min_size)
    sizes = sizes.astype(int)
    # Only load the xkcd palette when there is something to colour.
    if colors is None and name_sizes:
        colors = xkcd_colors()
    with open(fn, 'w') as fhand:
        fhand.write('<style>%s</style>' % style)
        for (name, count), size in zip(name_sizes, sizes):
            line = droplet_template.format(
                size=int(size),
                text=name,
                count=count_fmt.format(count),
                color=random.choice(colors),
                css_class=css_class,
            )
            fhand.write(line)
# HTML page template used by d3_tag_cloud().  It is filled with the
# %-operator and has exactly two %s placeholders, in this order: the tooltip
# suffix and the comma-separated JavaScript word objects.  Because of the
# %-formatting, any literal percent sign added here must be written as %%.
d3_fmt = '''
<body>
<style>
svg {
cursor:default;
}
</style>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="d3.layout.cloud.js"></script>
<div id="all_frequencies"></div>
<div id="all_wages"></div>
<script>
var fill = d3.scale.category20();
var width = 600;
var height = 600;
var tooltip = d3.select("body")
.append("div")
.style("position", "absolute")
.style("z-index", "10")
.style("background-color", "#fff")
.style("border", "1px solid #000")
.style("padding", "5px")
.style("visibility", "hidden")
.text("a simple tooltip");
var suffix = "%s";
var words = [
%s
];
var normalize_size = 90.0 / d3.max(words, function(d){return d.size});
var cloud = d3.layout.cloud()
.words(
words
)
.size([width, height])
.timeInterval(10)
.font("Impact")
.fontSize(function(d) { return d.o_size * normalize_size; })
.rotate(function(d) { return ~~(Math.random() * 5) * 10 - 20; })
.padding(1)
.on("end", draw)
.start();
function draw(words) {
d3.select("body").append("svg")
.attr("width", width)
.attr("height", height)
.append("g")
.attr("transform", "translate(" + (width/2) + "," + (height/2) + ")")
.selectAll("text")
.data(words)
.enter().append("text")
.style("font-size", function(d) { return d.size + "px"; })
.style("font-family", "Impact")
.style("fill", function(d, i) { return fill(i); })
.attr("text-anchor", "middle")
.attr("transform", function(d) {
return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
})
.text(function(d) { return d.text; })
.on("mouseover", function(d){
d3.select(this).style("opacity", 0.7);
tooltip.text(d.o_size + " - " + d.text + " " + suffix);
//tooltip.text(d.tip);
tooltip.style("visibility", "visible");
})
.on("mousemove", function(){return tooltip.style("top", (event.pageY-10)+"px").style("left",(event.pageX+10)+"px");})
.on("mouseout", function(){
d3.select(this).style("opacity", 1.0);
tooltip.style("visibility", "hidden");
});
}
</script>
</body>
'''
def d3_tag_cloud(name_sizes, fn, max_size=90, suffix='usd', template=None):
    """
    Write a d3 word-cloud HTML page to the file ``fn``.

    name_sizes -- iterable of (name, size) pairs; size must be numeric.
    fn         -- output path; the file is overwritten.
    max_size   -- accepted for backward compatibility but not used here.
    suffix     -- unit text shown after the value in the tooltip.
    template   -- %-format string with two %s placeholders (suffix, then the
                  word list); defaults to the module-level d3_fmt.

    Names, tips and the suffix are emitted as escaped JavaScript string
    literals, so quotes or backslashes in a job title cannot break the
    generated script.
    """
    if template is None:
        template = d3_fmt

    def js_string(value):
        # json.dumps yields a quoted, escaped literal that is valid
        # JavaScript; "</" is split so a value can never terminate the
        # surrounding <script> element.
        return json.dumps(value).replace('</', '<\\/')

    word_fmt = '{text: %s, size: 1, o_size: %f, tip: %s}'
    words_str = ','.join(
        word_fmt % (js_string(name), size, js_string('%s %s' % (size, suffix)))
        for name, size in name_sizes
    )
    # The template already supplies the quotes around the suffix, so only
    # the escaped inner text is inserted.
    html = template % (js_string(suffix)[1:-1], words_str)
    with open(fn, 'w') as fhand:
        fhand.write(html)
def tag_cloud(jobs, set_name='job'):
    """
    Write two d3 tag-cloud pages for the TOPN most frequent job titles.

    jobs     -- iterable of rows exposing ``.job`` and ``.pay``.
    set_name -- prefix for the two output files,
                '<set_name>_frequencies.html' (sized by number of rows) and
                '<set_name>_wages.html' (sized by median pay).

    Prints the progress markers 1 and 2 along the way.
    """
    top_by_count = Counter([entry.job for entry in jobs]).most_common(TOPN)
    # ignore rare jobs
    print(1)
    popular = set(title for title, _ in top_by_count)
    pays_by_job = {}
    for entry in jobs:
        if entry.job in popular:
            pays_by_job.setdefault(entry.job, []).append(entry.pay)
    print(2)
    medians = [(title, np.median(pays)) for title, pays in pays_by_job.items()]
    # Highest median pay first; list.sort is stable, like sorted().
    medians.sort(key=lambda pair: pair[1], reverse=True)
    d3_tag_cloud(top_by_count, set_name + '_frequencies.html', suffix='')
    d3_tag_cloud(medians, set_name + '_wages.html')
#from pytagcloud import create_tag_image, make_tags
#from pytagcloud.lang.counter import get_tag_counts
#html_tag_cloud(job_frequencies, set_name + '_frequencies.html')
#html_tag_cloud(job_meds, set_name + '_wages.html', count_fmt='${}')
#YOUR_TEXT = "A tag cloud is a visual representation for text data, typically\
#used to depict keyword metadata on websites, or to visualize free form text."
#tags = make_tags(get_tag_counts(YOUR_TEXT), maxsize=120)
#print 3
#tags = make_tags(job_frequencies, maxsize=80)
#create_tag_image(tags, 'job_frequencies.png', size=(900, 600), fontname='Lobster')
#print 4
#tags = make_tags(job_meds, maxsize=80)
#create_tag_image(tags, 'job_median_pay.png', size=(900, 600), fontname='Lobster')
def anomalies(jobs=None, topn=None):
    """
    Return the most frequently occurring pay values.

    jobs -- iterable of rows exposing ``.pay``; defaults to the global
            all_jobs filled in by load().
    topn -- how many values to return; defaults to the module constant TOPN.

    Returns a list of (pay, count) pairs, most common first.  The original
    looked the rows up through an undefined name (``h1b``) and threw the
    result away.
    """
    if jobs is None:
        jobs = all_jobs
    if topn is None:
        topn = TOPN
    pay_freqs = Counter(row.pay for row in jobs)
    return pay_freqs.most_common(topn)
def main():
    """Load the CSV, print the percentile report, then write both tag-cloud sets."""
    load()
    print_percentiles()
    # Same two calls as before: every job first, then the tech subset.
    for jobs, label in ((all_jobs, 'all'), (tech_jobs, 'tech')):
        tag_cloud(jobs, label)


if __name__ == '__main__':
    main()
# Disabled wage-histogram code kept as a bare string literal: it is
# evaluated as a string expression and never executed.
'''
trunc = 1.0 / 200
bottom = wages[len(wages) * trunc]
top = wages[-len(wages) * trunc]
#plt.hist(wages, range=[bottom, top])
span = top - bottom
n_bins = 100
bins = np.zeros(n_bins)
for w in wages:
percentile_i = int(n_bins * (w - bottom) / span)
if percentile_i < 0:
percentile_i = 0
elif percentile_i >= n_bins:
percentile_i = n_bins - 1
bins[percentile_i] += 1
plt.bar([i * span / n_bins + bottom for i in range(n_bins)], bins, width=span/n_bins)
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment