Created
January 26, 2016 14:28
-
-
Save s-leroux/2c8e8f527602919e1050 to your computer and use it in GitHub Desktop.
A quick-and-dirty script modeling the recruiting process
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# A quick-and-dirty script modeling the recruiting process. | |
# by Sylvain Leroux | |
# Each candidate has a set of skills
# and a "personal fit" with the job offer.
# The skill level is known by the recruiter if >= 0.5
# The "personal fit", in the [0, 1) range, is not known to the recruiter
# All skill values are in the (0, 1] range
# The job offer has a set of known required skills and a set of unknown required skills
# The resulting score, used to judge the algorithms' outcome,
# is the weighted average of skills and personal fit.
import sys | |
import random | |
from collections import defaultdict | |
from collections import Counter | |
import math | |
import operator | |
class Experiment:
    """One simulated recruiting round over a randomly generated applicant pool.

    Each candidate is a dict with three keys:

    * ``'id'``           -- index of the candidate in ``self.candidates``
    * ``'skills'``       -- ``{skill index: aptitude}``, aptitude in (0, 1]
    * ``'personal_fit'`` -- value in [0, 1), never used by the selection
      algorithms (only by the omniscient oracle)

    ``self.candidates_by_skill`` maps a skill index to the ids of the
    candidates whose aptitude for that skill is visible to the recruiter,
    i.e. at least ``KNOWN_SKILL_THRESHOLD``.
    """

    # Aptitude at or above which a skill is visible to the recruiter.
    KNOWN_SKILL_THRESHOLD = 0.5

    def __init__(self, nskills, ncandidates, keep, skills_per_candidate):
        """Initialize a new experiment by building the random set of applicants.

        nskills              -- size of the skill universe (skills are
                                numbered 0 .. nskills-1)
        ncandidates          -- number of applicants to generate
        keep                 -- maximum number of applicants kept by a
                                selection algorithm
        skills_per_candidate -- number of distinct skills drawn for each
                                applicant; ``random.sample`` raises
                                ValueError if it exceeds ``nskills``
        """
        self.nskills = nskills
        self.candidates = []
        self.candidates_by_skill = defaultdict(list)
        self.keep = keep

        for i in range(ncandidates):
            candidate = {
                'id': i,
                'skills': {},
                'personal_fit': random.random(),
            }
            for s in random.sample(range(nskills), skills_per_candidate):
                aptitude = 1 - random.random()  # in (0, 1] range
                candidate['skills'][s] = aptitude
                # Recruiters only have access to skills at the threshold
                # or above, so only those are indexed.
                if aptitude >= self.KNOWN_SKILL_THRESHOLD:
                    self.candidates_by_skill[s].append(i)
            self.candidates.append(candidate)

    def get_score(self, candidate, skills, pweight):
        """Omniscient oracle: compute the real score of a candidate.

        The score is ``(1 - pweight) * mean aptitude over skills
        + pweight * personal_fit``; a skill the candidate does not have
        counts as 0.  With an empty ``skills`` list the skill part is 0
        instead of raising ZeroDivisionError.
        """
        owned = candidate['skills']
        total = 0.0
        for skill in skills:
            if skill in owned:
                total += owned[skill]
        skill_part = (1 - pweight) * total / len(skills) if skills else 0.0
        return skill_part + pweight * candidate['personal_fit']

    def get_known_score(self, candidate, knownskills):
        """Inaccurate oracle: compute a score from the recruiter's knowledge.

        Only aptitudes at or above ``KNOWN_SKILL_THRESHOLD`` are counted;
        the total is divided by ``len(knownskills)``.  Returns 0.0 when
        ``knownskills`` is empty instead of raising ZeroDivisionError.
        """
        if not knownskills:
            return 0.0
        owned = candidate['skills']
        total = 0.0
        for skill in knownskills:
            if skill in owned and owned[skill] >= self.KNOWN_SKILL_THRESHOLD:
                total += owned[skill]
        return total / len(knownskills)

    def mmnnm(self, applicants):
        """Aggregate mean, max and min data for a selection sample.

        ``applicants`` is an iterable of 3-item rows whose last item is the
        score (the shape produced by ``oracle``).  Returns a dict with the
        keys ``"mean"``, ``"max"``, ``"min"`` and ``"n"``; the three
        statistics are NaN when the sample is empty.
        """
        scores = [score for _, _, score in applicants]
        if not scores:
            nan = float("NaN")
            return {"mean": nan, "max": nan, "min": nan, "n": 0}
        # Plain left-to-right accumulation, kept explicit so the mean does
        # not depend on the summation algorithm of the builtin.
        total = 0
        for score in scores:
            total += score
        return {"mean": total / len(scores),
                "max": max(scores),
                "min": min(scores),
                "n": len(scores)}

    def collect_sample_data(self, tag, nreq, knownskills, nuseful, pweight):
        """Run the selection algorithms on the precomputed applicants population.

        Draws ``knownskills + nuseful`` distinct skills at random: the first
        ``knownskills`` are the required skills the recruiter knows about,
        the remaining ones are the useful skills only the oracle knows.

        tag     -- key under which the results are stored
        nreq    -- minimum number of visible required skills a candidate
                   must have to pass the filter (0 disables filtering)
        pweight -- weight of the personal fit in the real score

        Returns ``{tag: {algorithm name: rows}}`` where ``rows`` is the
        output of ``oracle`` for the candidates that algorithm selected.
        """
        result = defaultdict(dict)
        some_skills = random.sample(range(self.nskills), knownskills + nuseful)
        required_skills = some_skills[:knownskills]
        useful_skills = some_skills[knownskills:]
        skills = required_skills + useful_skills
        best_score = self.sample_best_score(self.candidates, skills, pweight)

        algorithms = (("random choice", self.random_pick),
                      ("best fit", self.best_pick),
                      ("2-pass best", self.twopass_pick))
        for name, method in algorithms:
            found = method(self.candidates, nreq, required_skills, useful_skills)
            result[tag][name] = self.oracle(found, best_score, skills, pweight)
        return result

    def filter(self, candidates, nreq, required_skills):
        """Filter out unwanted candidates.

        When ``nreq`` is positive, keep only the candidates having at least
        ``nreq`` of the required skills visible to the recruiter, ordered
        by decreasing number of such skills.  When ``nreq`` is 0 or less
        the input list is returned unchanged.
        """
        if nreq > 0:
            hits = Counter()
            for skill in required_skills:
                # .get() avoids inserting empty entries into the index.
                hits.update(self.candidates_by_skill.get(skill, ()))
            by_id = {candidate['id']: candidate for candidate in candidates}
            candidates = [by_id[cid] for cid, count in hits.most_common()
                          if count >= nreq and cid in by_id]
        return candidates

    def best(self, candidates, sample_size, knownskills):
        """Return the top ``sample_size`` candidates based on known data.

        Candidates are ranked by decreasing ``get_known_score``; the sort
        is stable, so ties keep their input order.
        """
        ranked = sorted(
            candidates,
            key=lambda candidate: self.get_known_score(candidate, knownskills),
            reverse=True)
        return ranked[:sample_size]

    def sample(self, candidates, sample_size):
        """Return a random sample of the candidates.

        When ``sample_size`` is larger than the population size, all the
        applicants are returned -- as a new list, so callers never get an
        alias of the list they passed in (possibly ``self.candidates``).
        """
        if sample_size > len(candidates):
            return list(candidates)
        return random.sample(candidates, sample_size)

    def oracle(self, candidates, best_score, skills, pweight):
        """Evaluate the real score of each candidate in the list.

        The oracle has full knowledge to perform that task.  Scores are
        divided by ``best_score`` to normalize them; when ``best_score`` is
        0 the scores are left denormalized instead of dividing by zero.

        Returns a list of ``[id, candidate, normalized score]`` rows.
        """
        nf = 1 / best_score if best_score else 1.0  # normalization factor
        return [[candidate['id'],
                 candidate,
                 self.get_score(candidate, skills, pweight) * nf]
                for candidate in candidates]

    def best_pick(self, candidates, nreq, required_skills, useful_skills):
        """Best pick selection algorithm.

        Optionally filter by the number of required skills, then select
        the top-most ``self.keep`` candidates based on their known score.
        ``useful_skills`` is unused; it keeps the signature parallel with
        the other selection algorithms.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        return self.best(candidates, self.keep, required_skills)

    def random_pick(self, candidates, nreq, required_skills, useful_skills):
        """Random selection algorithm.

        Optionally filter by the number of required skills, then select a
        random sample of at most ``self.keep`` of the remaining applicants.
        ``useful_skills`` is unused; it keeps the signature parallel with
        the other selection algorithms.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        return self.sample(candidates, self.keep)

    def twopass_pick(self, candidates, nreq, required_skills, useful_skills):
        """2-pass selection algorithm.

        Optionally filter by the number of required skills, then select
        twice as many applicants as needed with the top-most known score.
        A second pass keeps the top-most ``self.keep`` candidates based on
        a possibly different set of known skills: a random draw of
        ``len(required_skills)`` skills among required and useful ones.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        candidates = self.best(candidates, self.keep * 2, required_skills)
        # The second pass has some partial oracle powers.
        skills = required_skills + useful_skills
        second_pass_skills = random.sample(skills, len(required_skills))
        return self.best(candidates, self.keep, second_pass_skills)

    def sample_best_score(self, candidates, skills, pweight):
        """Find the best (real) score in a set of applicants.

        Returns 0.0 for an empty set; the result is never below 0.0.
        """
        return max([0.0] + [self.get_score(candidate, skills, pweight)
                            for candidate in candidates])
#print(candidates) | |
#print(candidates_by_skill) | |
# ---------------------------------------------------------------------
# Simulation parameters
# ---------------------------------------------------------------------
sel = 100                  # number of candidates examined per experiment
keep = 5                   # maximum number of candidates kept
nskills = 50               # size of the skill universe
n_required_skills = 10     # required skills the recruiter knows about
n_useful_skills = 5        # extra skills only the oracle knows about
skills_per_candidate = 20
personal_weight = .3       # weight of the hidden 'personal fit'
min_req = 0                # overwritten by the condition sweep below

# Summary of the setup. Note that min_req is reported with its initial
# value, before the sweep below changes it.
print("Each candidate has", skills_per_candidate, "skills among", nskills)
print("Each candidate has a hidden 'personal fit' for the position")
print("'personal fit' will count for ", int(personal_weight*100),
      "% in the success in that position", sep="")
print("Recruiters are only aware of candidate skills at 50% of above")
print("Recruiters are only aware of ", n_required_skills, "required skills")
print("Recruiters will examine", sel, "candidates")
print("Recruiters will keep at most", keep, "candidates")
print("Recruiters will keep only candidates with", min_req,
      "required skills or more")
print("Success in that position require", n_useful_skills, "more skills")

# aggregate[row][column][statistic] accumulates sums over all the runs;
# the 'count' statistic is the number of non-empty selections summed.
aggregate = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
rows = set()    # row labels seen (one per condition)
cols = set()    # column labels seen (one per selection algorithm)
datas = set()   # statistic names seen

# ---------------------------------------------------------------------
# Main loop. Run the experiment 'loops' times and collect the results.
# ---------------------------------------------------------------------
loops = 2000
for run in range(loops):
    exp = Experiment(nskills, sel, keep, skills_per_candidate)

    # Progress indicator, rewritten in place on stderr.
    sys.stderr.write("\r" + str(run + 1) + "/" + str(loops))
    sys.stderr.flush()

    for condition in range(1, 20 + 1):
        # Alternative sweeps (enable one instead of the min_req sweep):
        #   n_required_skills = condition
        #   message = "{:2d} skills".format(condition)
        #
        #   n_useful_skills = condition
        #   message = "{:2d} extra useful skills".format(condition)
        #
        #   personal_weight = condition*5.0/100.0
        #   message = "w = {:4.0%}".format(personal_weight)
        min_req = condition
        message = "at least {:2d} skills".format(min_req)

        result_data = exp.collect_sample_data(
            message, min_req,
            n_required_skills, n_useful_skills,
            personal_weight)

        for rowname, by_algorithm in result_data.items():
            rows.add(rowname)
            for colname, selection in by_algorithm.items():
                cols.add(colname)
                if not selection:
                    # Empty selections contribute nothing to the sums.
                    continue
                cell = aggregate[rowname][colname]
                cell['count'] += 1
                for stat_name, stat_value in exp.mmnnm(selection).items():
                    datas.add(stat_name)
                    cell[stat_name] += stat_value
print()

# ---------------------------------------------------------------------
# Formatted output: one comma-separated line per condition, preceded by
# a two-line header emitted before the first row.
# ---------------------------------------------------------------------
need_header = True
for row_label in sorted(rows):
    if need_header:
        need_header = False

        # First header line: the algorithm names.
        pieces = ["{:28s}".format(""), ", "]
        for heading in sorted(cols):
            pieces.append("{:6s}, {:27s},".format("", heading))
            pieces.append("," * (len(datas) - 1))
            pieces.append(" ")
        print("".join(pieces))

        # Second header line: the statistic names under each algorithm.
        pieces = ["{:28s}".format(""), ", "]
        for heading in sorted(cols):
            pieces.append("{:6s}".format("cnt") + ", ")
            for stat_name in sorted(datas):
                pieces.append("{:6s}".format(stat_name) + ", ")
        print("".join(pieces))

    pieces = ["{:28}".format(row_label), ", "]
    for col_label in sorted(cols):
        cell = aggregate[row_label][col_label]
        count = cell['count']
        pieces.append("{:6d}".format(int(count)) + ", ")
        for stat_name in sorted(datas):
            if count:
                # Average of the per-run statistic over non-empty runs.
                pieces.append("{:6.3f}".format(cell[stat_name] / count) + ", ")
            else:
                pieces.append("{:6.3s}".format("") + ", ")
    print("".join(pieces))
print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment