Skip to content

Instantly share code, notes, and snippets.

@s-leroux
Created January 26, 2016 14:28
Show Gist options
  • Save s-leroux/2c8e8f527602919e1050 to your computer and use it in GitHub Desktop.
Save s-leroux/2c8e8f527602919e1050 to your computer and use it in GitHub Desktop.
A quick-and-dirty script modeling the recruiting process
#!/usr/bin/python3
# A quick-and-dirty script modeling the recruiting process.
# by Sylvain Leroux
# Each candidate has a set of skills
# and a "personal fit" with the job offer.
# A skill level is known to the recruiter only if it is >= 0.5.
# The "personal fit", in the [0, 1) range, is not known to the recruiter.
# All skill values are in the (0, 1] range.
# The job offer has a set of known required skills and a set of unknown required skills.
# The resulting score, used to judge each algorithm's outcome,
# is the weighted average of skills and personal fit.
import sys
import random
from collections import defaultdict
from collections import Counter
import math
import operator
class Experiment:
    """One simulated recruiting round over a random pool of applicants.

    Each candidate is a dict with three keys: ``'id'`` (its index in
    ``self.candidates``), ``'skills'`` (skill number -> aptitude in
    (0, 1]) and ``'personal_fit'`` (a value in [0, 1)).

    ``self.candidates_by_skill`` maps a skill number to the ids of the
    candidates whose aptitude for that skill is 0.5 or more; it is the
    only skill information the recruiter-side filter consults.
    """

    def __init__(self, nskills,
                 ncandidates, keep,
                 skills_per_candidate):
        """Build the random set of applicants.

        nskills              -- size of the skill universe (skills are
                                numbered 0 .. nskills-1)
        ncandidates          -- number of applicants to generate
        keep                 -- how many applicants a selection keeps
        skills_per_candidate -- number of distinct skills per applicant

        Raises ValueError (from random.sample) when skills_per_candidate
        is larger than nskills.
        """
        self.nskills = nskills
        self.candidates = []
        self.candidates_by_skill = defaultdict(list)
        self.keep = keep

        for i in range(ncandidates):
            candidate = {
                'id': i,
                'skills': {},
                'personal_fit': random.random(),
            }
            for s in random.sample(range(nskills), skills_per_candidate):
                aptitude = 1 - random.random()  # in (0, 1] range
                candidate['skills'][s] = aptitude
                # Recruiters have only access to skills at 50% or above
                if aptitude >= 0.5:
                    self.candidates_by_skill[s].append(i)
            self.candidates.append(candidate)

    def get_score(self, candidate, skills, pweight):
        """Omniscient oracle: compute the real score of a candidate.

        The score is ``(1 - pweight)`` times the mean aptitude over
        `skills` (a missing skill counts as 0) plus ``pweight`` times the
        candidate's personal fit.  With an empty `skills` list the skill
        part is taken as 0 instead of dividing by zero.
        """
        fit_part = pweight * candidate['personal_fit']
        if not skills:
            return fit_part
        owned = candidate['skills']
        score = 0.0
        # Explicit accumulation keeps plain left-to-right float addition.
        for skill in skills:
            if skill in owned:
                score += owned[skill]
        return (1 - pweight) * score / len(skills) + fit_part

    def get_known_score(self, candidate, knownskills):
        """Inaccurate oracle: score based on the recruiter's knowledge.

        Only aptitudes of 0.5 or more are counted; the total is divided
        by the number of `knownskills`.  Returns 0.0 when `knownskills`
        is empty instead of dividing by zero.
        """
        if not knownskills:
            return 0.0
        owned = candidate['skills']
        score = 0.0
        for skill in knownskills:
            if skill in owned and owned[skill] >= .5:
                score += owned[skill]
        return score / len(knownskills)

    def mmnnm(self, applicants):
        """Aggregate mean, max, min and count for a selection sample.

        `applicants` is an iterable of ``(id, candidate, score)`` triples
        as produced by oracle().  For an empty sample, "mean", "max" and
        "min" are NaN and "n" is 0.
        """
        total = 0
        count = 0
        highest = float("NaN")
        lowest = float("NaN")
        for _, _, score in applicants:
            total += score
            count += 1
            # "not <" / "not >" are also true against the initial NaN,
            # so the first score always replaces the placeholders.
            if not score < highest:
                highest = score
            if not score > lowest:
                lowest = score
        return {"mean": total / count if count != 0 else float('NaN'),
                "max": highest,
                "min": lowest,
                "n": count}

    def collect_sample_data(self, tag, nreq, knownskills, nuseful, pweight):
        """Run every selection algorithm on the precomputed population.

        tag         -- key under which the results are stored
        nreq        -- minimum number of required skills a candidate must
                       be listed for to pass the filter (0 disables it)
        knownskills -- number of required skills known to the recruiter
        nuseful     -- number of additional skills used by the real score
        pweight     -- weight of the personal fit in the real score

        The required and useful skills are drawn at random (ValueError
        from random.sample if knownskills + nuseful exceeds the skill
        universe).  Returns ``{tag: {algorithm name: oracle() output}}``.
        """
        result = defaultdict(dict)

        some_skills = random.sample(range(self.nskills), knownskills + nuseful)
        required_skills = some_skills[:knownskills]
        useful_skills = some_skills[knownskills:]
        skills = required_skills + useful_skills

        best_score = self.sample_best_score(self.candidates, skills, pweight)
        methods = (("random choice", self.random_pick),
                   ("best fit", self.best_pick),
                   ("2-pass best", self.twopass_pick))
        for name, method in methods:
            found = method(self.candidates, nreq, required_skills, useful_skills)
            result[tag][name] = self.oracle(found, best_score, skills, pweight)

        return result

    def filter(self, candidates, nreq, required_skills):
        """Filter out candidates listed for fewer than `nreq` required skills.

        When `nreq` is not positive, `candidates` is returned unchanged.
        Otherwise the survivors are returned ordered by decreasing number
        of matching skills (Counter.most_common order).
        """
        if nreq > 0:
            hits = Counter()
            for skill in required_skills:
                # .get() avoids adding empty entries to the defaultdict index
                hits.update(self.candidates_by_skill.get(skill, ()))

            by_id = {candidate['id']: candidate for candidate in candidates}
            candidates = [by_id[cid] for cid, count in hits.most_common()
                          if count >= nreq and cid in by_id]

        return candidates

    def best(self, candidates, sample_size, knownskills):
        """Return the `sample_size` candidates with the highest known score.

        The sort is stable, so candidates with equal known scores keep
        their relative input order.
        """
        ranked = sorted(
            candidates,
            key=lambda candidate: self.get_known_score(candidate, knownskills),
            reverse=True,
        )
        return ranked[:sample_size]

    def sample(self, candidates, sample_size):
        """Return a random sample of `candidates`.

        If `sample_size` is larger than the population size, the
        `candidates` list itself is returned.
        """
        if sample_size > len(candidates):
            return candidates
        return random.sample(candidates, sample_size)

    def oracle(self, candidates, best_score, skills, pweight):
        """Evaluate the real score of each candidate in the list.

        Scores are divided by `best_score` (normalization).  When
        `best_score` is 0 the raw scores are returned, because dividing
        would raise ZeroDivisionError.  Returns a list of
        ``[id, candidate, normalized score]`` entries.
        """
        # Normalization factor. Set to 1 to denormalize data
        nf = 1 / best_score if best_score else 1.0
        return [[candidate['id'],
                 candidate,
                 self.get_score(candidate, skills, pweight) * nf]
                for candidate in candidates]

    def best_pick(self, candidates, nreq, required_skills, useful_skills):
        """Best pick selection algorithm.

        Optionally filter by the number of required skills, then select
        the top `self.keep` candidates based on their known score.
        `useful_skills` is unused; it keeps the signature parallel to
        the other algorithms.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        return self.best(candidates, self.keep, required_skills)

    def random_pick(self, candidates, nreq, required_skills, useful_skills):
        """Random selection algorithm.

        Optionally filter by the number of required skills, then select
        a random sample of `self.keep` of the remaining applicants.
        `useful_skills` is unused.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        return self.sample(candidates, self.keep)

    def twopass_pick(self, candidates, nreq, required_skills, useful_skills):
        """2-pass selection algorithm.

        Optionally filter by the number of required skills, then select
        twice as many applicants as needed by known score.  A second pass
        keeps the top `self.keep` of those, scored on a random subset
        (same size as `required_skills`) of required + useful skills.
        """
        candidates = self.filter(candidates, nreq, required_skills)
        candidates = self.best(candidates, self.keep * 2, required_skills)

        # The second pass has some partial oracle powers
        skills = required_skills + useful_skills
        second_pass_skills = random.sample(skills, len(required_skills))
        return self.best(candidates, self.keep, second_pass_skills)

    def sample_best_score(self, candidates, skills, pweight):
        """Find the best (real) score in a set of applicants.

        Returns 0.0 for an empty set; the result is never below 0.0.
        """
        best_score = 0.0
        for candidate in candidates:
            score = self.get_score(candidate, skills, pweight)
            if score > best_score:
                best_score = score
        return best_score
#print(candidates)
#print(candidates_by_skill)
# ---------------------------------------------------------------------
# Driver: run the experiment many times and print averaged statistics.
# NOTE(review): the source this was recovered from had lost its
# indentation; the nesting below is reconstructed from the data flow --
# confirm against the original gist.
# ---------------------------------------------------------------------

# Experiment parameters
sel = 100                   # number of candidates examined per run
keep = 5                    # number of candidates kept by a selection
nskills = 50                # size of the skill universe
n_required_skills = 10      # required skills known to the recruiter
n_useful_skills = 5         # extra skills that matter for the real score
skills_per_candidate = 20
personal_weight = .3        # weight of the hidden personal fit
min_req = 0                 # NOTE: overwritten by the condition sweep below

print("Each candidate has", skills_per_candidate, "skills among", nskills)
print("Each candidate has a hidden 'personal fit' for the position")
print("'personal fit' will count for ", int(personal_weight*100), "% in the success in that position", sep="")
print("Recruiters are only aware of candidate skills at 50% of above")
print("Recruiters are only aware of ", n_required_skills, "required skills")
print("Recruiters will examine", sel, "candidates")
print("Recruiters will keep at most", keep, "candidates")
print("Recruiters will keep only candidates with", min_req, "required skills or more")
print("Success in that position require", n_useful_skills, "more skills")

# aggregate[row][column][statistic] -> running sum over all runs
aggregate = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
rows = set()    # condition labels seen so far
cols = set()    # algorithm names seen so far
datas = set()   # statistic names seen so far

# Main loop. run experiment 'loops' time and collect results
loops = 2000
for run in range(loops):
    exp = Experiment(nskills, sel, keep,
                     skills_per_candidate)

    # Progress indicator on stderr so it does not pollute the table
    print("\r", run+1, "/"+str(loops), end="", sep="", file=sys.stderr)
    sys.stderr.flush()

    for condition in range(1, 20+1):
        # Alternative sweeps (enable exactly one):
        # n_required_skills = condition
        # message = "{:2d} skills".format(condition)
        # n_useful_skills = condition
        # message = "{:2d} extra useful skills".format(condition)
        min_req = condition
        message = "at least {:2d} skills".format(min_req)
        # personal_weight = condition*5.0/100.0
        # message = "w = {:4.0%}".format(personal_weight)

        result_data = exp.collect_sample_data(message, min_req,
                                              n_required_skills, n_useful_skills,
                                              personal_weight)

        for rowname, row in result_data.items():
            rows.add(rowname)
            for colname, col in row.items():
                cols.add(colname)
                # Empty selections are skipped: their statistics are NaN
                # and would poison the running sums.
                if col:
                    aggregate[rowname][colname]['count'] += 1
                    for k, v in exp.mmnnm(col).items():
                        datas.add(k)
                        aggregate[rowname][colname][k] += v

print()

#
# Formatted output
#
need_header = True
for row in sorted(rows):
    if need_header:
        need_header = False

        # First header line: one algorithm name per column group
        print("{:28s}".format(""), end=", ")
        for ch in sorted(cols):
            print("{:6s}, {:27s},".format("", ch), ","*(len(datas)-1), sep="", end=" ")
        print()

        # Second header line: statistic names under each algorithm
        print("{:28s}".format(""), end=", ")
        for ch in sorted(cols):
            print("{:6s}".format("cnt"), end=", ")
            for k in sorted(datas):
                print("{:6s}".format(k), end=", ")
        print()

    print("{:28}".format(row), end=", ")
    for col in sorted(cols):
        data = aggregate[row][col]
        count = aggregate[row][col]['count']
        print("{:6d}".format(int(count)), end=", ")
        for k in sorted(datas):
            if count:
                # Average of the per-run statistic over the runs counted
                print("{:6.3f}".format(data[k]/count), end=", ")
            else:
                print("{:6.3s}".format(""), end=", ")
    print()

print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment