s-leroux/recruiting-model.py

## recruiting-model.py
#!/usr/bin/python3

# A quick-and-dirty script modeling the recruiting process.
# by Sylvain Leroux

# Each candidate has a set of skills
# and a "personal fit" with the job offer.
# A skill level is known by the recruiter only if it is >= 0.5
# The "personal fit", in the [0,1) range, is not known to the recruiter
# All skill values are in the (0,1] range
# The job offer has a set of known required skills and a set of unknown required skills

# The resulting score, used to judge the algorithms' outcome,
# is the weighted average of skills and personal fit.

import sys
import random
from collections import defaultdict
from collections import Counter
import math
import operator


class Experiment:
    """A random applicant pool plus the selection strategies evaluated on it.

    Each candidate is a dict with three keys: ``'id'`` (its index in
    ``self.candidates``), ``'skills'`` (skill number -> aptitude in (0, 1])
    and ``'personal_fit'`` (a value in [0, 1) the recruiter never sees).
    ``self.candidates_by_skill`` maps a skill number to the ids of the
    candidates whose aptitude for that skill is at least 0.5, which is the
    only skill information a recruiter has access to.
    """

    def __init__(self, nskills,
                    ncandidates, keep,
                    skills_per_candidate):
        """Build the random set of applicants.

        nskills              -- number of distinct skills (numbered from 0)
        ncandidates          -- number of applicants to generate
        keep                 -- how many applicants each strategy retains
        skills_per_candidate -- number of distinct skills given to each applicant

        Raises ValueError (from random.sample) if skills_per_candidate is
        larger than nskills.
        """
        self.nskills = nskills
        self.candidates = []
        self.candidates_by_skill = defaultdict(list)
        self.keep = keep

        for i in range(ncandidates):
            candidate = {
                'id': i,
                'skills': {},
                'personal_fit': random.random(),
            }
            for s in random.sample(range(nskills), skills_per_candidate):
                aptitude = 1 - random.random()  # in (0, 1] range

                candidate['skills'][s] = aptitude
                if aptitude >= 0.5:  # Recruiters only see skills at 50% or above
                    self.candidates_by_skill[s].append(i)
            self.candidates.append(candidate)

    def get_score(self, candidate, skills, pweight):
        """Omniscient oracle: return the real score of a candidate.

        The score is ``(1-pweight)`` times the mean aptitude over ``skills``
        (a skill the candidate lacks counts as 0) plus ``pweight`` times the
        candidate's personal fit. An empty ``skills`` list contributes a skill
        part of 0 instead of raising ZeroDivisionError.
        """
        owned = candidate['skills']
        total = 0.0
        for skill in skills:
            total += owned.get(skill, 0.0)

        # Same evaluation order as the historical formula, so results are
        # bit-for-bit reproducible for a given random seed.
        skill_part = (1-pweight)*total/len(skills) if skills else 0.0

        return skill_part + pweight*candidate['personal_fit']

    def get_known_score(self, candidate, knownskills):
        """Inaccurate oracle: return the score as the recruiter sees it.

        Only aptitudes >= 0.5 are visible; they are summed and divided by the
        number of known skills. Returns 0.0 when ``knownskills`` is empty
        instead of raising ZeroDivisionError.
        """
        if not knownskills:
            return 0.0

        owned = candidate['skills']
        total = 0.0
        for skill in knownskills:
            aptitude = owned.get(skill, 0.0)
            if aptitude >= .5:
                total += aptitude

        return total/len(knownskills)

    def mmnnm(self, applicants):
        """Aggregate mean, max and min of the scores in a selection sample.

        ``applicants`` is an iterable of ``(id, candidate, score)`` triples.
        Returns a dict with keys ``"mean"``, ``"max"``, ``"min"`` and ``"n"``;
        the first three are NaN when the sample is empty.
        """
        scores = [score for _, _, score in applicants]
        if not scores:
            nan = float("NaN")
            return {"mean": nan, "max": nan, "min": nan, "n": 0}

        total = 0
        for score in scores:
            total += score

        return {"mean": total/len(scores),
                "max": max(scores),
                "min": min(scores),
                "n": len(scores)}

    def collect_sample_data(self, tag, nreq, knownskills, nuseful, pweight):
        """Run every selection algorithm on the precomputed population.

        Draws ``knownskills + nuseful`` distinct skills at random: the first
        ``knownskills`` are the required skills known to the recruiter, the
        rest are useful skills only the oracle knows about. ``nreq`` is the
        minimum number of required skills passed to the filter.

        Returns ``{tag: {algorithm name: [[id, candidate, score], ...]}}``
        where each score is the real score divided by the best real score in
        the whole population.
        """
        result = defaultdict(dict)

        some_skills = random.sample(range(self.nskills), knownskills+nuseful)

        required_skills = some_skills[:knownskills]
        useful_skills = some_skills[knownskills:]
        skills = required_skills+useful_skills
        best_score = self.sample_best_score(self.candidates, skills, pweight)

        strategies = (("random choice", self.random_pick),
                      ("best fit", self.best_pick),
                      ("2-pass best", self.twopass_pick))
        for name, method in strategies:
            found = method(self.candidates, nreq, required_skills, useful_skills)
            result[tag][name] = self.oracle(found, best_score, skills, pweight)

        return result

    def filter(self, candidates, nreq, required_skills):
        """Filter out candidates with fewer than ``nreq`` visible required skills.

        A skill is visible when the candidate is listed for it in
        ``self.candidates_by_skill``. With ``nreq <= 0`` the input list is
        returned unchanged. Otherwise the survivors are returned ordered by
        decreasing number of visible required skills.
        """
        if nreq <= 0:
            return candidates

        hits = Counter()
        for skill in required_skills:
            # .get() avoids inserting empty entries for skills nobody masters
            hits.update(self.candidates_by_skill.get(skill, ()))

        by_id = {candidate['id']: candidate for candidate in candidates}

        return [by_id[cid] for cid, count in hits.most_common()
                if count >= nreq and cid in by_id]

    def best(self, candidates, sample_size, knownskills):
        """Return the ``sample_size`` candidates with the highest known score.

        The sort is stable, so candidates with equal known scores keep their
        relative input order.
        """
        def known_score(candidate):
            return self.get_known_score(candidate, knownskills)

        ranked = sorted(candidates, key=known_score, reverse=True)

        return ranked[:sample_size]

    def sample(self, candidates, sample_size):
        """Return a random sample of ``sample_size`` candidates.

        When ``sample_size`` is larger than the population, the population
        list itself is returned.
        """
        if sample_size > len(candidates):
            return candidates

        return random.sample(candidates, sample_size)

    def oracle(self, candidates, best_score, skills, pweight):
        """Evaluate the real, normalized score of each candidate in the list.

        Returns a list of ``[id, candidate, score]`` where score is the real
        score divided by ``best_score``. When ``best_score`` is 0 the scores
        are left unnormalized instead of raising ZeroDivisionError.
        """
        # Normalization factor. Set to 1 to denormalize data
        nf = 1/best_score if best_score else 1.0

        return [[candidate['id'],
                 candidate,
                 self.get_score(candidate, skills, pweight)*nf]
                for candidate in candidates]

    def best_pick(self, candidates, nreq, required_skills, useful_skills):
        """Best pick selection algorithm.

        Optionally filter by the number of required skills, then keep the
        ``self.keep`` top-most candidates based on their known score.
        ``useful_skills`` is accepted for signature parity and not used.
        """
        shortlisted = self.filter(candidates, nreq, required_skills)

        return self.best(shortlisted, self.keep, required_skills)

    def random_pick(self, candidates, nreq, required_skills, useful_skills):
        """Random selection algorithm.

        Optionally filter by the number of required skills, then keep a
        random sample of ``self.keep`` of the remaining applicants.
        ``useful_skills`` is accepted for signature parity and not used.
        """
        shortlisted = self.filter(candidates, nreq, required_skills)

        return self.sample(shortlisted, self.keep)

    def twopass_pick(self, candidates, nreq, required_skills, useful_skills):
        """2-pass selection algorithm.

        Optionally filter by the number of required skills, then keep twice
        as many applicants as needed based on their known score. A second
        pass keeps the ``self.keep`` top-most of those, scored on a random
        subset of required and useful skills of the same size as the
        required set.
        """
        shortlisted = self.filter(candidates, nreq, required_skills)
        first_pass = self.best(shortlisted, self.keep*2, required_skills)

        # The second pass has some partial oracle powers
        skills = required_skills+useful_skills
        second_look = random.sample(skills, len(required_skills))

        return self.best(first_pass, self.keep, second_look)

    def sample_best_score(self, candidates, skills, pweight):
        """Return the best real score in a set of applicants (at least 0.0)."""
        scores = [self.get_score(candidate, skills, pweight)
                  for candidate in candidates]

        # The leading 0.0 is the floor and also covers an empty population.
        return max([0.0] + scores)


#print(candidates)
#print(candidates_by_skill)

# Simulation parameters
sel = 100                   # applicants examined in each experiment
keep = 5                    # applicants retained by each selection algorithm
nskills = 50                # number of distinct skills in the model
n_required_skills = 10      # required skills known to the recruiter
n_useful_skills = 5         # extra skills that matter but are unknown to the recruiter
skills_per_candidate = 20
personal_weight = .3        # weight of the hidden 'personal fit' in the real score
min_req = 0
# Values of min_req swept by the main loop: one output row per value.
# Other parameters (n_required_skills, n_useful_skills, personal_weight)
# can be swept the same way by assigning them from 'condition' instead.
min_req_sweep = range(1, 20+1)

print("Each candidate has", skills_per_candidate, "skills among", nskills)
print("Each candidate has a hidden 'personal fit' for the position")
print("'personal fit' will count for ", int(personal_weight*100), "% in the success in that position", sep="")
print("Recruiters are only aware of candidate skills at 50% or above")
print("Recruiters are only aware of", n_required_skills, "required skills")
print("Recruiters will examine", sel, "candidates")
print("Recruiters will keep at most", keep, "candidates")
# Report the range actually used by the loop rather than the initial min_req
print("Recruiters will keep only candidates with at least N required skills, for N from",
      min_req_sweep[0], "to", min_req_sweep[-1])
print("Success in that position requires", n_useful_skills, "more skills")


# aggregate[row][column][statistic] -> sum of that statistic over all runs;
# the extra 'count' statistic is the number of runs with a non-empty selection
aggregate = defaultdict(lambda : defaultdict(lambda : defaultdict(float)))
rows = set()
cols = set()
datas = set()

# Main loop. run experiment 'loops' time and collect results
loops = 2000
for run in range(loops):
    exp = Experiment(nskills, sel, keep,
                    skills_per_candidate)

    # Progress indicator on stderr so it does not pollute the report
    print("\r", run+1,"/"+str(loops), end="",sep="",file=sys.stderr)
    sys.stderr.flush()

    for condition in min_req_sweep:
        min_req = condition
        message = "at least {:2d} skills".format(min_req)

        result_data = exp.collect_sample_data(message, min_req,
                                              n_required_skills, n_useful_skills,
                                              personal_weight)

        for rowname, row in result_data.items():
            rows.add(rowname)
            for colname, col in row.items():
                cols.add(colname)

                # An algorithm that selected nobody does not contribute
                if not col:
                    continue

                cell = aggregate[rowname][colname]
                cell['count'] += 1
                for k, v in exp.mmnnm(col).items():
                    datas.add(k)
                    cell[k] += v

print()


#
# Formatted output
#
sorted_cols = sorted(cols)
sorted_datas = sorted(datas)

if rows:
    # First header line: algorithm names, padded so they span their columns
    print("{:28s}".format(""), end=", ")
    for ch in sorted_cols:
        print("{:6s}, {:27s},".format("",ch),","*(len(datas)-1), sep="", end=" ")
    print()
    # Second header line: statistic names under each algorithm
    print("{:28s}".format(""), end=", ")
    for ch in sorted_cols:
        print("{:6s}".format("cnt"), end=", ")
        for k in sorted_datas:
            print("{:6s}".format(k), end=", ")
    print()

for row in sorted(rows):
    print("{:28}".format(row), end=", ")
    for col in sorted_cols:
        data = aggregate[row][col]
        count = data['count']

        print("{:6d}".format(int(count)), end=", ")
        for k in sorted_datas:
            if count:
                # Average of the per-run statistic over the contributing runs
                print("{:6.3f}".format(data[k]/count), end=", ")
            else:
                print("{:6.3s}".format(""), end=", ")
    print()

print()
	#!/usr/bin/python3

	# A quick-and-dirty script modeling the recruiting process.
	# by Sylvain Leroux

	# Each candidate has a set of skills
	# and a "personal fit" with the job offer.
	# A skill level is known by the recruiter only if it is >= 0.5
	# The "personal fit", in the [0,1) range, is not known to the recruiter
	# All skill values are in the (0,1] range
	# The job offer has a set of known required skills and a set of unknown required skills

	# The resulting score, used to judge the algorithms' outcome,
	# is the weighted average of skills and personal fit.

	import sys
	import random
	from collections import defaultdict
	from collections import Counter
	import math
	import operator


	# NOTE(review): every line of this copy of the class sits at a single tab
	# stop, so the nesting of the statements below is not expressed by
	# indentation and the text does not parse as Python as it stands.
	#
	# Experiment holds a random pool of applicants and the selection
	# algorithms that are compared on that pool.
	class Experiment:
	# Initialize a new experiment by building the random set of applicants.
	# Each candidate is a dict with an 'id', a 'skills' dict (skill -> aptitude)
	# and a 'personal_fit' value.
	def __init__(self, nskills,
	ncandidates, keep,
	skills_per_candidate):
	self.nskills = nskills
	self.candidates = []
	self.candidates_by_skill = defaultdict(list)
	self.keep = keep

	for i in range(ncandidates):
	candidate = {
	'id': i,
	'skills': {},
	'personal_fit': random.random()
	}
	sn = skills_per_candidate #random.randrange(1,11)
	for s in random.sample(range(nskills), sn):
	aptitude = 1-random.random() # in (0, 1] range

	candidate['skills'][s] = aptitude
	if aptitude >= 0.5: # Recruiters have only access to skills at 50% or above
	self.candidates_by_skill[s].append(i)
	self.candidates.append(candidate)

	# some_skills = random.sample(range(nskills), reqskills+nuseful)
	#
	# self.required_skills = some_skills[0:reqskills]
	# self.useful_skills = some_skills[reqskills:]

	# Omniscient oracle. Compute the real score:
	# (1-pweight) * mean aptitude over 'skills' + pweight * personal fit.
	# NOTE(review): divides by len(skills), so an empty list raises
	# ZeroDivisionError.
	def get_score(self, candidate, skills, pweight):
	score = 0.0

	for skill in skills:
	if skill in candidate['skills']:
	score += candidate['skills'][skill]

	return (1-pweight)*score/len(skills) + \
	pweight*candidate['personal_fit']

	# Inaccurate oracle. Compute a score based on the recruiter's knowledge:
	# only aptitudes >= .5 are counted.
	# NOTE(review): divides by len(knownskills), so an empty list raises
	# ZeroDivisionError.
	def get_known_score(self, candidate, knownskills):
	score = 0.0

	for skill in knownskills:
	if skill in candidate['skills'] and candidate['skills'][skill] >= .5:
	score += candidate['skills'][skill]

	return score/len(knownskills)



	# Aggregate mean, max and min data for a selection sample.
	# 'applicants' yields (id, candidate, score) triples.
	def mmnnm(self, applicants):
	sum = 0
	n = 0
	# Any comparison with NaN is false, so the first score always replaces
	# these initial values
	max = float("NaN")
	min = float("NaN")
	for _,_, score in applicants:
	sum += score
	n += 1
	if not score < max:
	max = score
	if not score > min:
	min = score

	return { "mean": sum/n if n != 0 else float('NaN'),
	"max": max,
	"min": min,
	"n":n}

	# Run the selection algorithms on the precomputed applicants population.
	# Build the sets of required and useful skills based on the given
	# parameters. The result maps tag -> algorithm name -> oracle output.
	def collect_sample_data(self, tag, nreq, knownskills, nuseful, pweight):
	result = defaultdict(dict)

	some_skills = random.sample(range(self.nskills), knownskills+nuseful)

	required_skills = some_skills[0:knownskills]
	useful_skills = some_skills[knownskills:]
	skills = required_skills+useful_skills
	best_score = self.sample_best_score(self.candidates,skills,pweight)

	for name, method in (("random choice", self.random_pick),
	("best fit", self.best_pick),
	("2-pass best", self.twopass_pick)):
	found = method(self.candidates,nreq,required_skills, useful_skills)
	result[tag][name] = \
	self.oracle(found, best_score,skills,pweight)

	return result

	# Filter out unwanted candidates: when nreq > 0, keep those listed in
	# candidates_by_skill for at least nreq of the required skills.
	def filter(self, candidates, nreq, required_skills):
	if nreq > 0:
	# Count, per candidate id, the number of required skills it is listed for
	subsel = Counter()
	for skill in required_skills:
	subsel += Counter(self.candidates_by_skill[skill])

	m = {candidate['id']: candidate for candidate in candidates}
	candidates = [m[k] for k,v in subsel.most_common() if v >= nreq and k in m]

	return candidates

	# Return the sample_size candidates with the highest known score
	def best(self, candidates, sample_size, knownskills):
	l = [[ candidate['id'],
	candidate,
	self.get_known_score(candidate, knownskills) ] for candidate in candidates]
	l = sorted(l, key=operator.itemgetter(2), reverse=True)

	return [candidate for _, candidate, _ in l[:sample_size]]

	# Return a random sample, or all applicants if the
	# sample_size is larger than the population size
	def sample(self, candidates, sample_size):
	if sample_size > len(candidates):
	return candidates

	return random.sample(candidates, sample_size)

	# Evaluate the score for each candidate in the list.
	# The oracle has full knowledge to perform that task.
	# NOTE(review): 1/best_score raises ZeroDivisionError when best_score is 0.
	def oracle(self, candidates, best_score, skills, pweight):
	nf = 1/best_score # Normalization factor. Set to 1 to denormalize data

	return [[candidate['id'],
	candidate,
	self.get_score(candidate, skills, pweight)*nf]
	for candidate in candidates]

	# Best pick selection algorithm
	#
	# optionally filter by the number of required skills
	# then select the top-most candidates based on their known score
	def best_pick(self, candidates, nreq,required_skills, useful_skills):
	candidates = self.filter(candidates, nreq, required_skills)
	candidates = self.best(candidates, self.keep, required_skills)

	return candidates

	# Random selection algorithm
	#
	# optionally filter by the number of required skills
	# then select a random sample of the remaining applicants
	def random_pick(self, candidates, nreq,required_skills, useful_skills):
	candidates = self.filter(candidates, nreq, required_skills)
	candidates = self.sample(candidates, self.keep)

	return candidates

	# 2-pass selection algorithm
	#
	# optionally filter by the number of required skills
	# then select twice as many applicants as needed, with the top-most score,
	# and perform a second selection pass, keeping the top-most candidates
	# based on a possibly different set of known skills.
	def twopass_pick(self, candidates, nreq,required_skills, useful_skills):
	candidates = self.filter(candidates, nreq, required_skills)
	candidates = self.best(candidates, self.keep*2, required_skills)

	skills = required_skills+useful_skills # The second pass has some partial oracle powers
	candidates = self.best(candidates, self.keep, random.sample(skills, len(required_skills)))

	return candidates

	# Find the best (real) score in a set of applicants; 0.0 if none is higher
	def sample_best_score(self, candidates, skills,pweight):
	best_score = 0.0

	for candidate in candidates:
	score = self.get_score(candidate, skills,pweight)
	if score > best_score:
	best_score = score

	return best_score


	#print(candidates)
	#print(candidates_by_skill)

	# NOTE(review): this copy of the driver script sits at a single tab stop,
	# so the nesting of the loops below is not expressed by indentation.
	#
	# Simulation parameters
	sel = 100
	keep = 5
	nskills = 50
	n_required_skills = 10
	n_useful_skills = 5
	skills_per_candidate = 20
	personal_weight = .3
	min_req = 0

	# NOTE(review): the banner below prints min_req as set above (0), while
	# the main loop reassigns it from 1 to 20.
	print("Each candidate has", skills_per_candidate, "skills among", nskills)
	print("Each candidate has a hidden 'personal fit' for the position")
	print("'personal fit' will count for ", int(personal_weight*100), "% in the success in that position", sep="")
	print("Recruiters are only aware of candidate skills at 50% of above")
	print("Recruiters are only aware of ", n_required_skills, "required skills")
	print("Recruiters will examine", sel, "candidates")
	print("Recruiters will keep at most", keep, "candidates")
	print("Recruiters will keep only candidates with", min_req, "required skills or more")
	print("Success in that position require", n_useful_skills, "more skills")


	# aggregate[row][column][statistic] accumulates sums over all runs
	aggregate = defaultdict(lambda : defaultdict(lambda : defaultdict(float)))
	rows = set()
	cols = set()
	datas = set()

	# Main loop. Run the experiment 'loops' times and collect results
	loops = 2000
	for run in range(loops):
	exp = Experiment(nskills, sel, keep,
	skills_per_candidate);

	#print("required skills")
	#print(exp.required_skills)

	# Progress indicator, written to stderr
	print("\r", run+1,"/"+str(loops), end="",sep="",file=sys.stderr)
	sys.stderr.flush()

	# Sweep one parameter; the commented-out pairs are alternative sweeps
	for condition in range(1,20+1):
	# n_required_skills = condition
	# message = "{:2d} skills".format(condition)
	# n_useful_skills = condition
	# message = "{:2d} extra useful skills".format(condition)
	min_req = condition
	message = "at least {:2d} skills".format(min_req)
	# personal_weight = condition*5.0/100.0
	# message = "w = {:4.0%}".format(personal_weight)

	result_data = exp.collect_sample_data(message, min_req,
	n_required_skills, n_useful_skills,
	personal_weight)

	for rowname, row in result_data.items():
	rows.add(rowname)
	for colname, col in row.items():
	cols.add(colname)

	# Only non-empty selections are counted and aggregated
	if col:
	aggregate[rowname][colname]['count'] += 1
	for k, v in exp.mmnnm(col).items():
	datas.add(k)
	aggregate[rowname][colname][k] += v

	print()


	#
	# Formatted output
	#
	need_header = True
	for row in sorted(rows):
	# The two header lines are printed once, before the first row
	if need_header:
	need_header = False
	print("{:28s}".format(""), end=", ")
	for ch in sorted(cols):
	print("{:6s}, {:27s},".format("",ch),","*(len(datas)-1), sep="", end=" ")
	print()
	print("{:28s}".format(""), end=", ")
	for ch in sorted(cols):
	print("{:6s}".format("cnt"), end=", ")
	for k in sorted(datas):
	print("{:6s}".format(k), end=", ")
	print()

	print("{:28}".format(row), end=", ")
	for col in sorted(cols):
	data = aggregate[row][col]
	count = aggregate[row][col]['count']

	print("{:6d}".format(int(count)), end=", ")
	for k in sorted(datas):
	# Each accumulated statistic is divided by 'count'; blank when count is 0
	if count:
	print("{:6.3f}".format(data[k]/count), end=", ")
	else:
	print("{:6.3s}".format(""), end=", ")
	print()

	print()