Skip to content

Instantly share code, notes, and snippets.

@backupbrain
Last active February 10, 2024 07:07
Show Gist options
  • Save backupbrain/f355126042513188c484db5ef91650cd to your computer and use it in GitHub Desktop.
Save backupbrain/f355126042513188c484db5ef91650cd to your computer and use it in GitHub Desktop.
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
# English stop words from the NLTK stopwords corpus, held in a set so the
# membership test in get_title_features is a hash lookup, not a list scan.
# NOTE(review): presumably requires the corpus to be installed beforehand
# (e.g. nltk.download("stopwords")) -- confirm for the target environment.
stop_words = set(stopwords.words("english"))
# "co-" / "co " prefix at the start of a word (Co-Founder, co-ceo, co founder).
# The word boundary keeps it from firing inside words such as "Cisco ".
_CO_PREFIX = re.compile(r"\b[Cc]o[- ]")

# Separators between the roles of a compound title.  "and" only counts as a
# separator when it is a whole word, so "Brand" or "Commander" are left alone.
_TITLE_SEPARATORS = re.compile(r"[,\-|&:/]|\band\b")


def get_first_title(title):
    """Return the first role named in a possibly compound job title.

    The title is cut at the first separator (``,``, ``-``, ``|``, ``&``,
    ``:``, ``/`` or the word ``and``) and the leading part is returned with
    surrounding whitespace removed.

    A leading "co-" / "co " prefix is dropped from words before splitting,
    so the hyphen in "Co-Founder" is not mistaken for a separator; the
    result for "Co-Founder" is therefore "Founder".

    >>> get_first_title("Co-Founder and CEO")
    'Founder'
    >>> get_first_title("Brand Manager")
    'Brand Manager'

    :param title: raw job title string.
    :return: the first role in the title, stripped of whitespace.
    """
    without_co_prefix = _CO_PREFIX.sub("", title)
    # Only the first piece is needed, so stop splitting after one cut.
    first_role = _TITLE_SEPARATORS.split(without_co_prefix, maxsplit=1)[0]
    return first_role.strip()
def get_title_features(title):
    """Build the feature dict for one job title.

    The title is tokenized with ``nltk.word_tokenize``, stop words are
    dropped, and the remaining words are lower-cased.  The result maps:

    * ``contains(<word>)`` -> True for every remaining word,
    * ``first(<word>)``    -> True for the first remaining word,
    * ``last(<word>)``     -> True for the last remaining word.

    An empty dict is returned when no words remain after filtering.

    :param title: job title string.
    :return: dict of feature name -> True.
    """
    tokens = nltk.word_tokenize(title)
    # Lower-case *before* the stop-word test.  The feature names are
    # lower-cased anyway, and the stop-word list is lower-case, so a
    # case-sensitive test would let "Of" / "The" through and emit features
    # such as contains(of) for "Director Of Communications" only.
    filtered_words = [
        token.lower() for token in tokens if token.lower() not in stop_words
    ]

    features = {'contains({})'.format(word): True for word in filtered_words}
    if filtered_words:
        features['first({})'.format(filtered_words[0])] = True
        features['last({})'.format(filtered_words[-1])] = True
    return features
## Build feature sets
# Each entry pairs a title's feature dict with its label, i.e. the
# (featureset, label) tuples the classifiers are trained and scored on.
# NOTE(review): raw_job_titles is not defined in the visible part of this
# file; presumably an iterable of dict-like records with "title",
# "responsibility" and "department" keys -- confirm where it is loaded.

# Responsibilities: records with no responsibility label are skipped.
responsibilities_features = [
    (get_title_features(record["title"]), record["responsibility"])
    for record in raw_job_titles
    if record["responsibility"] is not None
]

# Departments: records with no department label are skipped.
departments_features = [
    (get_title_features(record["title"]), record["department"])
    for record in raw_job_titles
    if record["department"] is not None
]
## Train classifiers
# Each feature list is cut in half by position: the first half is held out
# for scoring and the second half is used for training.
# NOTE(review): nothing here shuffles the lists, so the split follows the
# order of the input data -- confirm that order is not grouped by label.

# Responsibilities
r_size = len(responsibilities_features) // 2
r_test_set = responsibilities_features[:r_size]
r_train_set = responsibilities_features[r_size:]
responsibilities_classifier = nltk.NaiveBayesClassifier.train(r_train_set)
r_accuracy = nltk.classify.accuracy(responsibilities_classifier, r_test_set)
print("Responsibility classification accuracy: {}".format(r_accuracy))

# Departments
d_size = len(departments_features) // 2
d_test_set = departments_features[:d_size]
d_train_set = departments_features[d_size:]
departments_classifier = nltk.NaiveBayesClassifier.train(d_train_set)
d_accuracy = nltk.classify.accuracy(departments_classifier, d_test_set)
print("Department classification accuracy: {}".format(d_accuracy))
## Test classifier
# Classify one sample title with both trained classifiers and show the
# predicted labels.
title = "Director of Communications"
title_features = get_title_features(title)

responsibility = responsibilities_classifier.classify(title_features)
department = departments_classifier.classify(title_features)

print("Job title: '{}'".format(title))
print("Responsibility: '{}'".format(responsibility))
print("Department: '{}'".format(department))
## Grade classifier
# Report, as a rounded percentage, the probability each classifier assigns
# to its most likely label for the sample title.
graded_features = get_title_features(title)

# Responsibility
responsibility_dist = responsibilities_classifier.prob_classify(graded_features)
responsibility_probability = 100 * responsibility_dist.prob(
    responsibility_dist.max()
)
print("Responsibility confidence: {}%".format(round(responsibility_probability)))

# Department
department_dist = departments_classifier.prob_classify(graded_features)
department_probability = 100 * department_dist.prob(
    department_dist.max()
)
print("Department confidence: {}%".format(round(department_probability)))
@nogur9
Copy link

nogur9 commented Aug 2, 2022

I'm working on a similar problem and would like the dataset too.

@akashahmad427
Copy link

good working

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment