Skip to content

Instantly share code, notes, and snippets.

@dennisdv1
Created Dec 2, 2020
Embed
What would you like to do?
import nl_core_news_sm
# Load the pre-trained Dutch language model (nl_core_news_sm) once at module
# level; the resulting `nlp` callable is applied to every extracted resume text.
nlp = nl_core_news_sm.load()
# Type of resume files to process: 'pdf' for PDF resumes, or 'doc' / 'docx'
# for Word resumes.
extension = 'pdf'
def create_tokenized_texts_list(extension):
    '''Tokenize every resume of the requested file type in the CV directory.

    The resumes are read from the ``CV`` sub-directory of ``PROJECT_DIR``.
    The text of each matching file is extracted and passed through ``nlp``.

    Args:
        extension: File type to process. ``'pdf'`` selects ``.pdf`` files,
            ``'docx'`` selects ``.docx`` files, and ``'doc'`` (or the literal
            ``'doc(x)'``) selects both ``.doc`` and ``.docx`` files. Matching
            is case-insensitive and a leading dot is ignored.

    Returns:
        A ``(resume_texts, resume_names)`` tuple of two lists of equal length.
        ``resume_texts[i]`` is the result of ``nlp`` for the i-th resume and
        ``resume_names[i]`` is the capitalized part of that file's name
        before the first underscore (without the file extension).

    Raises:
        ValueError: If ``extension`` is not one of the supported file types.
    '''
    # Validate first so an unsupported type fails loudly instead of returning
    # a list of names with no matching texts.
    requested = extension.lower().lstrip('.')
    if requested == 'pdf':
        suffixes = ('.pdf',)
    elif requested == 'docx':
        suffixes = ('.docx',)
    elif requested in ('doc', 'doc(x)'):
        suffixes = ('.doc', '.docx')
    else:
        raise ValueError(
            f"Unsupported resume extension {extension!r}; "
            "expected 'pdf', 'doc' or 'docx'"
        )
    extract_text = (
        extract_text_from_pdf if requested == 'pdf' else extract_text_from_word
    )

    cv_dir = os.path.join(PROJECT_DIR, 'CV')
    resume_texts, resume_names = [], []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output stable between runs.
    for resume in sorted(os.listdir(cv_dir)):
        # Match on the real file suffix, not on a substring anywhere in the
        # name (e.g. 'pdf_notes.txt' must not be treated as a PDF).
        if not resume.lower().endswith(suffixes):
            continue
        resume_texts.append(nlp(extract_text(os.path.join(cv_dir, resume))))
        # Drop the extension before splitting so a file without an underscore
        # ('john.pdf') yields 'John' rather than 'John.pdf'.
        stem = os.path.splitext(resume)[0]
        resume_names.append(stem.split('_')[0].capitalize())
    return resume_texts, resume_names
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment