-
-
Save dennisdv1/dbf36b10ae560c818a92ee58275dbe89 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import nl_core_news_sm
# Load the pre-trained Dutch language model; it is called on each extracted
# resume text in create_tokenized_texts_list.
nlp = nl_core_news_sm.load()

# File extension of the resumes to process: set to 'pdf' or to 'doc(x)'.
extension = 'pdf'
def create_tokenized_texts_list(extension):
    '''Tokenize every resume of the requested file type in the CV directory.

    Lists ``PROJECT_DIR/CV``, keeps the files whose name ends with the
    requested extension, extracts their text and passes it through ``nlp``.

    Args:
        extension: ``'pdf'`` for PDF resumes, or any value containing
            ``'doc'`` (``'doc'``, ``'docx'``, ``'doc(x)'``) for Word resumes.
            Case and a leading dot are ignored. ``'docx'`` selects only
            ``.docx`` files; the other Word values select ``.doc`` and
            ``.docx``.

    Returns:
        A tuple ``(resume_texts, resume_names)`` of two equally long lists:
        the ``nlp`` result for each resume, and the candidate name taken from
        the part of the file name before the first underscore, capitalized.

    Raises:
        ValueError: If ``extension`` is neither ``'pdf'`` nor a Word
            extension. (Previously this silently returned names without
            texts, leaving the two lists out of sync.)
    '''
    wanted = extension.lower().lstrip('.')

    # Pick the extractor and the accepted suffixes once, outside the loop.
    if wanted == 'pdf':
        suffixes = ('.pdf',)
        extract_text = extract_text_from_pdf
    elif 'doc' in wanted:
        suffixes = ('.docx',) if wanted == 'docx' else ('.doc', '.docx')
        extract_text = extract_text_from_word
    else:
        raise ValueError(
            "Unsupported extension %r: expected 'pdf' or 'doc(x)'" % (extension,)
        )

    cv_dir = os.path.join(PROJECT_DIR, 'CV')
    resume_texts, resume_names = [], []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for resume in sorted(os.listdir(cv_dir)):
        # Match on the real suffix: a substring test would also pick up
        # names such as 'pdf_notes.txt' or 'document.pdf' (for 'doc').
        if not resume.lower().endswith(suffixes):
            continue
        resume_texts.append(nlp(extract_text(os.path.join(cv_dir, resume))))
        resume_names.append(resume.split('_')[0].capitalize())

    return resume_texts, resume_names
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It's showing that os and PROJECT_DIR are not defined.