# dennisdv1/tokenize.py Secret

## tokenize.py
import nl_core_news_sm

# Load the pre-trained Dutch language model once, at import time; `nlp` is
# called on every extracted resume text in create_tokenized_texts_list.
nlp = nl_core_news_sm.load()

# Resume file type to process: 'pdf' for PDF resumes, or 'doc' for Word
# resumes (.doc and .docx).
# NOTE(review): presumably this is the value the caller passes to
# create_tokenized_texts_list (call site not shown here) -- confirm.
extension = 'pdf'

def create_tokenized_texts_list(extension):
    """Build parallel lists of parsed resume texts and candidate names.

    Scans the ``CV`` folder under ``PROJECT_DIR`` for resumes of the requested
    type, extracts the text of each file, passes it through ``nlp`` and takes
    the candidate name from the part of the file name before the first
    underscore (capitalized).

    Args:
        extension: Resume file type. ``'pdf'`` selects ``.pdf`` files,
            ``'docx'`` selects ``.docx`` files, and any other value containing
            ``'doc'`` (e.g. ``'doc'`` or ``'doc(x)'``) selects both ``.doc``
            and ``.docx`` files. Case and a leading dot are ignored.

    Returns:
        A ``(resume_texts, resume_names)`` tuple of two equally long lists,
        ordered by file name; ``resume_texts[i]`` is the ``nlp`` result for
        the resume belonging to ``resume_names[i]``.

    Raises:
        ValueError: If ``extension`` is neither ``'pdf'`` nor a ``doc``
            variant.
    """
    kind = extension.strip().lower().lstrip('.')

    # Resolve the suffixes and the extractor up front. Rejecting unknown types
    # here prevents returning names without matching texts.
    if kind == 'pdf':
        suffixes = ('.pdf',)
        extract_text = extract_text_from_pdf
    elif 'doc' in kind:
        suffixes = ('.docx',) if kind == 'docx' else ('.doc', '.docx')
        extract_text = extract_text_from_word
    else:
        raise ValueError(
            "Unsupported extension {!r}; expected 'pdf' or a 'doc' variant"
            .format(extension)
        )

    cv_dir = os.path.join(PROJECT_DIR, 'CV')
    resume_texts, resume_names = [], []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for resume in sorted(os.listdir(cv_dir)):
        # Match on the real file suffix, not a substring of the whole name,
        # so e.g. 'document_jan.pdf' is not treated as a Word file.
        if not resume.lower().endswith(suffixes):
            continue

        resume_texts.append(nlp(extract_text(os.path.join(cv_dir, resume))))

        # Drop the file extension first so a name without an underscore
        # ('john.pdf') yields 'John' rather than 'John.pdf'.
        stem = os.path.splitext(resume)[0]
        resume_names.append(stem.split('_')[0].capitalize())

    return resume_texts, resume_names
	import nl_core_news_sm

	# Load the pre-trained Dutch language model; `nlp` is called on every
	# extracted resume text in create_tokenized_texts_list.
	nlp = nl_core_news_sm.load()

	# Resume file type to process: 'pdf' for PDF resumes, or 'doc' for Word
	# resumes (.doc and .docx).
	# NOTE(review): presumably this is the value the caller passes to
	# create_tokenized_texts_list (call site not shown here) -- confirm.
	extension = 'pdf'

	def create_tokenized_texts_list(extension):
	    """Build parallel lists of parsed resume texts and candidate names.

	    Scans the ``CV`` folder under ``PROJECT_DIR`` for resumes of the requested
	    type, extracts the text of each file, passes it through ``nlp`` and takes
	    the candidate name from the part of the file name before the first
	    underscore (capitalized).

	    Args:
	        extension: Resume file type. ``'pdf'`` selects ``.pdf`` files,
	            ``'docx'`` selects ``.docx`` files, and any other value containing
	            ``'doc'`` (e.g. ``'doc'`` or ``'doc(x)'``) selects both ``.doc``
	            and ``.docx`` files. Case and a leading dot are ignored.

	    Returns:
	        A ``(resume_texts, resume_names)`` tuple of two equally long lists,
	        ordered by file name; ``resume_texts[i]`` is the ``nlp`` result for
	        the resume belonging to ``resume_names[i]``.

	    Raises:
	        ValueError: If ``extension`` is neither ``'pdf'`` nor a ``doc``
	            variant.
	    """
	    kind = extension.strip().lower().lstrip('.')

	    # Resolve the suffixes and the extractor up front. Rejecting unknown types
	    # here prevents returning names without matching texts.
	    if kind == 'pdf':
	        suffixes = ('.pdf',)
	        extract_text = extract_text_from_pdf
	    elif 'doc' in kind:
	        suffixes = ('.docx',) if kind == 'docx' else ('.doc', '.docx')
	        extract_text = extract_text_from_word
	    else:
	        raise ValueError(
	            "Unsupported extension {!r}; expected 'pdf' or a 'doc' variant"
	            .format(extension)
	        )

	    cv_dir = os.path.join(PROJECT_DIR, 'CV')
	    resume_texts, resume_names = [], []

	    # os.listdir returns entries in arbitrary order; sort for reproducible output.
	    for resume in sorted(os.listdir(cv_dir)):
	        # Match on the real file suffix, not a substring of the whole name,
	        # so e.g. 'document_jan.pdf' is not treated as a Word file.
	        if not resume.lower().endswith(suffixes):
	            continue

	        resume_texts.append(nlp(extract_text(os.path.join(cv_dir, resume))))

	        # Drop the file extension first so a name without an underscore
	        # ('john.pdf') yields 'John' rather than 'John.pdf'.
	        stem = os.path.splitext(resume)[0]
	        resume_names.append(stem.split('_')[0].capitalize())

	    return resume_texts, resume_names