Last active
June 7, 2022 10:21
-
-
Save Lakshmi-1212/00c7e05c62653e5beb9cdfb902d2af70 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_keywords(text, ignore_words=None,
                     min_word_length=0,
                     ignore_numbers=True,
                     ignore_case=True):
    """Extract keyword tokens from ``text``.

    The text is stripped of characters outside ``string.printable``,
    tokenized with ``word_tokenize``, and then filtered: tokens whose
    lower-cased form is a listed punctuation mark, an English stop word or
    one of ``ignore_words`` are dropped, as are tokens shorter than
    ``min_word_length``.

    Args:
        text: The input string to extract keywords from.
        ignore_words: Optional iterable of extra words to exclude. Matching
            is case-insensitive (both sides are lower-cased). ``None`` (the
            default) means no extra words.
        min_word_length: Minimum number of characters a token must have to
            be kept.
        ignore_numbers: When true, tokens for which ``str.isdigit()`` is
            true are removed.
        ignore_case: When true, the returned keywords are lower-cased.
            Filtering against the ignore set is case-insensitive regardless
            of this flag.

    Returns:
        A list of keyword strings in the order the tokenizer produced them;
        duplicates are preserved.
    """
    # Drop individual characters that are not in string.printable (this
    # removes characters, not whole words).
    printable_chars = set(string.printable)
    filtered_text = ''.join(ch for ch in text if ch in printable_chars)

    # Create word tokens from the text string.
    # NOTE(review): word_tokenize / stopwords are presumably NLTK's and must
    # be imported at module level - confirm against the file's imports.
    tokens = word_tokenize(filtered_text)

    # Punctuation tokens to be ignored.
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '.', '--', '-', '#',
                    '!', '*', '"', '%']

    # Collect everything to be ignored in one set so that each membership
    # test below is O(1) instead of a scan over a concatenated list.
    all_ignored_words = set(punctuations)
    all_ignored_words.update(stopwords.words('english'))
    if ignore_words is not None:
        all_ignored_words.update(word.lower() for word in ignore_words)

    # Keep tokens that are not ignored and are long enough.
    keywords = [word for word in tokens
                if word.lower() not in all_ignored_words
                and len(word) >= min_word_length]

    # Remove keywords consisting only of digits.
    if ignore_numbers:
        keywords = [keyword for keyword in keywords if not keyword.isdigit()]

    # Return all keywords in lower case if case is not of significance.
    if ignore_case:
        keywords = [keyword.lower() for keyword in keywords]

    return keywords
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment