Skip to content

Instantly share code, notes, and snippets.

@Lakshmi-1212
Last active June 7, 2022 10:21
Show Gist options
  • Save Lakshmi-1212/00c7e05c62653e5beb9cdfb902d2af70 to your computer and use it in GitHub Desktop.
def extract_keywords(text, ignore_words=None,
                     min_word_length=0,
                     ignore_numbers=True,
                     ignore_case=True):
    """Extract keyword tokens from *text*.

    Tokenizes the text with NLTK, then drops punctuation, English stopwords,
    caller-supplied ignore words, too-short tokens and (optionally) pure-digit
    tokens.

    Args:
        text: Input string to extract keywords from.
        ignore_words: Extra words to exclude (matched case-insensitively).
            Defaults to none.
        min_word_length: Minimum length a token must have to be kept.
        ignore_numbers: If True, drop tokens consisting only of digits.
        ignore_case: If True, return all keywords lower-cased.

    Returns:
        List of keyword strings, in original token order.

    Note:
        Requires NLTK's 'punkt' tokenizer and 'stopwords' corpus to be
        downloaded.
    """
    # Fix: avoid the shared mutable-default-argument pitfall (`ignore_words=[]`).
    if ignore_words is None:
        ignore_words = []
    # Remove words with special characters: keep only printable ASCII.
    filtered_text = ''.join(filter(lambda ch: ch in string.printable, text))
    # Create word tokens from the cleaned text string.
    tokens = word_tokenize(filtered_text)
    # Punctuation tokens to be ignored (word_tokenize emits these as tokens).
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '.', '--', '-', '#', '!', '*', '"', '%']
    # English stopwords to be ignored.
    stop_words = stopwords.words('english')
    # Normalize user-supplied ignore words to lower case for matching.
    ignore_words_lower = [w.lower() for w in ignore_words]
    # Fix: use a set for O(1) membership tests instead of scanning a ~200-entry
    # list once per token.
    all_ignored_words = set(punctuations + stop_words + ignore_words_lower)
    # Keep tokens that are not ignored and meet the minimum length.
    keywords = [word for word in tokens
                if word.lower() not in all_ignored_words
                and len(word) >= min_word_length]
    # Optionally drop keywords consisting only of digits.
    if ignore_numbers:
        keywords = [kw for kw in keywords if not kw.isdigit()]
    # Optionally normalize the result to lower case.
    if ignore_case:
        keywords = [kw.lower() for kw in keywords]
    return keywords
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment