Skip to content

Instantly share code, notes, and snippets.

@Ghost---Shadow
Created December 11, 2015 05:01
Show Gist options
  • Save Ghost---Shadow/c361f2d6b4501f40648b to your computer and use it in GitHub Desktop.
Save Ghost---Shadow/c361f2d6b4501f40648b to your computer and use it in GitHub Desktop.
Using NLTK to replace all words in a string with their synonyms.
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from random import randint
import nltk.data
# Replace each word of `text` with a randomly chosen WordNet synonym of the
# same part of speech, leaving proper nouns, determiners and any word without
# a suitable synonym untouched, then print the result.

# Load a text file if required
text = "Pete ate a large cake. Sam has a big mouth."

# nltk.pos_tag returns Penn Treebank tags (NN, VBZ, JJ, RB, ...), whereas
# WordNet labels synsets with n / v / a / s / r.  Map the first letter of the
# tag to the WordNet part(s) of speech it corresponds to.  Adjectives need an
# explicit mapping: the tag starts with "J" but WordNet uses "a" (adjective)
# and "s" (satellite adjective).  Tags that are absent here (prepositions,
# pronouns, punctuation, ...) are never replaced.
WORDNET_POS_BY_TAG_INITIAL = {
    "N": frozenset({"n"}),
    "V": frozenset({"v"}),
    "J": frozenset({"a", "s"}),
    "R": frozenset({"r"}),
}
# Do not attempt to replace proper nouns or determiners
SKIPPED_TAGS = frozenset({"NNP", "NNPS", "DT"})

# Load the pretrained Punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split the text into sentences (not used by the replacement loop below,
# which works on the flat word list)
tokenized = tokenizer.tokenize(text)
# Get the list of words from the entire text
words = word_tokenize(text)
# Identify the parts of speech; each entry is a (word, tag) pair
tagged = nltk.pos_tag(words)

pieces = []
for word, tag in tagged:
    replacements = []
    # Only replace nouns with nouns, verbs with verbs etc.
    allowed_pos = WORDNET_POS_BY_TAG_INITIAL.get(tag[:1])
    if allowed_pos and tag not in SKIPPED_TAGS:
        for syn in wordnet.synsets(word):
            if syn.pos() in allowed_pos:
                # Synset names look like "lemma.pos.nn"; split from the right
                # so lemmas that themselves contain a dot stay intact, and
                # turn WordNet's underscores back into spaces.
                lemma = syn.name().rsplit(".", 2)[0]
                replacements.append(lemma.replace("_", " "))
    if replacements:
        # Choose a random replacement
        pieces.append(replacements[randint(0, len(replacements) - 1)])
    else:
        # If no replacement could be found, then just use the original word
        pieces.append(word)

# Keep the original output format: every token is preceded by one space.
output = "".join(" " + piece for piece in pieces)
print(output)
@chandreshiit
Copy link

Generated output is terrible "Pete ate a large coat . Sam have a big mouth "

@naikm2
Copy link

naikm2 commented Feb 5, 2018

how to extract date using nltk

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment