Basics of NLP using NLTK (tokenizing words and sentences, stop words, stemming, lemmatization)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
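
## These examples need a few NLTK data packages. If a call below raises a
## LookupError, run a one-time download (assumption: standard resource names;
## newer NLTK versions may also ask for 'punkt_tab'):
## import nltk
## nltk.download('punkt')      # sentence/word tokenizer models
## nltk.download('stopwords')  # stop word lists
## nltk.download('wordnet')    # dictionary backing the lemmatizer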
##Tokenizing - splitting a body of text into sentences and words.
##Part of Speech tagging - labeling each token with its grammatical role.
##Corpus - a body of text, singular. Corpora is the plural.
##Example: a collection of medical journals.
##Lexicon - words and their meanings.
##Example: an English dictionary. Note, however, that different fields have
##different lexicons. To a financial investor, the first meaning of the word
##"Bull" is someone who is confident about the market, whereas in the common
##English lexicon the first meaning is an animal. So there are special
##lexicons for financial investors, doctors, children, mechanics, and so on.
##Token - each "entity" that results from splitting text up by some rule.
##For example, each word is a token when a sentence is "tokenized" into words,
##and each sentence can be a token if you tokenize the sentences out of a
##paragraph.
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
print(sent_tokenize(EXAMPLE_TEXT))
print(word_tokenize(EXAMPLE_TEXT))
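## Expected output (Punkt tokenizer; exact tokens may vary across NLTK versions):
## ['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]
## Note that word_tokenize keeps "Mr." together but splits "shouldn't" into
## "should" and "n't".

## The Part of Speech tagging mentioned above is a one-liner in NLTK. A minimal
## sketch, assuming the 'averaged_perceptron_tagger' resource is downloaded:
## import nltk
## print(nltk.pos_tag(word_tokenize("Python is awesome")))
## e.g. [('Python', 'NNP'), ('is', 'VBZ'), ('awesome', 'JJ')]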
##Stop words - common words ("the", "a", "is", ...) that carry little meaning
##on their own and are often filtered out before further processing.
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
##the line below prints NLTK's built-in set of English stop words
##print(stop_words)
word_tokens = word_tokenize(example_sent)
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
##one-line equivalent of the loop above
##filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(word_tokens)
print(filtered_sentence)
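## Expected output (stop word list may vary across NLTK versions):
## ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
## ['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']
## 'This' survives because the membership test is case-sensitive; compare with
## w.lower() if capitalized stop words should be removed too.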
##Stemming is a kind of normalization: many variations of a word carry the
##same meaning apart from tense, and keeping a separate dictionary entry for
##every variant of every English word would be redundant and inefficient.
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))
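## All five variants collapse to a common stem; note the last is not a real word:
## python, python, python, python, pythonli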
new_text = "It is important to be very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))
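## Sample stems from the sentence above: 'import', 'veri', 'pythonli', 'poorli',
## 'onc'. The Porter stemmer happily produces tokens that are not dictionary
## words, which motivates the lemmatizing section below.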
#Lemmatizing
from nltk.stem import WordNetLemmatizer

##The major difference from stemming is that, as you saw above, stemming can
##create non-existent words, whereas lemmas are actual dictionary words.
##pos - part-of-speech parameter; if not specified, the default is noun ('n').
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", 'v'))