Skip to content

Instantly share code, notes, and snippets.

@sureshgorakala
Created February 8, 2017 01:26
Show Gist options
  • Save sureshgorakala/47bf062020eebf71fea9a653d71e2081 to your computer and use it in GitHub Desktop.
Save sureshgorakala/47bf062020eebf71fea9a653d71e2081 to your computer and use it in GitHub Desktop.
import nltk
# Read the text corpus into a list of lines (without trailing newlines).
import os

# open() does not expand "~" on its own, so resolve the home directory first.
path = os.path.expanduser("~/textCourpus.txt")
# The context manager closes the file handle even if reading fails.
with open(path, 'r') as f:
    lines = [line.rstrip('\n') for line in f]
# Bare expressions: they only display a value in an interactive session.
type(lines)
len(lines)
# Sentence tokenizing
from nltk.tokenize import sent_tokenize

# Fetch the required data BEFORE the first tokenizer call: sent_tokenize needs
# the Punkt model, and the stopword list is read later in this script.
# Naming the packages avoids the interactive downloader that a bare
# nltk.download() opens, which would block a non-interactive run.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Tokenize the fifth line once and reuse the result below.
sentences = sent_tokenize(lines[4])
# Bare expressions: they only display a value in an interactive session.
lines[4]
sentences
len(sentences)
sentences[0]
sentences[1]
# Word-level tokenization of a single sentence, shown two ways.
from nltk.tokenize import word_tokenize

# Pick the second sentence found on the fifth line of the corpus.
_fifth_line_sentences = sent_tokenize(lines[4])
sent = _fifth_line_sentences[1]

# Option 1: the word_tokenize helper function.
word_tokenize(sent)
type(word_tokenize(sent))

# Option 2: an explicit TreebankWordTokenizer instance.
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sent)
# Frequency distribution over the lower-cased tokens of `sent`.
from nltk.probability import FreqDist

_lowered_tokens = (token.lower() for token in word_tokenize(sent))
fdist = FreqDist(_lowered_tokens)

# Character count of every token in `sent`.
list(map(len, word_tokenize(sent)))
# Other topics: collocations, word-sense disambiguation, co-reference
# Stopwords
from nltk.corpus import stopwords

# A set gives constant-time membership tests in the filter below.
english_stops = set(stopwords.words('english'))
# NLTK's English stopword list is lower-case, so compare each token
# case-insensitively; otherwise capitalised stopwords such as "The" would
# not be filtered out. The original token casing is kept in the result.
[word for word in word_tokenize(sent) if word.lower() not in english_stops]
# Bare expressions: they only display a value in an interactive session.
english_stops
stopwords.fileids()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment