Created December 24, 2013 05:19
-
-
Save rpietro/8109076 to your computer and use it in GitHub Desktop.
Source code for chapter 3 of the NLTK book (http://nltk.org/book).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# code from http://nltk.org/book
#
# NOTE: this script targets Python 2 (`from urllib import urlopen` only
# exists there) and an NLTK release that still implements
# `nltk.clean_html` (NLTK 2.x).
#
# Bare expressions such as `type(raw)` or `raw[:75]` only echo a value when
# the lines are typed into an interactive interpreter; when the file is run
# as a script they are evaluated and discarded.
from __future__ import division

import pprint
import re
from urllib import urlopen

import nltk

# ## Plain text fetched over HTTP
url = "http://www.gutenberg.org/files/2554/2554.txt"  # crime and punishment
raw = urlopen(url).read()
type(raw)
len(raw)
raw[:75]

tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[:10]

text = nltk.Text(tokens)
type(text)
text[1020:1060]
text.collocations()

# Locate the start and end markers of the novel inside the download.
raw.find("PART I")
raw.rfind("End of Project Gutenberg's Crime")
# Keep only the span between the two hard-coded offsets (presumably the
# find/rfind results for the copy the author downloaded -- verify against
# the current file). The result must stay a string slice: indexing it any
# further would reduce the text to a single character.
raw = raw[5303:1157681]
raw.find("PART I")

## HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
html[:60]
raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens
# Hard-coded token window selected by the author.
tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')

# ## Web feed
import feedparser  # install from https://pypi.python.org/pypi/feedparser#downloads
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)
print(llog.entries)
post = llog.entries[2]
post.title
content = post.content[0].value
content[:70]
nltk.word_tokenize(nltk.clean_html(content))
# Same computation as the line above, written as a single expression.
nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))

# ## Local file
import os
os.chdir("/Users/rpietro/Desktop")
# Read the whole file at once; `with` closes the handle afterwards.
with open('peirce.txt') as f:
    raw = f.read()
type(raw)
print(raw)
# Reopen for line-by-line reading: a handle that has already been consumed
# by read() sits at end-of-file and would yield no lines.
with open('peirce.txt') as f:
    for line in f:
        print(line.strip())

import nltk
tokens = nltk.word_tokenize(raw)
type(tokens)
words = [w.lower() for w in tokens]
type(words)
vocab = sorted(set(words))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment