Skip to content

Instantly share code, notes, and snippets.

@rpietro
Created December 24, 2013 05:19
Show Gist options
  • Save rpietro/8109076 to your computer and use it in GitHub Desktop.
Save rpietro/8109076 to your computer and use it in GitHub Desktop.
Source code for chapter 3 of the NLTK book (http://nltk.org/book)
# code from http://nltk.org/book
from __future__ import division
import nltk, re, pprint
from urllib import urlopen
# Download Crime and Punishment from Project Gutenberg and explore it as raw
# text, then as an NLTK token list (REPL-style: bare expressions below are
# interactive echoes and have no effect when run as a script).
url = "http://www.gutenberg.org/files/2554/2554.txt" # crime and punishment
raw = urlopen(url).read()
type(raw)
len(raw)
raw[:75]
tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[:10]
text = nltk.Text(tokens)
type(text)
text[1020:1060]
text.collocations()
raw.find("PART I")
raw.rfind("End of Project Gutenberg's Crime")
# Trim the Gutenberg header/footer so only the novel's body remains.
# NOTE: the stray " [1]" that followed this slice was a footnote marker
# carried over from the book's text; it indexed the slice and rebound `raw`
# to a single character, which broke the find() below. Removed.
raw = raw[5303:1157681]
raw.find("PART I")
## HTML
# Fetch a BBC News article, strip the markup, and run a concordance on the
# word 'gene' (bare expressions are interactive REPL echoes).
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
page = urlopen(url).read()
page[:60]
stripped = nltk.clean_html(page)  # drop tags, keep the visible text
article_tokens = nltk.word_tokenize(stripped)
article_tokens
article_tokens = article_tokens[96:399]  # cut navigation/boilerplate around the story body
text = nltk.Text(article_tokens)
text.concordance('gene')
import feedparser # install from https://pypi.python.org/pypi/feedparser#downloads
# Parse the Language Log Atom feed and inspect one post (REPL-style: bare
# expressions below are interactive echoes with no effect in a script).
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)
print llog.entries
post = llog.entries[2]  # pick the third entry in the feed
post.title
content = post.content[0].value  # the post body as raw HTML
content[:70]
# Strip the HTML and tokenize -- the second line inlines the same pipeline
# without the intermediate variables.
nltk.word_tokenize(nltk.clean_html(content))
nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))
import os
os.chdir("/Users/rpietro/Desktop")
f = open('peirce.txt')
raw = f.read()
type(raw)
print raw
for line in f:
print line.strip()
import nltk
tokens = nltk.word_tokenize(raw)
type(tokens)
words = [w.lower() for w in tokens]
type(words)
vocab = sorted(set(words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment