Skip to content

Instantly share code, notes, and snippets.

@turicas
Created September 28, 2012 18:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save turicas/3801534 to your computer and use it in GitHub Desktop.
Save turicas/3801534 to your computer and use it in GitHub Desktop.
Simple script that shows some encoding problems in NLTK's corpora
#!/usr/bin/env python2
# coding: utf-8
import nltk
from nltk.corpus import stopwords, machado, shakespeare
def get_available_corpora():
    """Collect the corpus objects exposed as attributes of ``nltk.corpus``.

    Each attribute name listed by ``dir(nltk.corpus)`` that does not start
    with an underscore is looked up; the attribute is kept when the string
    form of its type begins with ``<class 'nltk.corpus.``.

    Returns a list of ``(attribute_name, attribute_value)`` tuples in the
    order ``dir()`` produced the names.
    """
    type_prefix = "<class 'nltk.corpus."
    found = []
    for name in dir(nltk.corpus):
        # Names with a leading underscore are private/dunder attributes.
        if name[0] == '_':
            continue
        candidate = getattr(nltk.corpus, name)
        if not str(type(candidate)).startswith(type_prefix):
            continue
        found.append((name, candidate))
    return found
def get_types_of_words(corpus):
    """Return the distinct Python types of the words found in *corpus*.

    Walks every file id reported by ``corpus.fileids()`` and records
    ``type(word)`` for each item yielded by ``corpus.words(file_id)``.

    Returns a list holding one entry per distinct type. The order comes
    from set iteration and should not be relied on.
    """
    seen_types = set()
    for file_id in corpus.fileids():
        seen_types.update(type(token) for token in corpus.words(file_id))
    return list(seen_types)
def main():
print 'NLTK version:', nltk.__version__
corpora = get_available_corpora()
while corpora:
corpus_name, corpus = corpora.pop(0)
try:
types_of_words = get_types_of_words(corpus)
raw_type = type(corpus.raw())
except:
print '{}: interface problem!'.format(corpus_name)
else:
print '{}:'.format(corpus_name),
print ' words: ', types_of_words,
print ' raw: ', raw_type
if __name__ == '__main__':
main()
NLTK version: 2.0.1rc4
abc: words: [<type 'str'>] raw: <type 'str'>
alpino: words: [<type 'str'>] raw: <type 'str'>
brown: words: [<type 'str'>] raw: <type 'str'>
cess_cat: words: [<type 'str'>] raw: <type 'str'>
cess_esp: words: [<type 'str'>] raw: <type 'str'>
cmudict: interface problem!
comtrans: words: [<type 'str'>] raw: <type 'str'>
conll2000: words: [<type 'str'>] raw: <type 'str'>
conll2002: words: [<type 'unicode'>] raw: <type 'unicode'>
conll2007: interface problem!
dependency_treebank: words: [<type 'str'>] raw: <type 'str'>
floresta: words: [<type 'str'>] raw: <type 'str'>
gazetteers: words: [<type 'str'>] raw: <type 'str'>
genesis: words: [<type 'unicode'>] raw: <type 'unicode'>
gutenberg: words: [<type 'str'>] raw: <type 'str'>
ieer: interface problem!
inaugural: words: [<type 'str'>] raw: <type 'str'>
indian: words: [<type 'str'>] raw: <type 'str'>
ipipan: interface problem!
jeita: words: [<type 'unicode'>] raw: <type 'unicode'>
knbc: words: [<type 'unicode'>] raw: <type 'unicode'>
mac_morpho: words: [<type 'unicode'>] raw: <type 'unicode'>
machado: words: [<type 'unicode'>] raw: <type 'unicode'>
movie_reviews: words: [<type 'str'>] raw: <type 'str'>
names: words: [<type 'str'>] raw: <type 'str'>
nombank: interface problem!
nps_chat: words: [<type 'str'>] raw: <type 'str'>
pl196x: words: [<type 'str'>] raw: <type 'str'>
ppattach: interface problem!
propbank: interface problem!
qc: interface problem!
reuters: words: [<type 'str'>] raw: <type 'str'>
rte: words: [<type 'unicode'>, <type 'str'>] raw: <type 'str'>
semcor: words: [<type 'str'>] raw: <type 'str'>
senseval: interface problem!
shakespeare: words: [<type 'str'>] raw: <type 'str'>
sinica_treebank: words: [<type 'str'>] raw: <type 'str'>
state_union: words: [<type 'str'>] raw: <type 'str'>
stopwords: words: [<type 'str'>] raw: <type 'str'>
swadesh: words: [<type 'str'>] raw: <type 'str'>
switchboard: interface problem!
timit: interface problem!
timit_tagged: words: [<type 'str'>] raw: <type 'str'>
toolbox: interface problem!
treebank: words: [<type 'str'>] raw: <type 'str'>
treebank_chunk: words: [<type 'str'>] raw: <type 'str'>
treebank_raw: words: [<type 'str'>] raw: <type 'str'>
udhr: interface problem!
verbnet: words: [<type 'str'>] raw: <type 'str'>
webtext: words: [<type 'str'>] raw: <type 'str'>
wordnet: interface problem!
wordnet_ic: interface problem!
words: words: [<type 'str'>] raw: <type 'str'>
ycoe: interface problem!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment