@kmike
Forked from turicas/nltk_bug_unicode_corpora.py
#!/usr/bin/env python2
# coding: utf-8
import nltk


def get_available_corpora():
    """Yield (name, reader) pairs for every corpus reader exposed by nltk.corpus."""
    for element in dir(nltk.corpus):
        if element[0] == '_':
            continue
        elements_type = str(type(getattr(nltk.corpus, element)))
        if elements_type.startswith("<class 'nltk.corpus."):
            yield element, getattr(nltk.corpus, element)


def get_types_of_words(corpus):
    """Return the distinct Python types (str/unicode) of tokens from corpus.words()."""
    types = set()
    for file_id in corpus.fileids():
        for word in corpus.words(file_id):
            types.add(type(word))
    return list(types)


def main():
    print '=' * 30
    print 'NLTK version:', nltk.__version__
    print '=' * 30, "\n"

    corpora = get_available_corpora()
    for corpus_name, corpus in corpora:
        print corpus_name
        print '-' * 30
        # Report the token types returned by words(); some readers lack this method.
        try:
            types_of_words = get_types_of_words(corpus)
            print 'words:', types_of_words
        except Exception as e:
            print 'words: ', e
        # Report the type returned by raw(); again, not every reader supports it.
        try:
            raw_type = type(corpus.raw())
            print 'raw:', raw_type
        except Exception as e:
            print 'raw: ', e
        # Unload the corpus to free memory before moving on to the next one.
        if hasattr(corpus, '_unload'):
            corpus._unload()
        print ""


if __name__ == '__main__':
    main()
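For a quick spot check of a single corpus, the same idea collapses to a one-liner. A minimal sketch, assuming the brown corpus data has already been fetched with nltk.download(); 'brown' is used here only as an example:

# Spot-check the token types of one corpus (Python 2, NLTK 2.0.4).
import nltk
print set(type(w) for w in nltk.corpus.brown.words())
# Expected here: set([<type 'unicode'>]), matching the brown entry in the output below.

The output of the full script on NLTK 2.0.4: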
==============================
NLTK version: 2.0.4
==============================
abc
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
alpino
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
brown
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
cess_cat
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
cess_esp
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
cmudict
------------------------------
words: words() takes exactly 1 argument (2 given)
raw: local variable 'fileids' referenced before assignment
comtrans
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
conll2000
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
conll2002
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
conll2007
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
dependency_treebank
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
floresta
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
gazetteers
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
genesis
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
gutenberg
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
ieer
------------------------------
words: 'IEERCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
inaugural
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
indian
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
ipipan
------------------------------
words:
**********************************************************************
Resource u'corpora/ipipan' not found. Please use the NLTK
Downloader to obtain the resource: >>> nltk.download()
Searched in:
- '/Users/kmike/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
raw:
**********************************************************************
Resource u'corpora/ipipan' not found. Please use the NLTK
Downloader to obtain the resource: >>> nltk.download()
Searched in:
- '/Users/kmike/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
jeita
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
knbc
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
lin_thesaurus
------------------------------
words: 'LinThesaurusCorpusReader' object has no attribute 'words'
raw: 'LinThesaurusCorpusReader' object has no attribute 'raw'
mac_morpho
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
machado
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
movie_reviews
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
names
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
nombank
------------------------------
words: 'NombankCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
nombank_ptb
------------------------------
words: 'NombankCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
nps_chat
------------------------------
words: [<type 'unicode'>, <type 'str'>]
raw: <type 'unicode'>
pl196x
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
ppattach
------------------------------
words: 'PPAttachmentCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
propbank
------------------------------
words: 'PropbankCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
propbank_ptb
------------------------------
words: 'PropbankCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
ptb
------------------------------
words: []
raw: concat() expects at least one object!
qc
------------------------------
words: 'StringCategoryCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
reuters
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
rte
------------------------------
words: [<type 'unicode'>, <type 'str'>]
raw: <type 'unicode'>
semcor
------------------------------
words: [<type 'str'>]
raw: <type 'unicode'>
senseval
------------------------------
words: 'SensevalCorpusReader' object has no attribute 'words'
raw: <type 'unicode'>
shakespeare
------------------------------
words: [<type 'str'>]
raw: <type 'unicode'>
sinica_treebank
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
state_union
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
stopwords
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
swadesh
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
switchboard
------------------------------
words: words() takes exactly 1 argument (2 given)
raw: 'SwitchboardCorpusReader' object has no attribute 'raw'
timit
------------------------------
words: No such file or directory: u'/Users/kmike/nltk_data/corpora/timit/dr1-fvmh0/sa1.phn.wrd'
raw: 'TimitCorpusReader' object has no attribute 'raw'
timit_tagged
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
toolbox
------------------------------
words: []
raw: raw() takes exactly 2 arguments (1 given)
treebank
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
treebank_chunk
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
treebank_raw
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
udhr
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
verbnet
------------------------------
words: [<type 'str'>]
raw: <type 'unicode'>
webtext
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
wordnet
------------------------------
words: 'WordNetCorpusReader' object has no attribute 'words'
raw: 'WordNetCorpusReader' object has no attribute 'raw'
wordnet_ic
------------------------------
words: 'WordNetICCorpusReader' object has no attribute 'words'
raw: 'WordNetICCorpusReader' object has no attribute 'raw'
words
------------------------------
words: [<type 'unicode'>]
raw: <type 'unicode'>
ycoe
------------------------------
words: No such file or directory: u'/Users/kmike/nltk_data/corpora/ycoe/psd'
raw: No such file or directory: u'/Users/kmike/nltk_data/corpora/ycoe/psd'