Skip to content

Instantly share code, notes, and snippets.

@cmstewart
Created August 11, 2015 14:43
Show Gist options
  • Save cmstewart/90b0046dabfefd6562b4 to your computer and use it in GitHub Desktop.
Save cmstewart/90b0046dabfefd6562b4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:ba3812163336b233976c70e7f53a2deb66f38edb7696571b071455daa2eaa708"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Introduction\n",
"\n",
"\"\"\"This project is a corpus study of the linguistic differences between native\n",
"speakers and second-language speakers (and, later, heritage speakers...).\"\"\"\n",
"\n",
"\n",
"## Acquire the data for the native speakers: download the CLAE corpus archive\n",
"## and unpack it into the current working directory.\n",
"import io\n",
"import zipfile\n",
"\n",
"import requests\n",
"\n",
"zip_file_url = 'http://lenguajeacademico.info/proyecto/corpus_CLAE.zip'\n",
"\n",
"# requests waits indefinitely by default; the timeout (seconds) stops a stalled\n",
"# server from hanging the notebook.\n",
"r = requests.get(zip_file_url, timeout=60)\n",
"# Stop on a 4xx/5xx reply instead of handing an error page to ZipFile.\n",
"r.raise_for_status()\n",
"\n",
"# r.content is raw bytes, so it needs a binary buffer. io.BytesIO exists on\n",
"# Python 2 and 3, unlike the Python-2-only StringIO module used before.\n",
"# The with-block closes the archive once extraction has finished.\n",
"with zipfile.ZipFile(io.BytesIO(r.content)) as z:\n",
"    z.extractall()\n",
"\n",
"%cd corpus_CLAE/\n",
"%ls"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Users/christopherstewart/Desktop/sp_heritage/2/corpus_CLAE\n",
"CLgMg1.txt CLtEn1105.txt CLtEn813d.txt CLtRs1101.txt\r\n",
"CLgMg10.txt CLtEn1150.txt CLtEn814a.doc.txt CLtRs1103.txt\r\n",
"CLgMg11.txt CLtEn1151.txt CLtEn814b.txt CLtRs1104.txt\r\n",
"CLgMg1104.txt CLtEn1152.txt CLtEn814c.txt CLtRs1105.txt\r\n",
"CLgMg12.txt CLtEn1153.txt CLtEn814d.txt CLtRs512.txt\r\n",
"CLgMg14.txt CLtEn1154.txt CLtEn850a.txt CLtRs550.txt\r\n",
"CLgMg15.txt CLtEn1200b.txt CLtEn850b.txt CLtRs551.txt\r\n",
"CLgMg1500.txt CLtEn1200c.txt CLtEn851a.txt CLtRs552.txt\r\n",
"CLgMg1501.txt CLtEn1201b.txt CLtEn852a.txt CLtRs553.txt\r\n",
"CLgMg1502.txt CLtEn1201c.txt CLtEn852b.txt CLtRs554.txt\r\n",
"CLgMg1503.txt CLtEn1202b.txt CLtEn853a.txt CLtRs555.txt\r\n",
"CLgMg1504.txt CLtEn1202c.txt CLtEn853b.txt CLtRs556.txt\r\n",
"CLgMg1505.txt CLtEn1203b.txt CLtEn854a.txt CLtRs557.txt\r\n",
"CLgMg1506.txt CLtEn1204a.txt CLtEn854b.txt CLtRs558.txt\r\n",
"CLgMg1507.txt CLtEn1204b.txt CLtEn855a.txt CLtRs559.txt\r\n",
"CLgMg18.txt CLtEn1204c.txt CLtEn855b.txt CLtRs560.txt\r\n",
"CLgMg20.txt CLtEn1205a.txt CLtEn856b.txt CLtRs561.txt\r\n",
"CLgMg21.txt CLtEn1205b.txt CLtEn857a.txt CLtRs562.txt\r\n",
"CLgMg22.txt CLtEn1205c.txt CLtEn857b.txt CLtRs563.txt\r\n",
"CLgMg301.txt CLtEn1206a.txt CLtEn858a.txt CLtRs564.txt\r\n",
"CLgMg302.txt CLtEn1206b.txt CLtEn858b.txt CLtRs565.txt\r\n",
"CLgMg303.txt CLtEn1206c.txt CLtEn859a.txt CLtRs650.txt\r\n",
"CLgMg304.txt CLtEn1207a.txt CLtEn859b.txt CLtRs952.txt\r\n",
"CLgMg305.txt CLtEn1207b.txt CLtEn860a.txt MGEn1.txt\r\n",
"CLgMg306.txt CLtEn1207c.txt CLtEn860b.txt MGEn10.txt\r\n",
"CLgMg307.txt CLtEn2.txt CLtEn861a.txt MGEn11.txt\r\n",
"CLgMg308.txt CLtEn3.txt CLtEn862a.txt MGEn12.txt\r\n",
"CLgMg309.txt CLtEn500.txt CLtEn862b.txt MGEn13.txt\r\n",
"CLgMg310.txt CLtEn501.txt CLtEn863a.txt MGEn14.txt\r\n",
"CLgMg4.txt CLtEn502aa.txt CLtEn864a.txt MGEn15.txt\r\n",
"CLgMg6.txt CLtEn502bba.txt CLtEn864b.txt MGEn16.txt\r\n",
"CLgMg8.txt CLtEn502bbb.txt CLtEn865a.txt MGEn17.txt\r\n",
"CLgMg9.txt CLtEn502bbc.txt CLtEn865b.txt MGEn18.txt\r\n",
"CLiPr10a.txt CLtEn502bbd.txt CLtEn866a.txt MGEn19.txt\r\n",
"CLiPr10b.txt CLtEn503.txt CLtEn866b.txt MGEn2.txt\r\n",
"CLiPr11a.txt CLtEn505aa.txt CLtEn867a.txt MGEn20.txt\r\n",
"CLiPr11b.txt CLtEn505bba.txt CLtEn867b.txt MGEn3.txt\r\n",
"CLiPr12a.txt CLtEn505bbb.txt CLtEn900a.txt MGEn4.txt\r\n",
"CLiPr12b.txt CLtEn506aa.txt CLtEn900b.txt MGEn5.txt\r\n",
"CLiPr13a.txt CLtEn506bb.txt CLtEn901a.txt MGEn6.txt\r\n",
"CLiPr13aa.txt CLtEn507.txt CLtEn901b.txt MGEn8.txt\r\n",
"CLiPr13b.txt CLtEn508.txt CLtEn902b.txt MGEn9.txt\r\n",
"CLiPr13bb.txt CLtEn509.txt CLtEn903a.txt MGRm1.txt\r\n",
"CLiPr14a.txt CLtEn510aa.txt CLtEn903b.txt MGRm10.txt\r\n",
"CLiPr14b.txt CLtEn510bba.txt CLtEn905a.txt MGRm11.txt\r\n",
"CLiPr15a.txt CLtEn510bbb.txt CLtEn905b.txt MGRm12.txt\r\n",
"CLiPr15b.txt CLtEn511.txt CLtEn906a.txt MGRm13.txt\r\n",
"CLiPr16a.txt CLtEn512aa.txt CLtEn906b.txt MGRm14.txt\r\n",
"CLiPr16b.txt CLtEn512bb.txt CLtEn950a.txt MGRm15.txt\r\n",
"CLiPr17a.txt CLtEn553a.txt CLtEn950b.txt MGRm16.txt\r\n",
"CLiPr17aa.txt CLtEn553b.txt CLtEn951a.txt MGRm17.txt\r\n",
"CLiPr17b.txt CLtEn553c.txt CLtEn951b.txt MGRm18.txt\r\n",
"CLiPr17bb.txt CLtEn553d.txt CLtEn952aaa.txt MGRm19.txt\r\n",
"CLiPr18a.txt CLtEn555.txt CLtEn952aab.txt MGRm2.txt\r\n",
"CLiPr18b.txt CLtEn563a.txt CLtEn952bb.txt MGRm20.txt\r\n",
"CLiPr19a.txt CLtEn563b.txt CLtEn953.txt MGRm3.txt\r\n",
"CLiPr19b.txt CLtEn563c.txt CLtEn953a.txt MGRm4.txt\r\n",
"CLiPr1a.txt CLtEn563d.txt CLtEn954a.txt MGRm5.txt\r\n",
"CLiPr1aa.txt CLtEn600.txt CLtEn954b.txt MGRm6.txt\r\n",
"CLiPr1b.txt CLtEn601.txt CLtEn955a.txt MGRm7.txt\r\n",
"CLiPr1bb.txt CLtEn602.txt CLtEn955b.txt MGRm8.txt\r\n",
"CLiPr20a.txt CLtEn603.txt CLtEn956a.txt MGRm9.txt\r\n",
"CLiPr20b.txt CLtEn604aa.txt CLtEn956b.txt MGRp1.txt\r\n",
"CLiPr21a.txt CLtEn604bb.txt CLtEn957a.txt MGRp2.txt\r\n",
"CLiPr21aa.txt CLtEn605.txt CLtEn957b.txt MGRp3.txt\r\n",
"CLiPr21b.txt CLtEn606.txt CLtMg10.txt MGRp4.txt\r\n",
"CLiPr21bb.txt CLtEn607.txt CLtMg100.txt MGRp5.txt\r\n",
"CLiPr2a.txt CLtEn650aa.txt CLtMg101.txt MGRp6.txt\r\n",
"CLiPr2aa.txt CLtEn650bb.txt CLtMg102.txt MGRp8.txt\r\n",
"CLiPr2b.txt CLtEn650cca.txt CLtMg103.txt MGRp9.txt\r\n",
"CLiPr2bb.txt CLtEn650ccb.txt CLtMg104.txt MHEn1.txt\r\n",
"CLiPr3a.txt CLtEn650ccc.txt CLtMg105.txt MHEn11.txt\r\n",
"CLiPr3b.txt CLtEn650ccd.txt CLtMg106.txt MHEn12.txt\r\n",
"CLiPr4a.txt CLtEn651aa.txt CLtMg107.txt MHEn3.txt\r\n",
"CLiPr4aa.txt CLtEn651bba.txt CLtMg108.txt MHEn5.txt\r\n",
"CLiPr4b.txt CLtEn651bbb.txt CLtMg109.txt MHEn7.txt\r\n",
"CLiPr4bb.txt CLtEn652.txt CLtMg110.txt MHEn8.txt\r\n",
"CLiPr51a.txt CLtEn653.txt CLtMg111.txt MHEn9.txt\r\n",
"CLiPr51b.txt CLtEn654.txt CLtMg112.txt MHPr1.txt\r\n",
"CLiPr52a.txt CLtEn655.txt CLtMg113.txt MHPr10.txt\r\n",
"CLiPr52b.txt CLtEn700.txt CLtMg114.txt MHPr11.txt\r\n",
"CLiPr53a.txt CLtEn701.txt CLtMg115.txt MHPr12.txt\r\n",
"CLiPr53b.txt CLtEn702aa.txt CLtMg116.txt MHPr13.txt\r\n",
"CLiPr54a.txt CLtEn702bba.txt CLtMg117.txt MHPr14.txt\r\n",
"CLiPr54b.txt CLtEn702bbb.txt CLtMg118.txt MHPr2.txt\r\n",
"CLiPr55a.txt CLtEn703.txt CLtMg119.txt MHPr3.txt\r\n",
"CLiPr55b.txt CLtEn704.txt CLtMg12.txt MHPr4.txt\r\n",
"CLiPr56a.txt CLtEn750a.txt CLtMg120.txt MHPr5.txt\r\n",
"CLiPr56b.txt CLtEn750b.txt CLtMg121.txt MHPr6.txt\r\n",
"CLiPr57a.txt CLtEn750c.txt CLtMg122.txt MHPr7.txt\r\n",
"CLiPr57b.txt CLtEn751a.txt CLtMg123.txt MHPr8.txt\r\n",
"CLiPr58a.txt CLtEn751b.txt CLtMg124.txt MHPr9.txt\r\n",
"CLiPr58b.txt CLtEn751c.txt CLtMg13.txt MHRm1.txt\r\n",
"CLiPr59a.txt CLtEn752a.txt CLtMg14.txt MHRm11.txt\r\n",
"CLiPr59b.txt CLtEn752b.txt CLtMg18.txt MHRm12.txt\r\n",
"CLiPr5a.txt CLtEn752c.txt CLtMg201.txt MHRm13.txt\r\n",
"CLiPr5aa.txt CLtEn753a.txt CLtMg202.txt MHRm14.txt\r\n",
"CLiPr5b.txt CLtEn753b.txt CLtMg203.txt MHRm15.txt\r\n",
"CLiPr5bb.txt CLtEn753c.txt CLtMg204.txt MHRm17.txt\r\n",
"CLiPr60a.txt CLtEn754aaa.txt CLtMg205.txt MHRm18.txt\r\n",
"CLiPr60b.txt CLtEn754aab.txt CLtMg206.txt MHRm19.txt\r\n",
"CLiPr61a.txt CLtEn754aac.txt CLtMg207.txt MHRm2.txt\r\n",
"CLiPr61b.txt CLtEn754bba.txt CLtMg208.txt MHRm20.txt\r\n",
"CLiPr62a.txt CLtEn754bbb.txt CLtMg209.txt MHRm21.txt\r\n",
"CLiPr62b.txt CLtEn754cca.txt CLtMg210.txt MHRm22.txt\r\n",
"CLiPr63a.txt CLtEn755a.txt CLtMg211.txt MHRm23.txt\r\n",
"CLiPr63b.txt CLtEn755b.txt CLtMg212.txt MHRm24.txt\r\n",
"CLiPr64a.txt CLtEn755c.txt CLtMg31.txt MHRm25.txt\r\n",
"CLiPr64b.txt CLtEn756a.txt CLtMg401.txt MHRm26.txt\r\n",
"CLiPr65a.txt CLtEn756b.txt CLtMg402.txt MHRm3.txt\r\n",
"CLiPr65b.txt CLtEn757a.txt CLtMg403.txt MHRm4.txt\r\n",
"CLiPr66a.txt CLtEn757b.txt CLtMg5.txt MHRm5.txt\r\n",
"CLiPr66b.txt CLtEn757c.txt CLtMg64.txt MHRm6.txt\r\n",
"CLiPr67a.txt CLtEn799a.txt CLtMg66.txt MHRm7.txt\r\n",
"CLiPr67b.txt CLtEn799b.txt CLtMg69.txt MHRm8.txt\r\n",
"CLiPr68a.txt CLtEn799c.txt CLtMg71.txt MHRm9.txt\r\n",
"CLiPr68b.txt CLtEn799d.txt CLtMg9.txt MLtEn1.txt\r\n",
"CLiPr69a.txt CLtEn800aaa.txt CLtPr10.txt MLtEn10.txt\r\n",
"CLiPr69b.txt CLtEn800aab.txt CLtPr100.txt MLtEn11.txt\r\n",
"CLiPr6a.txt CLtEn800aad.txt CLtPr101.txt MLtEn12.txt\r\n",
"CLiPr6b.txt CLtEn800bb.txt CLtPr102.txt MLtEn13.txt\r\n",
"CLiPr70a.txt CLtEn801a.txt CLtPr103.txt MLtEn14.txt\r\n",
"CLiPr70b.txt CLtEn801b.txt CLtPr104.txt MLtEn15.txt\r\n",
"CLiPr71a.txt CLtEn801d.txt CLtPr105.txt MLtEn16.txt\r\n",
"CLiPr71b.txt CLtEn802a.txt CLtPr106.txt MLtEn17.txt\r\n",
"CLiPr72a.txt CLtEn802b.txt CLtPr107.txt MLtEn18.txt\r\n",
"CLiPr72b.txt CLtEn802c.txt CLtPr108.txt MLtEn19.txt\r\n",
"CLiPr7a.txt CLtEn802d.txt CLtPr109.txt MLtEn2.txt\r\n",
"CLiPr7b.txt CLtEn803a.txt CLtPr110.txt MLtEn20.txt\r\n",
"CLiPr8a.txt CLtEn803b.txt CLtPr111.txt MLtEn21.txt\r\n",
"CLiPr8b.txt CLtEn803c.txt CLtPr112.txt MLtEn22.txt\r\n",
"CLiPr9a.txt CLtEn803d.txt CLtPr113.txt MLtEn23.txt\r\n",
"CLiPr9b.txt CLtEn804a.txt CLtPr114.txt MLtEn24.txt\r\n",
"CLiRs10.txt CLtEn804b.txt CLtPr115.txt MLtEn25.txt\r\n",
"CLiRs11.txt CLtEn804c.txt CLtPr116.txt MLtEn26.txt\r\n",
"CLiRs12.txt CLtEn804d.txt CLtPr117.txt MLtEn3.txt\r\n",
"CLiRs13.txt CLtEn805a.txt CLtPr118.txt MLtEn4.txt\r\n",
"CLiRs14.txt CLtEn805b.txt CLtPr119.txt MLtEn5.txt\r\n",
"CLiRs15.txt CLtEn805c.txt CLtPr12.txt MLtEn6.txt\r\n",
"CLiRs16.txt CLtEn805d.txt CLtPr120.txt MLtEn7.txt\r\n",
"CLiRs17.txt CLtEn806a.txt CLtPr121.txt MLtEn8.txt\r\n",
"CLiRs18.txt CLtEn806b.txt CLtPr122.txt MLtEn9.txt\r\n",
"CLiRs19.txt CLtEn806c.txt CLtPr123.txt MLtPr1.txt\r\n",
"CLiRs2.txt CLtEn806d.txt CLtPr124.txt MLtPr10.txt\r\n",
"CLiRs20.txt CLtEn807a.txt CLtPr13.txt MLtPr11.txt\r\n",
"CLiRs21.txt CLtEn807b.txt CLtPr14.txt MLtPr12.txt\r\n",
"CLiRs3.txt CLtEn807c.txt CLtPr18.txt MLtPr13.txt\r\n",
"CLiRs4.txt CLtEn807d.txt CLtPr2.txt MLtPr14.txt\r\n",
"CLiRs5.txt CLtEn808a.txt CLtPr201.txt MLtPr15.txt\r\n",
"CLiRs6.txt CLtEn808b.txt CLtPr202.txt MLtPr2.txt\r\n",
"CLiRs7.txt CLtEn808c.txt CLtPr203.txt MLtPr3.txt\r\n",
"CLiRs8.txt CLtEn808d.txt CLtPr204.txt MLtPr4.txt\r\n",
"CLiRs9.txt CLtEn809a.txt CLtPr205.txt MLtPr5.txt\r\n",
"CLtEn1.txt CLtEn809b.txt CLtPr206.txt MLtPr6.txt\r\n",
"CLtEn1000a.txt CLtEn809c.txt CLtPr207.txt MLtPr7.txt\r\n",
"CLtEn1001a.txt CLtEn809d.txt CLtPr208.txt MLtPr8.txt\r\n",
"CLtEn1001b.txt CLtEn810a.txt CLtPr209.txt MLtPr9.txt\r\n",
"CLtEn1002a.txt CLtEn810b.txt CLtPr210.txt MLtRes1.txt\r\n",
"CLtEn1002b.txt CLtEn810c.txt CLtPr211.txt MLtRes10.txt\r\n",
"CLtEn1003a.txt CLtEn810d.txt CLtPr212.txt MLtRes11.txt\r\n",
"CLtEn1003b.txt CLtEn811a.txt CLtPr31.txt MLtRes2.txt\r\n",
"CLtEn1004a.txt CLtEn811b.txt CLtPr401.txt MLtRes3.txt\r\n",
"CLtEn1004b.txt CLtEn811c.txt CLtPr402.txt MLtRes4.txt\r\n",
"CLtEn1050.txt CLtEn811d.txt CLtPr403.txt MLtRes5.txt\r\n",
"CLtEn1051.txt CLtEn812a.txt CLtPr5.txt MLtRes6.txt\r\n",
"CLtEn1052.txt CLtEn812b.txt CLtPr64.txt MLtRes7.txt\r\n",
"CLtEn1053.txt CLtEn812c.txt CLtPr66.txt MLtRes8.txt\r\n",
"CLtEn1100.txt CLtEn812d.txt CLtPr69.txt MLtRes9.txt\r\n",
"CLtEn1101.txt CLtEn813a.txt CLtPr71.txt\r\n",
"CLtEn1102.txt CLtEn813b.txt CLtPr9.txt\r\n",
"CLtEn1104.txt CLtEn813c.txt CLtRs1100.txt\r\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import glob\n",
"from chardet.universaldetector import UniversalDetector\n",
"\n",
"# Print chardet's encoding guess for every .txt file in the working directory.\n",
"# A single detector is reused; reset() clears its state before each file.\n",
"detector = UniversalDetector()\n",
"# sorted() keeps the report order independent of the filesystem.\n",
"for filename in sorted(glob.glob('*.txt')):\n",
"    detector.reset()\n",
"    # The with-block closes each handle; the previous file(...) call left one\n",
"    # open per corpus file and only exists on Python 2.\n",
"    with open(filename, 'rb') as raw:\n",
"        for line in raw:\n",
"            detector.feed(line)\n",
"            # Stop reading as soon as the detector reports it is done.\n",
"            if detector.done:\n",
"                break\n",
"    detector.close()\n",
"    # One print call with a single argument behaves the same as a Python 2\n",
"    # statement and as a Python 3 function; the printed text is unchanged.\n",
"    print('%s %s' % (filename.ljust(60), detector.result))\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'ConcatenatedCorpusView' object is not callable",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-19-a76da12e7ca2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m L_toks = [(l, w)\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ml\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'C'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'M'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m for w in corp(categories=l)]\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'ConcatenatedCorpusView' object is not callable"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"\n",
"from chardet.universaldetector import UniversalDetector\n",
"from nltk.corpus.reader import CategorizedPlaintextCorpusReader\n",
"\n",
"CORPUS_ROOT = '.'\n",
"FILE_PATTERN = r'.*\\.txt'\n",
"# NLTK applies cat_pattern to each file id and uses its first group as the\n",
"# category label, so files are labelled 'C' or 'M'.\n",
"CATEGORY_PATTERN = r'(C|M)'\n",
"\n",
"\n",
"def detect_encoding(path, default='utf8'):\n",
"    \"\"\"Return chardet's guess for the text encoding of the file at `path`.\n",
"\n",
"    The file is fed to the detector line by line until the detector reports\n",
"    it is done or the file ends. `default` is returned when chardet has no\n",
"    guess (its result holds None).\n",
"    \"\"\"\n",
"    detector = UniversalDetector()\n",
"    with open(path, 'rb') as raw:\n",
"        for line in raw:\n",
"            detector.feed(line)\n",
"            if detector.done:\n",
"                break\n",
"    detector.close()\n",
"    return detector.result['encoding'] or default\n",
"\n",
"\n",
"def build_reader(**reader_options):\n",
"    \"\"\"Create the categorized plaintext reader for the corpus directory.\"\"\"\n",
"    return CategorizedPlaintextCorpusReader(\n",
"        CORPUS_ROOT, FILE_PATTERN, cat_pattern=CATEGORY_PATTERN, **reader_options)\n",
"\n",
"\n",
"# Reading every file with NLTK's default utf8 failed on this corpus with\n",
"# UnicodeDecodeError (\"can't decode byte 0xfe in position 0\"). That byte is\n",
"# presumably a UTF-16 byte-order mark -- TODO confirm. The reader therefore\n",
"# gets one detected encoding per file id instead of a single global one.\n",
"file_encodings = {\n",
"    fileid: detect_encoding(os.path.join(CORPUS_ROOT, fileid))\n",
"    for fileid in build_reader().fileids()\n",
"}\n",
"clae_reader = build_reader(encoding=file_encodings)\n",
"\n",
"# Lower-cased alphabetic tokens of the whole corpus. words() is iterated\n",
"# directly: wrapping it in nltk.text.Text first only copied every token into\n",
"# an extra list.\n",
"corp = [w.lower() for w in clae_reader.words() if w.isalpha()]\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (category, token) pairs for both groups. The categories live on the reader,\n",
"# so its words() is queried per category; `corp` is a plain list of tokens\n",
"# and cannot be called.\n",
"L_toks = [(category, token)\n",
"          for category in ['C', 'M']\n",
"          for token in clae_reader.words(categories=category)]"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment