Created
September 15, 2015 18:20
-
-
Save quaquel/bc88d8ada612737f4755 to your computer and use it in GitHub Desktop.
lemmatization example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import collections\n", | |
"import re\n", | |
"\n", | |
"import nltk \n", | |
"from nltk.stem import WordNetLemmatizer\n", | |
"\n", | |
"stopwords = nltk.corpus.stopwords.words('English')\n", | |
"stopwords += ['without','also']\n", | |
"wordnet_lemmatizer = WordNetLemmatizer() \n", | |
"\n", | |
"def make_sentences(text):\n", | |
" '''\n", | |
" Takes a sentence and returns a list of chunks. Basically, any punctuation \n", | |
" mark or number is an internal sentence separator. Should be used with NLTK tokenizer.\n", | |
" \n", | |
" '''\n", | |
" \n", | |
" # matches any single character that is not an ASCII letter, whitespace, or a dash\n", | |
" p = re.compile(r'[^a-zA-Z\\s\\-]') \n", | |
" \n", | |
" # force text to lower case\n", | |
" text = text.lower()\n", | |
" \n", | |
" # split text into parts based on regular\n", | |
" # expression\n", | |
" sentences = re.split(p, text)\n", | |
"\n", | |
" keep_sentences = []\n", | |
" for sentence in sentences:\n", | |
" sentence = sentence.strip()\n", | |
" if len(sentence) > 1:\n", | |
" keep_sentences.append(sentence)\n", | |
" \n", | |
" return keep_sentences\n", | |
"\n", | |
"def count_words(sentence):\n", | |
" '''return a counter with word frequencies for sentence'''\n", | |
" \n", | |
" counter = collections.Counter()\n", | |
" sentence = sentence.split(' ')\n", | |
" \n", | |
" for word in sentence:\n", | |
" if word not in stopwords:\n", | |
" counter[word] +=1\n", | |
" \n", | |
" return counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('./data/worldbank titles.txt') as fh:\n", | |
" titles = fh.readlines() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"global_counter = collections.Counter()\n", | |
"for i, title in enumerate(titles):\n", | |
" title = title.strip()\n", | |
" sentences = make_sentences(title)\n", | |
" for sentence in sentences:\n", | |
" global_counter += count_words(sentence)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"freqs = pd.Series(global_counter)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"freqs.to_csv('./data/worldbank.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"adaptation 1\n", | |
"adapting 1\n", | |
"analysis 5\n", | |
"approaches 1\n", | |
"assessment 1\n", | |
"balanced 1\n", | |
"bangladesh 1\n", | |
"benefit 1\n", | |
"change 1\n", | |
"changing 1\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"freqs[0:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment