Skip to content

Instantly share code, notes, and snippets.

@quaquel
Created September 15, 2015 18:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quaquel/bc88d8ada612737f4755 to your computer and use it in GitHub Desktop.
Save quaquel/bc88d8ada612737f4755 to your computer and use it in GitHub Desktop.
lemmatization example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import collections\n",
"import re\n",
"\n",
"import nltk \n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"stopwords = nltk.corpus.stopwords.words('English')\n",
"stopwords += ['without','also']\n",
"wordnet_lemmatizer = WordNetLemmatizer() \n",
"\n",
"def make_sentences(text):\n",
" '''\n",
" Takes a sentence and returns a list of chunks. Basically, any punctuation \n",
" mark or number is an internal sentence seperator. Should be used with NLTK tokenizer.\n",
" \n",
" '''\n",
" \n",
" #matches anything which is not words and whitespace or dash\n",
" p = re.compile(r'[^a-zA-Z\\s\\-]') \n",
" \n",
" # force text to lower case\n",
" text = text.lower()\n",
" \n",
" # split text into parts based on regular\n",
" # expresion\n",
" sentences = re.split(p, text)\n",
"\n",
" keep_sentences = []\n",
" for sentence in sentences:\n",
" sentence = sentence.strip()\n",
" if len(sentence) > 1:\n",
" keep_sentences.append(sentence)\n",
" \n",
" return keep_sentences\n",
"\n",
"def count_words(sentence):\n",
" '''return a counter with word frequence for sentence'''\n",
" \n",
" counter = collections.Counter()\n",
" sentence = sentence.split(' ')\n",
" \n",
" for word in sentence:\n",
" if word not in stopwords:\n",
" counter[word] +=1\n",
" \n",
" return counter"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open('./data/worldbank titles.txt') as fh:\n",
" titles = fh.readlines() "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"global_counter = collections.Counter()\n",
"for i, title in enumerate(titles):\n",
" title = title.strip()\n",
" sentences = make_sentences(title)\n",
" for sentence in sentences:\n",
" global_counter += count_words(sentence)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"freqs = pd.Series(global_counter)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"freqs.to_csv('./data/worldbank.csv')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"adaptation 1\n",
"adapting 1\n",
"analysis 5\n",
"approaches 1\n",
"assessment 1\n",
"balanced 1\n",
"bangladesh 1\n",
"benefit 1\n",
"change 1\n",
"changing 1\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freqs[0:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment