quaquel/text_mining.ipynb

## text_mining.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import re\n",
    "\n",
    "import nltk \n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "stopwords = nltk.corpus.stopwords.words('English')\n",
    "stopwords += ['without','also']\n",
    "wordnet_lemmatizer = WordNetLemmatizer()    \n",
    "\n",
    "def make_sentences(text):\n",
    "    '''\n",
    "    Takes a sentence and returns a list of chunks. Basically, any punctuation \n",
    "    mark or number is an internal sentence seperator. Should be used with NLTK tokenizer.\n",
    "    \n",
    "    '''\n",
    "    \n",
    "    #matches anything which is not words and whitespace or dash\n",
    "    p = re.compile(r'[^a-zA-Z\\s\\-]') \n",
    "   \n",
    "    # force text to lower case\n",
    "    text = text.lower()\n",
    "    \n",
    "    # split text into parts based on regular\n",
    "    # expresion\n",
    "    sentences = re.split(p, text)\n",
    "\n",
    "    keep_sentences = []\n",
    "    for sentence in sentences:\n",
    "        sentence = sentence.strip()\n",
    "        if len(sentence) > 1:\n",
    "            keep_sentences.append(sentence)\n",
    "    \n",
    "    return keep_sentences\n",
    "\n",
    "def count_words(sentence):\n",
    "    '''return a counter with word frequence for sentence'''\n",
    "    \n",
    "    counter = collections.Counter()\n",
    "    sentence = sentence.split(' ')\n",
    "    \n",
    "    for word in sentence:\n",
    "        if word not in stopwords:\n",
    "            counter[word] +=1\n",
    "        \n",
    "    return counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open('./data/worldbank titles.txt') as fh:\n",
    "    titles = fh.readlines()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "global_counter = collections.Counter()\n",
    "for i, title in enumerate(titles):\n",
    "    title = title.strip()\n",
    "    sentences = make_sentences(title)\n",
    "    for sentence in sentences:\n",
    "        global_counter += count_words(sentence)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "freqs = pd.Series(global_counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "freqs.to_csv('./data/worldbank.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "adaptation    1\n",
       "adapting      1\n",
       "analysis      5\n",
       "approaches    1\n",
       "assessment    1\n",
       "balanced      1\n",
       "bangladesh    1\n",
       "benefit       1\n",
       "change        1\n",
       "changing      1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "freqs[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import collections\n",
	"import re\n",
	"\n",
	"import nltk \n",
	"from nltk.stem import WordNetLemmatizer\n",
	"\n",
	"stopwords = nltk.corpus.stopwords.words('English')\n",
	"stopwords += ['without','also']\n",
	"wordnet_lemmatizer = WordNetLemmatizer() \n",
	"\n",
	"def make_sentences(text):\n",
	" '''\n",
	" Takes a sentence and returns a list of chunks. Basically, any punctuation \n",
	" mark or number is an internal sentence seperator. Should be used with NLTK tokenizer.\n",
	" \n",
	" '''\n",
	" \n",
	" #matches anything which is not words and whitespace or dash\n",
	" p = re.compile(r'[^a-zA-Z\\s\\-]') \n",
	" \n",
	" # force text to lower case\n",
	" text = text.lower()\n",
	" \n",
	" # split text into parts based on regular\n",
	" # expresion\n",
	" sentences = re.split(p, text)\n",
	"\n",
	" keep_sentences = []\n",
	" for sentence in sentences:\n",
	" sentence = sentence.strip()\n",
	" if len(sentence) > 1:\n",
	" keep_sentences.append(sentence)\n",
	" \n",
	" return keep_sentences\n",
	"\n",
	"def count_words(sentence):\n",
	" '''return a counter with word frequence for sentence'''\n",
	" \n",
	" counter = collections.Counter()\n",
	" sentence = sentence.split(' ')\n",
	" \n",
	" for word in sentence:\n",
	" if word not in stopwords:\n",
	" counter[word] +=1\n",
	" \n",
	" return counter"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"with open('./data/worldbank titles.txt') as fh:\n",
	" titles = fh.readlines() "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"global_counter = collections.Counter()\n",
	"for i, title in enumerate(titles):\n",
	" title = title.strip()\n",
	" sentences = make_sentences(title)\n",
	" for sentence in sentences:\n",
	" global_counter += count_words(sentence)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"freqs = pd.Series(global_counter)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"freqs.to_csv('./data/worldbank.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"adaptation 1\n",
	"adapting 1\n",
	"analysis 5\n",
	"approaches 1\n",
	"assessment 1\n",
	"balanced 1\n",
	"bangladesh 1\n",
	"benefit 1\n",
	"change 1\n",
	"changing 1\n",
	"dtype: int64"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"freqs[0:10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}