itsuncheng/preprocessing.ipynb

## gistfile1.txt
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lowercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\n"
     ]
    }
   ],
   "source": [
    "text = \"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\"\n",
    "text = text.lower()\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Removing Punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\n"
     ]
    }
   ],
   "source": [
    "import string\n",
    "print(string.punctuation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife\n"
     ]
    }
   ],
   "source": [
    "text_p = \"\".join([char for char in text if char not in string.punctuation])\n",
    "print(text_p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']\n"
     ]
    }
   ],
   "source": [
    "from nltk import word_tokenize\n",
    "words = word_tokenize(text_p)\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stopword Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = stopwords.words('english')\n",
    "print(stop_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']\n"
     ]
    }
   ],
   "source": [
    "filtered_words = [word for word in words if word not in stop_words]\n",
    "print(filtered_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']\n"
     ]
    }
   ],
   "source": [
    "from nltk.stem.porter import PorterStemmer\n",
    "porter = PorterStemmer()\n",
    "stemmed = [porter.stem(word) for word in filtered_words]\n",
    "print(stemmed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## POS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]\n"
     ]
    }
   ],
   "source": [
    "from nltk import pos_tag\n",
    "pos = pos_tag(filtered_words)\n",
    "print(pos)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## preprocessing.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              preprocessing.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Import Library"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import nltk\n",
	"nltk.download()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Lowercase"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\n"
	]
	}
	],
	"source": [
	"text = \"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.\"\n",
	"text = text.lower()\n",
	"print(text)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Removing Punctuation"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{\|}~\n"
	]
	}
	],
	"source": [
	"import string\n",
	"print(string.punctuation)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife\n"
	]
	}
	],
	"source": [
	"text_p = \"\".join([char for char in text if char not in string.punctuation])\n",
	"print(text_p)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Word Tokenization"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']\n"
	]
	}
	],
	"source": [
	"from nltk import word_tokenize\n",
	"words = word_tokenize(text_p)\n",
	"print(words)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Stopword Filtering"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
	]
	}
	],
	"source": [
	"from nltk.corpus import stopwords\n",
	"stop_words = stopwords.words('english')\n",
	"print(stop_words)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']\n"
	]
	}
	],
	"source": [
	"filtered_words = [word for word in words if word not in stop_words]\n",
	"print(filtered_words)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Stemming"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']\n"
	]
	}
	],
	"source": [
	"from nltk.stem.porter import PorterStemmer\n",
	"porter = PorterStemmer()\n",
	"stemmed = [porter.stem(word) for word in filtered_words]\n",
	"print(stemmed)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## POS"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]\n"
	]
	}
	],
	"source": [
	"from nltk import pos_tag\n",
	"pos = pos_tag(filtered_words)\n",
	"print(pos)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}