Skip to content

Instantly share code, notes, and snippets.

@thunderock
Last active August 1, 2020 23:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thunderock/0b342161f06e6e0e37f072702e428916 to your computer and use it in GitHub Desktop.
Save thunderock/0b342161f06e6e0e37f072702e428916 to your computer and use it in GitHub Desktop.
data preprocessing
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: DATA/1014_4361_bundle_archive.zip\n",
" inflating: DATA/ner.csv \n",
" inflating: DATA/ner_dataset.csv \n"
]
}
],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"# -n: never overwrite files that already exist, so re-running this cell does not\n",
"# trigger unzip's interactive overwrite prompt.\n",
"!unzip -n DATA/1014_4361_bundle_archive.zip -d DATA/"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"b'Skipping line 281837: expected 25 fields, saw 34\\n'\n"
]
}
],
"source": [
"# error_bad_lines was deprecated in pandas 1.3 and removed in 2.0 in favour of\n",
"# on_bad_lines; try the current keyword first and fall back for older releases.\n",
"# Either way malformed rows are skipped with a warning instead of aborting the load.\n",
"try:\n",
"    df = pd.read_csv(\"DATA/ner.csv\", encoding=\"ISO-8859-1\", on_bad_lines=\"warn\")\n",
"except TypeError:  # pandas < 1.3 does not know on_bad_lines\n",
"    df = pd.read_csv(\"DATA/ner.csv\", encoding=\"ISO-8859-1\", error_bad_lines=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>lemma</th>\n",
" <th>next-lemma</th>\n",
" <th>next-next-lemma</th>\n",
" <th>next-next-pos</th>\n",
" <th>next-next-shape</th>\n",
" <th>next-next-word</th>\n",
" <th>next-pos</th>\n",
" <th>next-shape</th>\n",
" <th>next-word</th>\n",
" <th>...</th>\n",
" <th>prev-prev-lemma</th>\n",
" <th>prev-prev-pos</th>\n",
" <th>prev-prev-shape</th>\n",
" <th>prev-prev-word</th>\n",
" <th>prev-shape</th>\n",
" <th>prev-word</th>\n",
" <th>sentence_idx</th>\n",
" <th>shape</th>\n",
" <th>word</th>\n",
" <th>tag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>thousand</td>\n",
" <td>of</td>\n",
" <td>demonstr</td>\n",
" <td>NNS</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>IN</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>...</td>\n",
" <td>__start2__</td>\n",
" <td>__START2__</td>\n",
" <td>wildcard</td>\n",
" <td>__START2__</td>\n",
" <td>wildcard</td>\n",
" <td>__START1__</td>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>Thousands</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>of</td>\n",
" <td>demonstr</td>\n",
" <td>have</td>\n",
" <td>VBP</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>NNS</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>...</td>\n",
" <td>__start1__</td>\n",
" <td>__START1__</td>\n",
" <td>wildcard</td>\n",
" <td>__START1__</td>\n",
" <td>capitalized</td>\n",
" <td>Thousands</td>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>demonstr</td>\n",
" <td>have</td>\n",
" <td>march</td>\n",
" <td>VBN</td>\n",
" <td>lowercase</td>\n",
" <td>marched</td>\n",
" <td>VBP</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>...</td>\n",
" <td>thousand</td>\n",
" <td>NNS</td>\n",
" <td>capitalized</td>\n",
" <td>Thousands</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>have</td>\n",
" <td>march</td>\n",
" <td>through</td>\n",
" <td>IN</td>\n",
" <td>lowercase</td>\n",
" <td>through</td>\n",
" <td>VBN</td>\n",
" <td>lowercase</td>\n",
" <td>marched</td>\n",
" <td>...</td>\n",
" <td>of</td>\n",
" <td>IN</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>march</td>\n",
" <td>through</td>\n",
" <td>london</td>\n",
" <td>NNP</td>\n",
" <td>capitalized</td>\n",
" <td>London</td>\n",
" <td>IN</td>\n",
" <td>lowercase</td>\n",
" <td>through</td>\n",
" <td>...</td>\n",
" <td>demonstr</td>\n",
" <td>NNS</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>marched</td>\n",
" <td>O</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 lemma next-lemma next-next-lemma next-next-pos \\\n",
"0 0 thousand of demonstr NNS \n",
"1 1 of demonstr have VBP \n",
"2 2 demonstr have march VBN \n",
"3 3 have march through IN \n",
"4 4 march through london NNP \n",
"\n",
" next-next-shape next-next-word next-pos next-shape next-word ... \\\n",
"0 lowercase demonstrators IN lowercase of ... \n",
"1 lowercase have NNS lowercase demonstrators ... \n",
"2 lowercase marched VBP lowercase have ... \n",
"3 lowercase through VBN lowercase marched ... \n",
"4 capitalized London IN lowercase through ... \n",
"\n",
" prev-prev-lemma prev-prev-pos prev-prev-shape prev-prev-word prev-shape \\\n",
"0 __start2__ __START2__ wildcard __START2__ wildcard \n",
"1 __start1__ __START1__ wildcard __START1__ capitalized \n",
"2 thousand NNS capitalized Thousands lowercase \n",
"3 of IN lowercase of lowercase \n",
"4 demonstr NNS lowercase demonstrators lowercase \n",
"\n",
" prev-word sentence_idx shape word tag \n",
"0 __START1__ 1.0 capitalized Thousands O \n",
"1 Thousands 1.0 lowercase of O \n",
"2 of 1.0 lowercase demonstrators O \n",
"3 demonstrators 1.0 lowercase have O \n",
"4 have 1.0 lowercase marched O \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',\n",
" 'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',\n",
" 'next-word', 'pos', 'prev-iob', 'prev-lemma', 'prev-pos',\n",
" 'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',\n",
" 'prev-prev-word', 'prev-shape', 'prev-word', 'sentence_idx', 'shape',\n",
" 'word', 'tag'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the columns needed downstream. Selecting by name (instead of dropping\n",
"# the other 21) makes the cell safe to re-run: a second drop would raise KeyError.\n",
"df = df[['sentence_idx', 'shape', 'word', 'tag']]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1050795 entries, 0 to 1050794\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sentence_idx 1050794 non-null float64\n",
" 1 shape 1050794 non-null object \n",
" 2 word 1050794 non-null object \n",
" 3 tag 1050794 non-null object \n",
"dtypes: float64(1), object(3)\n",
"memory usage: 32.1+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_idx</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.050794e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.898184e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.576237e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>5.997000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.201700e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>3.592600e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>4.795900e+04</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_idx\n",
"count 1.050794e+06\n",
"mean 1.898184e+04\n",
"std 1.576237e+04\n",
"min 1.000000e+00\n",
"25% 5.997000e+03\n",
"50% 1.201700e+04\n",
"75% 3.592600e+04\n",
"max 4.795900e+04"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_idx</th>\n",
" <th>shape</th>\n",
" <th>word</th>\n",
" <th>tag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>Thousands</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>marched</td>\n",
" <td>O</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_idx shape word tag\n",
"0 1.0 capitalized Thousands O\n",
"1 1.0 lowercase of O\n",
"2 1.0 lowercase demonstrators O\n",
"3 1.0 lowercase have O\n",
"4 1.0 lowercase marched O"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1050794\n",
"unique 17\n",
"top O\n",
"freq 889973\n",
"Name: tag, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['tag'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',\n",
" 'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',\n",
" 'I-eve', 'I-nat', nan], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tag.unique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence_idx</th>\n",
" <th>shape</th>\n",
" <th>word</th>\n",
" <th>tag</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>Thousands</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>demonstrators</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>have</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>marched</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>through</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>London</td>\n",
" <td>B-geo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>to</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>protest</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>war</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>in</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>Iraq</td>\n",
" <td>B-geo</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>and</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>demand</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>withdrawal</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1.0</td>\n",
" <td>capitalized</td>\n",
" <td>British</td>\n",
" <td>B-gpe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>troops</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>from</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>that</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1.0</td>\n",
" <td>lowercase</td>\n",
" <td>country</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>1.0</td>\n",
" <td>punct</td>\n",
" <td>.</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>2.0</td>\n",
" <td>capitalized</td>\n",
" <td>Families</td>\n",
" <td>O</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sentence_idx shape word tag\n",
"0 1.0 capitalized Thousands O\n",
"1 1.0 lowercase of O\n",
"2 1.0 lowercase demonstrators O\n",
"3 1.0 lowercase have O\n",
"4 1.0 lowercase marched O\n",
"5 1.0 lowercase through O\n",
"6 1.0 capitalized London B-geo\n",
"7 1.0 lowercase to O\n",
"8 1.0 lowercase protest O\n",
"9 1.0 lowercase the O\n",
"10 1.0 lowercase war O\n",
"11 1.0 lowercase in O\n",
"12 1.0 capitalized Iraq B-geo\n",
"13 1.0 lowercase and O\n",
"14 1.0 lowercase demand O\n",
"15 1.0 lowercase the O\n",
"16 1.0 lowercase withdrawal O\n",
"17 1.0 lowercase of O\n",
"18 1.0 capitalized British B-gpe\n",
"19 1.0 lowercase troops O\n",
"20 1.0 lowercase from O\n",
"21 1.0 lowercase that O\n",
"22 1.0 lowercase country O\n",
"23 1.0 punct . O\n",
"24 2.0 capitalized Families O"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(25)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Map every sentence id to the row position where it first appears.\n",
"# Rows whose sentence_idx is missing are excluded so that NaN never becomes a\n",
"# dictionary key (a NaN key also makes min()/max() over the keys order-dependent).\n",
"sentence_ids = df['sentence_idx'].reset_index(drop=True)\n",
"first_rows = sentence_ids.dropna().drop_duplicates(keep='first')\n",
"total_sentences = dict(zip(first_rows.tolist(), first_rows.index.tolist()))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(35178, 1.0, 47959.0)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct sentence ids, plus the smallest and largest id seen.\n",
"# Iterating a dict yields its keys, so no explicit .keys() call is needed.\n",
"len(total_sentences), min(total_sentences), max(total_sentences)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('O', 'have', 'lowercase', 1.0)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check the row at position 3: its tag, word, shape and sentence id.\n",
"# 'shape' is read with [] because attribute access would return Series.shape.\n",
"row = df.iloc[3]\n",
"row.tag, row.word, row['shape'], row.sentence_idx"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████▉| 35177/35178 [07:02<00:00, 83.28it/s] \n"
]
}
],
"source": [
"# Rebuild each sentence as a list of (word, shape, tag) tuples, starting at the\n",
"# first row recorded for its id and walking forward while the id stays the same.\n",
"# The columns are pulled out as arrays once: calling df.iloc[idx] several times\n",
"# per token is what made the previous version take minutes.\n",
"id_column = df['sentence_idx'].to_numpy()\n",
"word_column = df['word'].to_numpy()\n",
"shape_column = df['shape'].to_numpy()\n",
"tag_column = df['tag'].to_numpy()\n",
"row_count = len(df)\n",
"\n",
"final_sentences = []\n",
"for present_id, idx in tqdm(total_sentences.items()):\n",
"    sentence = []\n",
"    # Explicit bounds check instead of a bare except/break: df.iloc[idx] raises\n",
"    # IndexError once idx runs past the last row, and the old handler then left\n",
"    # the loop before appending that sentence, silently dropping it.\n",
"    while idx < row_count and id_column[idx] == present_id:\n",
"        sentence.append((word_column[idx], shape_column[idx], tag_column[idx]))\n",
"        idx += 1\n",
"    # A NaN sentence id never compares equal to itself and would produce an\n",
"    # empty sentence; do not store those.\n",
"    if sentence:\n",
"        final_sentences.append(sentence)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"35177"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(final_sentences)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Thousands', 'capitalized', 'O'),\n",
" ('of', 'lowercase', 'O'),\n",
" ('demonstrators', 'lowercase', 'O'),\n",
" ('have', 'lowercase', 'O'),\n",
" ('marched', 'lowercase', 'O'),\n",
" ('through', 'lowercase', 'O'),\n",
" ('London', 'capitalized', 'B-geo'),\n",
" ('to', 'lowercase', 'O'),\n",
" ('protest', 'lowercase', 'O'),\n",
" ('the', 'lowercase', 'O'),\n",
" ('war', 'lowercase', 'O'),\n",
" ('in', 'lowercase', 'O'),\n",
" ('Iraq', 'capitalized', 'B-geo'),\n",
" ('and', 'lowercase', 'O'),\n",
" ('demand', 'lowercase', 'O'),\n",
" ('the', 'lowercase', 'O'),\n",
" ('withdrawal', 'lowercase', 'O'),\n",
" ('of', 'lowercase', 'O'),\n",
" ('British', 'capitalized', 'B-gpe'),\n",
" ('troops', 'lowercase', 'O'),\n",
" ('from', 'lowercase', 'O'),\n",
" ('that', 'lowercase', 'O'),\n",
" ('country', 'lowercase', 'O'),\n",
" ('.', 'punct', 'O')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_sentences[0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import pickle as pkl"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Use a context manager so the file is flushed and closed as soon as the dump\n",
"# finishes; the bare open() call never closed the handle explicitly.\n",
"with open('DATA/training_instances.pkl', 'wb') as pkl_file:\n",
"    pkl.dump(final_sentences, pkl_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment