amqdn/fourthbrain-tccc-classification.ipynb

## fourthbrain-tccc-classification.ipynb
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## Welcome to the Jigsaw Toxic Comment Classification Challenge","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport altair as alt","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2021-06-17T06:20:50.463156Z","iopub.execute_input":"2021-06-17T06:20:50.463656Z","iopub.status.idle":"2021-06-17T06:20:50.628071Z","shell.execute_reply.started":"2021-06-17T06:20:50.463557Z","shell.execute_reply":"2021-06-17T06:20:50.627068Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"import os\nos.chdir('../input/jigsaw-toxic-comment-classification-challenge/')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:50.629479Z","iopub.execute_input":"2021-06-17T06:20:50.629760Z","iopub.status.idle":"2021-06-17T06:20:50.633034Z","shell.execute_reply.started":"2021-06-17T06:20:50.629734Z","shell.execute_reply":"2021-06-17T06:20:50.632373Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"markdown","source":"We are provided `train.csv`, `test.csv`, and `test_labels.csv`. 
All of these files also happen to be `.zip`.","metadata":{}},{"cell_type":"code","source":"train = pd.read_csv('train.csv.zip')\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:50.634656Z","iopub.execute_input":"2021-06-17T06:20:50.635053Z","iopub.status.idle":"2021-06-17T06:20:52.750224Z","shell.execute_reply.started":"2021-06-17T06:20:50.635023Z","shell.execute_reply":"2021-06-17T06:20:52.749325Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"                 id                                       comment_text  toxic  \\\n0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0   \n1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      
0   \n\n   severe_toxic  obscene  threat  insult  identity_hate  \n0             0        0       0       0              0  \n1             0        0       0       0              0  \n2             0        0       0       0              0  \n3             0        0       0       0              0  \n4             0        0       0       0              0  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0000997932d777bf</td>\n      <td>Explanation\\nWhy the edits made under my usern...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>000103f0d9cfb60f</td>\n      <td>D'aww! He matches this background colour I'm s...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>000113f07ec002fd</td>\n      <td>Hey man, I'm really not trying to edit war. 
It...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0001b41b1c6bb37e</td>\n      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0001d958c54c6e35</td>\n      <td>You, sir, are my hero. Any chance you remember...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.751677Z","iopub.execute_input":"2021-06-17T06:20:52.752059Z","iopub.status.idle":"2021-06-17T06:20:52.808832Z","shell.execute_reply.started":"2021-06-17T06:20:52.752029Z","shell.execute_reply":"2021-06-17T06:20:52.808155Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 159571 entries, 0 to 159570\nData columns (total 8 columns):\n #   Column         Non-Null Count   Dtype \n---  ------         --------------   ----- \n 0   id             159571 non-null  object\n 1   comment_text   159571 non-null  object\n 2   toxic          159571 non-null  int64 \n 3   severe_toxic   159571 non-null  int64 \n 4   obscene        159571 non-null  int64 \n 5   threat         159571 non-null  int64 \n 6   insult         159571 non-null  int64 \n 7   identity_hate  159571 non-null  int64 \ndtypes: int64(6), object(2)\nmemory usage: 9.7+ MB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We don't need the `id` column, so we'll drop that. Let's also sample some examples of each type of class. 
*Warning: It can get pretty bad.*","metadata":{}},{"cell_type":"code","source":"train = train.drop(columns=['id'])","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.809971Z","iopub.execute_input":"2021-06-17T06:20:52.810221Z","iopub.status.idle":"2021-06-17T06:20:52.824192Z","shell.execute_reply.started":"2021-06-17T06:20:52.810196Z","shell.execute_reply":"2021-06-17T06:20:52.823263Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"for col in train.columns[1:]:\n    print(f'CLASS: {col}')\n    print(train.loc[train[col] == 1].sample().comment_text.values[0])\n    print()\n    print('----')\n    print()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.825254Z","iopub.execute_input":"2021-06-17T06:20:52.825534Z","iopub.status.idle":"2021-06-17T06:20:52.867466Z","shell.execute_reply.started":"2021-06-17T06:20:52.825507Z","shell.execute_reply":"2021-06-17T06:20:52.866555Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"CLASS: toxic\nhurt? i'm alright and don't give a damn fool.\n\n----\n\nCLASS: severe_toxic\ndie \n\nplease why dont u do us all a favioe and kill yourslef\n\n----\n\nCLASS: obscene\nYou guys have a lot of balls whining about WP transgressions after the displays of lack of respect for these policies yourselves! Man WP displays it's ignorance and bias for a few childish editors in these examples. Compete hoax. The idiots pushing allthe rules cry realquick when the same techniques are spouted towards them.\n\n----\n\nCLASS: threat\nYOURE GONNA BE A DEAD FAGGOT 66.158.193.58\n\n----\n\nCLASS: insult\nfuck you\n\nfuck you majorly. suck my dick asshole.\n\n----\n\nCLASS: identity_hate\nWhy am I being blocked by an obvious anti-semite? Gareth Hughes, the so called anglican priest is a racist and an anti-semite.  Look at everyone of his posts regarding Assyrians?  He hates Assyrians, he is a jealous little twit with too much time on his hands and no real education... 
Anti-Semite!!!\n\n----\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":"Ah, the internet... Let's plot the distribution of labels:","metadata":{}},{"cell_type":"code","source":"label_counts = train.iloc[:, 1:].sum().reset_index().rename(columns={'index': 'Label', 0: 'Count'})\nalt.Chart(label_counts).mark_bar().encode(\n    x=alt.X('Label', axis=alt.Axis(labelAngle=-45)),\n    y='Count'\n).properties(title='Train Set Label Distribution')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.868661Z","iopub.execute_input":"2021-06-17T06:20:52.868912Z","iopub.status.idle":"2021-06-17T06:20:52.906740Z","shell.execute_reply.started":"2021-06-17T06:20:52.868886Z","shell.execute_reply":"2021-06-17T06:20:52.905756Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/html":"\n<div id=\"altair-viz-f9d85869fa82461a9ad261a73ee198b0\"></div>\n<script type=\"text/javascript\">\n  (function(spec, embedOpt){\n    let outputDiv = document.currentScript.previousElementSibling;\n    if (outputDiv.id !== \"altair-viz-f9d85869fa82461a9ad261a73ee198b0\") {\n      outputDiv = document.getElementById(\"altair-viz-f9d85869fa82461a9ad261a73ee198b0\");\n    }\n    const paths = {\n      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n      \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n      \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext\",\n      \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n    };\n\n    function loadScript(lib) {\n      return new Promise(function(resolve, reject) {\n        var s = document.createElement('script');\n        s.src = paths[lib];\n        s.async = true;\n        s.onload = () => resolve(paths[lib]);\n        s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n        document.getElementsByTagName(\"head\")[0].appendChild(s);\n      });\n    }\n\n    function showError(err) {\n 
     outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n      throw err;\n    }\n\n    function displayChart(vegaEmbed) {\n      vegaEmbed(outputDiv, spec, embedOpt)\n        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n    }\n\n    if(typeof define === \"function\" && define.amd) {\n      requirejs.config({paths});\n      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n    } else if (typeof vegaEmbed === \"function\") {\n      displayChart(vegaEmbed);\n    } else {\n      loadScript(\"vega\")\n        .then(() => loadScript(\"vega-lite\"))\n        .then(() => loadScript(\"vega-embed\"))\n        .catch(showError)\n        .then(() => displayChart(vegaEmbed));\n    }\n  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-c17690826a29f3a34e7037bd2199575d\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"type\": \"nominal\", \"axis\": {\"labelAngle\": -45}, \"field\": \"Label\"}, \"y\": {\"type\": \"quantitative\", \"field\": \"Count\"}}, \"title\": \"Train Set Label Distribution\", \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.8.1.json\", \"datasets\": {\"data-c17690826a29f3a34e7037bd2199575d\": [{\"Label\": \"toxic\", \"Count\": 15294}, {\"Label\": \"severe_toxic\", \"Count\": 1595}, {\"Label\": \"obscene\", \"Count\": 8449}, {\"Label\": \"threat\", \"Count\": 478}, {\"Label\": \"insult\", \"Count\": 7877}, {\"Label\": \"identity_hate\", \"Count\": 1405}]}}, {\"mode\": \"vega-lite\"});\n</script>","text/plain":"alt.Chart(...)"},"metadata":{}}]},{"cell_type":"code","source":"test = 
pd.read_csv('test.csv.zip')\ntest.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.909679Z","iopub.execute_input":"2021-06-17T06:20:52.910091Z","iopub.status.idle":"2021-06-17T06:20:54.614564Z","shell.execute_reply.started":"2021-06-17T06:20:52.910049Z","shell.execute_reply":"2021-06-17T06:20:54.613590Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"                 id                                       comment_text\n0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...\n1  0000247867823ef7  == From RfC == \\n\\n The title is fine as it is...\n2  00013b17ad220c46  \" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...\n3  00017563c3f7919a  :If you have a look back at the source, the in...\n4  00017695ad8997eb          I don't anonymously edit articles at all.","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>comment_text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>00001cee341fdb12</td>\n      <td>Yo bitch Ja Rule is more succesful then you'll...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0000247867823ef7</td>\n      <td>== From RfC == \\n\\n The title is fine as it is...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>00013b17ad220c46</td>\n      <td>\" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>00017563c3f7919a</td>\n      <td>:If you have a look back at the source, the in...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>00017695ad8997eb</td>\n      <td>I don't anonymously edit 
articles at all.</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test_labels = pd.read_csv('test_labels.csv.zip')\ntest_labels.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.616392Z","iopub.execute_input":"2021-06-17T06:20:54.616785Z","iopub.status.idle":"2021-06-17T06:20:54.841431Z","shell.execute_reply.started":"2021-06-17T06:20:54.616749Z","shell.execute_reply":"2021-06-17T06:20:54.840803Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"                 id  toxic  severe_toxic  obscene  threat  insult  \\\n0  00001cee341fdb12     -1            -1       -1      -1      -1   \n1  0000247867823ef7     -1            -1       -1      -1      -1   \n2  00013b17ad220c46     -1            -1       -1      -1      -1   \n3  00017563c3f7919a     -1            -1       -1      -1      -1   \n4  00017695ad8997eb     -1            -1       -1      -1      -1   \n\n   identity_hate  \n0             -1  \n1             -1  \n2             -1  \n3             -1  \n4             -1  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>00001cee341fdb12</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0000247867823ef7</td>\n      <td>-1</td>\n      <td>-1</td>\n    
  <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>00013b17ad220c46</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>00017563c3f7919a</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>00017695ad8997eb</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n      <td>-1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"That's weird. Why are the test labels all `-1`?","metadata":{}},{"cell_type":"code","source":"test_labels.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.842449Z","iopub.execute_input":"2021-06-17T06:20:54.842749Z","iopub.status.idle":"2021-06-17T06:20:54.905492Z","shell.execute_reply.started":"2021-06-17T06:20:54.842721Z","shell.execute_reply":"2021-06-17T06:20:54.904579Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"               toxic   severe_toxic        obscene         threat  \\\ncount  153164.000000  153164.000000  153164.000000  153164.000000   \nmean       -0.542530      -0.579895      -0.558193      -0.580913   \nstd         0.572465       0.498408       0.542966       0.496195   \nmin        -1.000000      -1.000000      -1.000000      -1.000000   \n25%        -1.000000      -1.000000      -1.000000      -1.000000   \n50%        -1.000000      -1.000000      -1.000000      -1.000000   \n75%         0.000000       0.000000       0.000000       0.000000   \nmax         1.000000       1.000000       1.000000       1.000000   \n\n              insult  identity_hate  \ncount  153164.000000  153164.000000  \nmean       -0.559916      
-0.577642  \nstd         0.539594       0.503260  \nmin        -1.000000      -1.000000  \n25%        -1.000000      -1.000000  \n50%        -1.000000      -1.000000  \n75%         0.000000       0.000000  \nmax         1.000000       1.000000  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>-0.542530</td>\n      <td>-0.579895</td>\n      <td>-0.558193</td>\n      <td>-0.580913</td>\n      <td>-0.559916</td>\n      <td>-0.577642</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.572465</td>\n      <td>0.498408</td>\n      <td>0.542966</td>\n      <td>0.496195</td>\n      <td>0.539594</td>\n      <td>0.503260</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n      <td>-1.000000</td>\n    
</tr>\n    <tr>\n      <th>75%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.906820Z","iopub.execute_input":"2021-06-17T06:20:54.907165Z","iopub.status.idle":"2021-06-17T06:20:54.961386Z","shell.execute_reply.started":"2021-06-17T06:20:54.907126Z","shell.execute_reply":"2021-06-17T06:20:54.960265Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"               toxic   severe_toxic        obscene         threat  \\\ncount  159571.000000  159571.000000  159571.000000  159571.000000   \nmean        0.095844       0.009996       0.052948       0.002996   \nstd         0.294379       0.099477       0.223931       0.054650   \nmin         0.000000       0.000000       0.000000       0.000000   \n25%         0.000000       0.000000       0.000000       0.000000   \n50%         0.000000       0.000000       0.000000       0.000000   \n75%         0.000000       0.000000       0.000000       0.000000   \nmax         1.000000       1.000000       1.000000       1.000000   \n\n              insult  identity_hate  \ncount  159571.000000  159571.000000  \nmean        0.049364       0.008805  \nstd         0.216627       0.093420  \nmin         0.000000       0.000000  \n25%         0.000000       0.000000  \n50%         0.000000       0.000000  \n75%         0.000000       0.000000  \nmax         1.000000       1.000000  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    
}\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>159571.000000</td>\n      <td>159571.000000</td>\n      <td>159571.000000</td>\n      <td>159571.000000</td>\n      <td>159571.000000</td>\n      <td>159571.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>0.095844</td>\n      <td>0.009996</td>\n      <td>0.052948</td>\n      <td>0.002996</td>\n      <td>0.049364</td>\n      <td>0.008805</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.294379</td>\n      <td>0.099477</td>\n      <td>0.223931</td>\n      <td>0.054650</td>\n      <td>0.216627</td>\n      <td>0.093420</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n    
</tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"This is another good reason to `.describe()` any df because weird values will likely not show up in `.head()`. Let's fix this now:","metadata":{}},{"cell_type":"code","source":"test_labels = test_labels.drop(columns=['id'])\nfor col in test_labels.columns:\n    test_labels[col].loc[test_labels[col] == -1] = 0\ntest_labels.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.963773Z","iopub.execute_input":"2021-06-17T06:20:54.964199Z","iopub.status.idle":"2021-06-17T06:20:55.033905Z","shell.execute_reply.started":"2021-06-17T06:20:54.964159Z","shell.execute_reply":"2021-06-17T06:20:55.029310Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"               toxic   severe_toxic        obscene         threat  \\\ncount  153164.000000  153164.000000  153164.000000  153164.000000   \nmean        0.039761       0.002396       0.024098       0.001378   \nstd         0.195399       0.048892       0.153355       0.037091   \nmin         0.000000       0.000000       0.000000       0.000000   \n25%         0.000000       0.000000       0.000000       0.000000   \n50%         0.000000       0.000000       0.000000       0.000000   \n75%         0.000000       0.000000       0.000000       0.000000   \nmax         1.000000       1.000000       1.000000       1.000000   \n\n              insult  identity_hate  \ncount  153164.000000  153164.000000  \nmean        0.022375       0.004649  \nstd         0.147899       0.068022  \nmin         0.000000       0.000000  \n25%         0.000000       0.000000  \n50%         0.000000       0.000000  \n75%         0.000000       0.000000  \nmax         1.000000       1.000000  ","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: 
top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>toxic</th>\n      <th>severe_toxic</th>\n      <th>obscene</th>\n      <th>threat</th>\n      <th>insult</th>\n      <th>identity_hate</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n      <td>153164.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>0.039761</td>\n      <td>0.002396</td>\n      <td>0.024098</td>\n      <td>0.001378</td>\n      <td>0.022375</td>\n      <td>0.004649</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>0.195399</td>\n      <td>0.048892</td>\n      <td>0.153355</td>\n      <td>0.037091</td>\n      <td>0.147899</td>\n      <td>0.068022</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n    </tr>\n  
</tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.035358Z","iopub.execute_input":"2021-06-17T06:20:55.035773Z","iopub.status.idle":"2021-06-17T06:20:55.100754Z","shell.execute_reply.started":"2021-06-17T06:20:55.035730Z","shell.execute_reply":"2021-06-17T06:20:55.099622Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 153164 entries, 0 to 153163\nData columns (total 2 columns):\n #   Column        Non-Null Count   Dtype \n---  ------        --------------   ----- \n 0   id            153164 non-null  object\n 1   comment_text  153164 non-null  object\ndtypes: object(2)\nmemory usage: 2.3+ MB\n","output_type":"stream"}]},{"cell_type":"code","source":"test_labels.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.102308Z","iopub.execute_input":"2021-06-17T06:20:55.102719Z","iopub.status.idle":"2021-06-17T06:20:55.122570Z","shell.execute_reply.started":"2021-06-17T06:20:55.102678Z","shell.execute_reply":"2021-06-17T06:20:55.121521Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 153164 entries, 0 to 153163\nData columns (total 6 columns):\n #   Column         Non-Null Count   Dtype\n---  ------         --------------   -----\n 0   toxic          153164 non-null  int64\n 1   severe_toxic   153164 non-null  int64\n 2   obscene        153164 non-null  int64\n 3   threat         153164 non-null  int64\n 4   insult         153164 non-null  int64\n 5   identity_hate  153164 non-null  int64\ndtypes: int64(6)\nmemory usage: 7.0 MB\n","output_type":"stream"}]},{"cell_type":"code","source":"label_counts = test_labels.sum().reset_index().rename(columns={'index': 'Label', 0: 'Count'})\nalt.Chart(label_counts).mark_bar().encode(\n    x=alt.X('Label', axis=alt.Axis(labelAngle=-45)),\n    
y='Count'\n).properties(title='Dev Set Label Distribution')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.124194Z","iopub.execute_input":"2021-06-17T06:20:55.124622Z","iopub.status.idle":"2021-06-17T06:20:55.155742Z","shell.execute_reply.started":"2021-06-17T06:20:55.124572Z","shell.execute_reply":"2021-06-17T06:20:55.154728Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/html":"\n<div id=\"altair-viz-0695990f152a428b88b61dcd24bf9877\"></div>\n<script type=\"text/javascript\">\n  (function(spec, embedOpt){\n    let outputDiv = document.currentScript.previousElementSibling;\n    if (outputDiv.id !== \"altair-viz-0695990f152a428b88b61dcd24bf9877\") {\n      outputDiv = document.getElementById(\"altair-viz-0695990f152a428b88b61dcd24bf9877\");\n    }\n    const paths = {\n      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n      \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n      \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext\",\n      \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n    };\n\n    function loadScript(lib) {\n      return new Promise(function(resolve, reject) {\n        var s = document.createElement('script');\n        s.src = paths[lib];\n        s.async = true;\n        s.onload = () => resolve(paths[lib]);\n        s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n        document.getElementsByTagName(\"head\")[0].appendChild(s);\n      });\n    }\n\n    function showError(err) {\n      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n      throw err;\n    }\n\n    function displayChart(vegaEmbed) {\n      vegaEmbed(outputDiv, spec, embedOpt)\n        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. 
See the javascript console for the full traceback.`));\n    }\n\n    if(typeof define === \"function\" && define.amd) {\n      requirejs.config({paths});\n      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n    } else if (typeof vegaEmbed === \"function\") {\n      displayChart(vegaEmbed);\n    } else {\n      loadScript(\"vega\")\n        .then(() => loadScript(\"vega-lite\"))\n        .then(() => loadScript(\"vega-embed\"))\n        .catch(showError)\n        .then(() => displayChart(vegaEmbed));\n    }\n  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-99a28f472e6fe776ccd3eb7e0bc0aa85\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"type\": \"nominal\", \"axis\": {\"labelAngle\": -45}, \"field\": \"Label\"}, \"y\": {\"type\": \"quantitative\", \"field\": \"Count\"}}, \"title\": \"Dev Set Label Distribution\", \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.8.1.json\", \"datasets\": {\"data-99a28f472e6fe776ccd3eb7e0bc0aa85\": [{\"Label\": \"toxic\", \"Count\": 6090}, {\"Label\": \"severe_toxic\", \"Count\": 367}, {\"Label\": \"obscene\", \"Count\": 3691}, {\"Label\": \"threat\", \"Count\": 211}, {\"Label\": \"insult\", \"Count\": 3427}, {\"Label\": \"identity_hate\", \"Count\": 712}]}}, {\"mode\": \"vega-lite\"});\n</script>","text/plain":"alt.Chart(...)"},"metadata":{}}]},{"cell_type":"markdown","source":"The label distributions of the train and dev sets are about the same.","metadata":{}},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf = TfidfVectorizer()\nxt = tfidf.fit_transform(train.comment_text)\nxd = 
tfidf.transform(test.comment_text)","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.157054Z","iopub.execute_input":"2021-06-17T06:20:55.157324Z","iopub.status.idle":"2021-06-17T06:21:20.565989Z","shell.execute_reply.started":"2021-06-17T06:20:55.157298Z","shell.execute_reply":"2021-06-17T06:21:20.564925Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"markdown","source":"This is a multi-label classification problem, where each example can have any number of the given labels. So let's set targets for every single one and train a classifier for each target separately. ","metadata":{}},{"cell_type":"code","source":"yt1, yd1 = train.toxic,         test_labels.toxic\nyt2, yd2 = train.severe_toxic,  test_labels.severe_toxic\nyt3, yd3 = train.obscene,       test_labels.obscene\nyt4, yd4 = train.threat,        test_labels.threat\nyt5, yd5 = train.insult,        test_labels.insult\nyt6, yd6 = train.identity_hate, test_labels.identity_hate","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:21:20.567269Z","iopub.execute_input":"2021-06-17T06:21:20.567575Z","iopub.status.idle":"2021-06-17T06:21:20.572533Z","shell.execute_reply.started":"2021-06-17T06:21:20.567545Z","shell.execute_reply":"2021-06-17T06:21:20.571855Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"from sklearn.linear_model import LogisticRegression\n\nlr1 = LogisticRegression(max_iter=200)  # doesn't converge at 100 iters\nlr2 = LogisticRegression()\nlr3 = LogisticRegression()\nlr4 = LogisticRegression()\nlr5 = LogisticRegression()\nlr6 = LogisticRegression()\n\nlr1.fit(xt, yt1)\nlr2.fit(xt, yt2)\nlr3.fit(xt, yt3)\nlr4.fit(xt, yt4)\nlr5.fit(xt, yt5)\nlr6.fit(xt, 
yt6)","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:21:20.573592Z","iopub.execute_input":"2021-06-17T06:21:20.573956Z","iopub.status.idle":"2021-06-17T06:22:28.384752Z","shell.execute_reply.started":"2021-06-17T06:21:20.573919Z","shell.execute_reply":"2021-06-17T06:22:28.383712Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"LogisticRegression()"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score\n\nprint(f'Acc on toxic:         {accuracy_score(lr1.predict(xd), yd1)}')\nprint(f'Acc on severe_toxic:  {accuracy_score(lr2.predict(xd), yd2)}')\nprint(f'Acc on obscene:       {accuracy_score(lr3.predict(xd), yd3)}')\nprint(f'Acc on threat:        {accuracy_score(lr4.predict(xd), yd4)}')\nprint(f'Acc on insult:        {accuracy_score(lr5.predict(xd), yd5)}')\nprint(f'Acc on identity_hate: {accuracy_score(lr6.predict(xd), yd6)}')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:22:28.386226Z","iopub.execute_input":"2021-06-17T06:22:28.386648Z","iopub.status.idle":"2021-06-17T06:22:28.626211Z","shell.execute_reply.started":"2021-06-17T06:22:28.386591Z","shell.execute_reply":"2021-06-17T06:22:28.625118Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Acc on toxic:         0.8568593141991591\nAcc on severe_toxic:  0.9915254237288136\nAcc on obscene:       0.9175328406152882\nAcc on threat:        0.9979172651536915\nAcc on insult:        0.9340184377529968\nAcc on identity_hate: 0.9916951764122117\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We can guess that the `toxic` class is more difficult merely because the variance of terms is likely high. Note, however, that accuracy is inflated by class imbalance: the scores above largely track how rare each label is in the dev set, so part of the gap simply reflects `toxic` being the most common label. Not bad for TF-IDF.","metadata":{}}]}