Created
March 29, 2021 20:03
-
-
Save kmcelwee/ac4545c1bc70698f2d4d604e15c910f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"pd.set_option('display.max_colwidth', None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_k = pd.read_csv('cdhweb-kludgy.csv')\n", | |
"df_s = pd.read_csv('cdhweb-scrapy.csv')\n", | |
"\n", | |
"# kludgy seems to replace %20 with ' ', whereas scrapy keeps the %20\n", | |
"df_k['url'] = df_k['url'].str.replace(' ', '%20')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Status code differences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"200 6.0\n", | |
"301 13.0\n", | |
"404 6.0\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 95, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_k['status code'].astype(str).value_counts().subtract(df_s['status_code'].astype(str).value_counts(), fill_value=0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This should be read as: \"Kludgy found 6 more 200, 13 more 301s, and 6 more 404s than scrapy\". Scrapy does not seem to collect redirects. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### URL differences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number exclusively in kludgy: 14\n", | |
"Number exclusively in scrapy: 2\n", | |
"Number shared: 1275\n" | |
] | |
} | |
], | |
"source": [ | |
"# kludgy seems to keep redirects, whereas scrapy does not\n", | |
"df_k = df_k[df_k['status code'] != 301]\n", | |
"\n", | |
"url_k = set(df_k['url'].tolist())\n", | |
"url_s = set(df_s['url'].tolist())\n", | |
"\n", | |
"urls_k_only = [u for u in url_k if u not in url_s]\n", | |
"urls_s_only = [u for u in url_s if u not in url_k]\n", | |
"# shared = found by BOTH crawlers, i.e. the set intersection (`|` would be the union)\n", | |
"shared_urls = url_k & url_s\n", | |
"\n", | |
"print(f'Number exclusively in kludgy: {len(urls_k_only)}')\n", | |
"print(f'Number exclusively in scrapy: {len(urls_s_only)}')\n", | |
"print(f'Number shared: {len(shared_urls)}')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Here are the 14 URLs in kludgy that weren't found in scrapy (after removing 301s)**\n", | |
"* `https://cdh.princeton.edu` is here because it doesn't have the trailing slash\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>url</th>\n", | |
" <th>status code</th>\n", | |
" <th>content type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>https://cdh.princeton.edu/static/img/PUsig1-bw-bs.svg</td>\n", | |
" <td>200</td>\n", | |
" <td>image/svg+xml</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50</th>\n", | |
" <td>https://cdh.princeton.edu/events/douglass-day/</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>59</th>\n", | |
" <td>https://cdh.princeton.edu/annual-report/2017/</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>78</th>\n", | |
" <td>https://cdh.princeton.edu</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>211</th>\n", | |
" <td>https://cdh.princeton.edu/media/uploads/blog/cdh_web_20_screenshot.png</td>\n", | |
" <td>200</td>\n", | |
" <td>image/png</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>520</th>\n", | |
" <td>https://cdh.princeton.edu/grants/public-humanities-seed-grants/</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>807</th>\n", | |
" <td>https://cdh.princeton.edu/media/uploads/prmapathon_poster.jpg</td>\n", | |
" <td>200</td>\n", | |
" <td>image/jpeg</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>945</th>\n", | |
" <td>https://cdh.princeton.edu/updates/2020/02/28/2018-dissertation-prize-honoree-reflects-research-community/</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>https://cdh.princeton.edu/updates/rss/pul-logo-new.svg/</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>125</th>\n", | |
" <td>https://cdh.princeton.edu/media/uploads/documents/grad_fellows_call_for_sp_2020.pdf</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=iso-8859-1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>166</th>\n", | |
" <td>https://cdh.princeton.edu/events/reading-group/</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>964</th>\n", | |
" <td>https://cdh.princeton.edu/engage/jobs/</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1054</th>\n", | |
" <td>https://cdh.princeton.edu/grants/graduate-/</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1056</th>\n", | |
" <td>https://cdh.princeton.edu/projects/derridas-margins/%20/</td>\n", | |
" <td>404</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" url \\\n", | |
"4 https://cdh.princeton.edu/static/img/PUsig1-bw-bs.svg \n", | |
"50 https://cdh.princeton.edu/events/douglass-day/ \n", | |
"59 https://cdh.princeton.edu/annual-report/2017/ \n", | |
"78 https://cdh.princeton.edu \n", | |
"211 https://cdh.princeton.edu/media/uploads/blog/cdh_web_20_screenshot.png \n", | |
"520 https://cdh.princeton.edu/grants/public-humanities-seed-grants/ \n", | |
"807 https://cdh.princeton.edu/media/uploads/prmapathon_poster.jpg \n", | |
"945 https://cdh.princeton.edu/updates/2020/02/28/2018-dissertation-prize-honoree-reflects-research-community/ \n", | |
"14 https://cdh.princeton.edu/updates/rss/pul-logo-new.svg/ \n", | |
"125 https://cdh.princeton.edu/media/uploads/documents/grad_fellows_call_for_sp_2020.pdf \n", | |
"166 https://cdh.princeton.edu/events/reading-group/ \n", | |
"964 https://cdh.princeton.edu/engage/jobs/ \n", | |
"1054 https://cdh.princeton.edu/grants/graduate-/ \n", | |
"1056 https://cdh.princeton.edu/projects/derridas-margins/%20/ \n", | |
"\n", | |
" status code content type \n", | |
"4 200 image/svg+xml \n", | |
"50 200 text/html; charset=utf-8 \n", | |
"59 200 text/html; charset=utf-8 \n", | |
"78 200 text/html; charset=utf-8 \n", | |
"211 200 image/png \n", | |
"520 200 text/html; charset=utf-8 \n", | |
"807 200 image/jpeg \n", | |
"945 200 text/html; charset=utf-8 \n", | |
"14 404 text/html; charset=utf-8 \n", | |
"125 404 text/html; charset=iso-8859-1 \n", | |
"166 404 text/html; charset=utf-8 \n", | |
"964 404 text/html; charset=utf-8 \n", | |
"1054 404 text/html; charset=utf-8 \n", | |
"1056 404 text/html; charset=utf-8 " | |
] | |
}, | |
"execution_count": 97, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_k[df_k['url'].isin(urls_k_only)][['url', 'status code', 'content type']].sort_values('status code')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Here are the 2 URLs in scrapy that weren't found in kludgy**\n", | |
"* `https://cdh.princeton.edu/events/douglass-day/` above, even though it's not shown as a redirect, redirects to `https://cdh.princeton.edu/events/douglass-day-2021/`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>url</th>\n", | |
" <th>date</th>\n", | |
" <th>status_code</th>\n", | |
" <th>content_type</th>\n", | |
" <th>last_modified</th>\n", | |
" <th>content_length</th>\n", | |
" <th>size</th>\n", | |
" <th>referrer</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>site_version</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>https://cdh.princeton.edu/events/douglass-day-2021/</td>\n", | |
" <td>2021-03-29 18:31:47.312460</td>\n", | |
" <td>200</td>\n", | |
" <td>text/html; charset=utf-8</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>33719</td>\n", | |
" <td>https://cdh.princeton.edu/</td>\n", | |
" <td>2021-03-29T18:31:47.312484</td>\n", | |
" <td>v. 2.8.1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>747</th>\n", | |
" <td>https://cdh.princeton.edu/static/img/rubberchicken.png</td>\n", | |
" <td>2021-03-29 18:34:18.800610</td>\n", | |
" <td>200</td>\n", | |
" <td>image/png</td>\n", | |
" <td>Mon, 15 Mar 2021 19:01:46 GMT</td>\n", | |
" <td>NaN</td>\n", | |
" <td>264023</td>\n", | |
" <td>https://cdh.princeton.edu/events/2019/11/seed-grant-deadline/</td>\n", | |
" <td>2021-03-29T18:34:18.800623</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" url \\\n", | |
"8 https://cdh.princeton.edu/events/douglass-day-2021/ \n", | |
"747 https://cdh.princeton.edu/static/img/rubberchicken.png \n", | |
"\n", | |
" date status_code content_type \\\n", | |
"8 2021-03-29 18:31:47.312460 200 text/html; charset=utf-8 \n", | |
"747 2021-03-29 18:34:18.800610 200 image/png \n", | |
"\n", | |
" last_modified content_length size \\\n", | |
"8 NaN NaN 33719 \n", | |
"747 Mon, 15 Mar 2021 19:01:46 GMT NaN 264023 \n", | |
"\n", | |
" referrer \\\n", | |
"8 https://cdh.princeton.edu/ \n", | |
"747 https://cdh.princeton.edu/events/2019/11/seed-grant-deadline/ \n", | |
"\n", | |
" timestamp site_version \n", | |
"8 2021-03-29T18:31:47.312484 v. 2.8.1 \n", | |
"747 2021-03-29T18:34:18.800623 NaN " | |
] | |
}, | |
"execution_count": 98, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_s[df_s['url'].isin(urls_s_only)]" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment