Skip to content

Instantly share code, notes, and snippets.

@kmcelwee
Created March 29, 2021 20:03
Show Gist options
  • Save kmcelwee/ac4545c1bc70698f2d4d604e15c910f8 to your computer and use it in GitHub Desktop.
Save kmcelwee/ac4545c1bc70698f2d4d604e15c910f8 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"pd.set_option('display.max_colwidth', None)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"df_k = pd.read_csv('cdhweb-kludgy.csv')\n",
"df_s = pd.read_csv('cdhweb-scrapy.csv')\n",
"\n",
"# kludgy seems to replace %20 with ' ', whereas scrapy keeps the %20\n",
"df_k['url'] = df_k['url'].str.replace(' ', '%20')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Status code differences"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200 6.0\n",
"301 13.0\n",
"404 6.0\n",
"dtype: float64"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_k['status code'].astype(str).value_counts().subtract(df_s['status_code'].astype(str).value_counts(), fill_value=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This should be read as: \"Kludgy found 6 more 200, 13 more 301s, and 6 more 404s than scrapy\". Scrapy does not seem to collect redirects. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### URL differences"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number exclusively in kludgy: 14\n",
"Number exclusively in scrapy: 2\n",
"Number shared: 1291\n"
]
}
],
"source": [
"# kludgy seems to keep redirects, whereas spacy does not\n",
"df_k = df_k[df_k['status code'] != 301]\n",
"\n",
"url_k = set(df_k['url'].tolist())\n",
"url_s = set(df_s['url'].tolist())\n",
"\n",
"urls_k_only = [u for u in url_k if u not in url_s]\n",
"urls_s_only = [u for u in url_s if u not in url_k]\n",
"shared_urls = url_k | url_s\n",
"\n",
"print(f'Number exclusively in kludgy: {len(urls_k_only)}')\n",
"print(f'Number exclusively in scrapy: {len(urls_s_only)}')\n",
"print(f'Number shared: {len(shared_urls)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Here are the 14 URLs in kludgy that weren't found in scrapy (after removing 301s)**\n",
"* `https://cdh.princeton.edu` is here because it doesn't have the trailing slash\n"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>status code</th>\n",
" <th>content type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://cdh.princeton.edu/static/img/PUsig1-bw-bs.svg</td>\n",
" <td>200</td>\n",
" <td>image/svg+xml</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>https://cdh.princeton.edu/events/douglass-day/</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>https://cdh.princeton.edu/annual-report/2017/</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>https://cdh.princeton.edu</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>211</th>\n",
" <td>https://cdh.princeton.edu/media/uploads/blog/cdh_web_20_screenshot.png</td>\n",
" <td>200</td>\n",
" <td>image/png</td>\n",
" </tr>\n",
" <tr>\n",
" <th>520</th>\n",
" <td>https://cdh.princeton.edu/grants/public-humanities-seed-grants/</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>807</th>\n",
" <td>https://cdh.princeton.edu/media/uploads/prmapathon_poster.jpg</td>\n",
" <td>200</td>\n",
" <td>image/jpeg</td>\n",
" </tr>\n",
" <tr>\n",
" <th>945</th>\n",
" <td>https://cdh.princeton.edu/updates/2020/02/28/2018-dissertation-prize-honoree-reflects-research-community/</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>https://cdh.princeton.edu/updates/rss/pul-logo-new.svg/</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>https://cdh.princeton.edu/media/uploads/documents/grad_fellows_call_for_sp_2020.pdf</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=iso-8859-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>https://cdh.princeton.edu/events/reading-group/</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>964</th>\n",
" <td>https://cdh.princeton.edu/engage/jobs/</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1054</th>\n",
" <td>https://cdh.princeton.edu/grants/graduate-/</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1056</th>\n",
" <td>https://cdh.princeton.edu/projects/derridas-margins/%20/</td>\n",
" <td>404</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"4 https://cdh.princeton.edu/static/img/PUsig1-bw-bs.svg \n",
"50 https://cdh.princeton.edu/events/douglass-day/ \n",
"59 https://cdh.princeton.edu/annual-report/2017/ \n",
"78 https://cdh.princeton.edu \n",
"211 https://cdh.princeton.edu/media/uploads/blog/cdh_web_20_screenshot.png \n",
"520 https://cdh.princeton.edu/grants/public-humanities-seed-grants/ \n",
"807 https://cdh.princeton.edu/media/uploads/prmapathon_poster.jpg \n",
"945 https://cdh.princeton.edu/updates/2020/02/28/2018-dissertation-prize-honoree-reflects-research-community/ \n",
"14 https://cdh.princeton.edu/updates/rss/pul-logo-new.svg/ \n",
"125 https://cdh.princeton.edu/media/uploads/documents/grad_fellows_call_for_sp_2020.pdf \n",
"166 https://cdh.princeton.edu/events/reading-group/ \n",
"964 https://cdh.princeton.edu/engage/jobs/ \n",
"1054 https://cdh.princeton.edu/grants/graduate-/ \n",
"1056 https://cdh.princeton.edu/projects/derridas-margins/%20/ \n",
"\n",
" status code content type \n",
"4 200 image/svg+xml \n",
"50 200 text/html; charset=utf-8 \n",
"59 200 text/html; charset=utf-8 \n",
"78 200 text/html; charset=utf-8 \n",
"211 200 image/png \n",
"520 200 text/html; charset=utf-8 \n",
"807 200 image/jpeg \n",
"945 200 text/html; charset=utf-8 \n",
"14 404 text/html; charset=utf-8 \n",
"125 404 text/html; charset=iso-8859-1 \n",
"166 404 text/html; charset=utf-8 \n",
"964 404 text/html; charset=utf-8 \n",
"1054 404 text/html; charset=utf-8 \n",
"1056 404 text/html; charset=utf-8 "
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_k[df_k['url'].isin(urls_k_only)][['url', 'status code', 'content type']].sort_values('status code')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Here are the 2 URLs in scrapy that weren't found in kludgy**\n",
"* `https://cdh.princeton.edu/events/douglass-day/` above, even though it's not shown as a redirect, redirects to `https://cdh.princeton.edu/events/douglass-day-2021/`"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>date</th>\n",
" <th>status_code</th>\n",
" <th>content_type</th>\n",
" <th>last_modified</th>\n",
" <th>content_length</th>\n",
" <th>size</th>\n",
" <th>referrer</th>\n",
" <th>timestamp</th>\n",
" <th>site_version</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>https://cdh.princeton.edu/events/douglass-day-2021/</td>\n",
" <td>2021-03-29 18:31:47.312460</td>\n",
" <td>200</td>\n",
" <td>text/html; charset=utf-8</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>33719</td>\n",
" <td>https://cdh.princeton.edu/</td>\n",
" <td>2021-03-29T18:31:47.312484</td>\n",
" <td>v. 2.8.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>747</th>\n",
" <td>https://cdh.princeton.edu/static/img/rubberchicken.png</td>\n",
" <td>2021-03-29 18:34:18.800610</td>\n",
" <td>200</td>\n",
" <td>image/png</td>\n",
" <td>Mon, 15 Mar 2021 19:01:46 GMT</td>\n",
" <td>NaN</td>\n",
" <td>264023</td>\n",
" <td>https://cdh.princeton.edu/events/2019/11/seed-grant-deadline/</td>\n",
" <td>2021-03-29T18:34:18.800623</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"8 https://cdh.princeton.edu/events/douglass-day-2021/ \n",
"747 https://cdh.princeton.edu/static/img/rubberchicken.png \n",
"\n",
" date status_code content_type \\\n",
"8 2021-03-29 18:31:47.312460 200 text/html; charset=utf-8 \n",
"747 2021-03-29 18:34:18.800610 200 image/png \n",
"\n",
" last_modified content_length size \\\n",
"8 NaN NaN 33719 \n",
"747 Mon, 15 Mar 2021 19:01:46 GMT NaN 264023 \n",
"\n",
" referrer \\\n",
"8 https://cdh.princeton.edu/ \n",
"747 https://cdh.princeton.edu/events/2019/11/seed-grant-deadline/ \n",
"\n",
" timestamp site_version \n",
"8 2021-03-29T18:31:47.312484 v. 2.8.1 \n",
"747 2021-03-29T18:34:18.800623 NaN "
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_s[df_s['url'].isin(urls_s_only)]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment