Skip to content

Instantly share code, notes, and snippets.

@afvanwoudenberg
Created September 2, 2023 15:46
Show Gist options
  • Save afvanwoudenberg/6674f492abe0441b3c25eb78950f46d1 to your computer and use it in GitHub Desktop.
Save afvanwoudenberg/6674f492abe0441b3c25eb78950f46d1 to your computer and use it in GitHub Desktop.
Analyze Chrome browser history using Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "b9e03e7f",
"metadata": {},
"source": [
"# Chrome History\n",
"\n",
"Aswin van Woudenberg (https://www.aswinvanwoudenberg.com | https://github.com/afvanwoudenberg)\n",
"\n",
"## Importing libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fb58066",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sqlite3\n",
"import time\n",
"import datetime\n",
"import tzlocal\n",
"import math\n",
"import calendar\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import ipywidgets as widgets\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.ticker import FixedLocator, MaxNLocator\n",
"%matplotlib inline\n",
"\n",
"from IPython.display import display, HTML\n",
"from urllib.parse import urlparse\n",
"from pathlib import Path\n",
"from wordcloud import WordCloud\n",
"from ipywidgets import interact"
]
},
{
"cell_type": "markdown",
"id": "0c47254c",
"metadata": {},
"source": [
"## Reading database tables"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "039003c7",
"metadata": {},
"outputs": [],
"source": [
"db_path = os.path.expanduser(\"~/.config/google-chrome/Default/History\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a287548",
"metadata": {},
"outputs": [],
"source": [
"con = sqlite3.connect(f\"file:{db_path}?immutable=1\", uri=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17548369",
"metadata": {},
"outputs": [],
"source": [
"(pd.read_sql_query(\"SELECT name FROM sqlite_master WHERE type='table';\", con)\n",
" .sort_values('name')\n",
" .style.hide(axis='index'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4fbbe10",
"metadata": {},
"outputs": [],
"source": [
"downloads = pd.read_sql_query(\"SELECT * FROM downloads;\", con)\n",
"keyword_search_terms = pd.read_sql_query(\"SELECT * FROM keyword_search_terms;\", con)\n",
"urls = pd.read_sql_query(\"SELECT * FROM urls;\", con)\n",
"visits = pd.read_sql_query(\"SELECT * FROM visits;\", con)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09bc016e",
"metadata": {},
"outputs": [],
"source": [
"con.close()"
]
},
{
"cell_type": "markdown",
"id": "6b5aa509-a2ec-40e7-b642-ec29ecc56648",
"metadata": {},
"source": [
"## Common search terms"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "818386fc",
"metadata": {},
"outputs": [],
"source": [
"keyword_search_terms_urls = pd.merge(left=keyword_search_terms, right=urls, left_on='url_id', right_on='id')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5ddf3cb",
"metadata": {},
"outputs": [],
"source": [
"keyword_search_terms_urls['netloc'] = keyword_search_terms_urls.url.map(lambda r: urlparse(r).netloc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "090ff28e",
"metadata": {},
"outputs": [],
"source": [
"@interact(netloc=widgets.Dropdown(options=keyword_search_terms_urls.netloc.unique(), description='Website:'))\n",
"def search_wordcloud(netloc):\n",
" # Join all keywords into one lengthy string\n",
" keywords = ' '.join(keyword_search_terms_urls.query('netloc == @netloc').normalized_term)\n",
" width, height = 1200, 600 # Size of the word cloud\n",
" \n",
" px = 1/plt.rcParams['figure.dpi'] # pixel in inches \n",
" fig, ax = plt.subplots(figsize=(width*px, height*px), subplot_kw={'xticks': [], 'yticks': []})\n",
"\n",
" wordcloud = WordCloud(background_color=\"white\", width=width, height=height).generate_from_text(keywords)\n",
" ax.set_title(netloc)\n",
" ax.imshow(wordcloud, interpolation='bilinear', aspect='auto')\n",
" # plt.show()"
]
},
{
"cell_type": "markdown",
"id": "6a1d33f8-1b31-47f9-bd7f-78812844da03",
"metadata": {},
"source": [
"## Most visited websites\n",
"\n",
"### Most visited websites by time spent"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a09a55f-11f2-4ea5-8cf4-60ad1eb7da77",
"metadata": {},
"outputs": [],
"source": [
"visits_urls = pd.merge(left=visits, right=urls, left_on='url', right_on='id')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a97a030-7310-4c82-887c-fadd5f0f9b82",
"metadata": {},
"outputs": [],
"source": [
"visits_urls = visits_urls.drop(labels='url_x', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3ae92ef-fa34-4e13-a4cb-9165ddd8c9bf",
"metadata": {},
"outputs": [],
"source": [
"visits_urls = visits_urls.rename(columns={'id_x': 'visit_id', 'id_y': 'url_id', 'url_y': 'url'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93ad99d4-589a-4a1a-81a5-1204c0bbd1b5",
"metadata": {},
"outputs": [],
"source": [
"visits_urls['netloc'] = visits_urls.url.map(lambda r: urlparse(r).netloc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2e206c",
"metadata": {},
"outputs": [],
"source": [
"def date_from_webkit(webkit_timestamp):\n",
" epoch_start = datetime.datetime(1601,1,1)\n",
" delta = datetime.timedelta(microseconds=int(webkit_timestamp))\n",
" return epoch_start + delta"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3c0351d-219f-4d5e-a7c4-7d2542af6dae",
"metadata": {},
"outputs": [],
"source": [
"visits_urls['visit_time_dt'] = pd.to_datetime(visits_urls.visit_time.map(date_from_webkit), utc=True).dt.tz_convert(tzlocal.get_localzone().key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebb8a19f-f4bd-4f42-8a39-094f4e730567",
"metadata": {},
"outputs": [],
"source": [
"start_date = visits_urls['visit_time_dt'].min().date()\n",
"end_date = visits_urls['visit_time_dt'].max().date()\n",
"\n",
"dates = pd.date_range(start_date, end_date, freq='D')\n",
"\n",
"date_options = [(date.strftime('%d %b %Y'), date) for date in dates]\n",
"index = (0, len(date_options)-1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a3e7c40-35d2-406c-9701-641b2ea6cd25",
"metadata": {},
"outputs": [],
"source": [
"@interact(date_range=widgets.SelectionRangeSlider(options=date_options, index=index, description='Date range:', orientation='horizontal', layout={'width': '500px'}))\n",
"def plot_time_spent(date_range):\n",
" fig, ax = plt.subplots(figsize=(10,8))\n",
"\n",
" tz = tzlocal.get_localzone().key\n",
" start_dt = pd.to_datetime(date_range[0]).tz_localize(tz)\n",
" end_dt = pd.to_datetime(date_range[1]).tz_localize(tz) + pd.Timedelta(days=1)\n",
" \n",
" top_visits_urls = visits_urls.query(\"visit_time_dt >= @start_dt and visit_time_dt < @end_dt\").groupby('netloc').visit_duration.sum()\n",
" top_visits_urls.sort_values().tail(20).plot(kind='barh', ax=ax, ylabel='', title=\"Top 20 websites most time spent on\")\n",
" \n",
" td = pd.to_timedelta(top_visits_urls.max(), unit='us')\n",
" for unit, n in td.components._asdict().items():\n",
" if n > 0:\n",
" break\n",
" ticks_loc = [int(pd.to_timedelta(x, unit=unit) / pd.to_timedelta(1, unit='us')) for x in range(0, n+2, math.ceil((n+2)/5))]\n",
" \n",
" ax.xaxis.set_major_locator(FixedLocator(ticks_loc))\n",
" ax.set_xticklabels([\"{} {}\".format(pd.to_timedelta(x, unit='us').components._asdict()[unit], unit) for x in ticks_loc])\n",
" \n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "8e19773e-a49f-4807-ab76-bd736477c456",
"metadata": {},
"source": [
"### Most visited websites by frequency"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c6aeac8-3809-4474-98fb-cae7617972bc",
"metadata": {},
"outputs": [],
"source": [
"@interact(date_range=widgets.SelectionRangeSlider(options=date_options, index=index, description='Date range:', orientation='horizontal', layout={'width': '500px'}))\n",
"def plot_frequency(date_range):\n",
" fig, ax = plt.subplots(figsize=(10,8))\n",
"\n",
" tz = tzlocal.get_localzone().key\n",
" start_dt = pd.to_datetime(date_range[0]).tz_localize(tz)\n",
" end_dt = pd.to_datetime(date_range[1]).tz_localize(tz) + pd.Timedelta(days=1)\n",
" \n",
" (visits_urls.query(\"visit_time_dt >= @start_dt and visit_time_dt < @end_dt\")\n",
" .groupby('netloc').size().sort_values().tail(20)\n",
" .plot(kind='barh', ax=ax, ylabel='', title=\"Top 20 most frequently visited websites\"))\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"id": "388953db-6922-4771-b77e-7662a1b68446",
"metadata": {},
"source": [
"## YouTube videos"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc0d8234-11c1-40e5-a902-625d04a1493a",
"metadata": {},
"outputs": [],
"source": [
"def make_clickable(url, name):\n",
" return '<a href=\"{}\" rel=\"noopener noreferrer\" target=\"_blank\">{}</a>'.format(url, name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4a7b85e-d40f-4cb0-830a-492c0d0c28e3",
"metadata": {},
"outputs": [],
"source": [
"@interact(date=widgets.DatePicker(description='Date: ', disabled=False, value=end_date), \n",
" content_type=widgets.Dropdown(\n",
" options=[(\"Videos\", \"youtube.com/watch\"), \n",
" (\"Shorts\", \"youtube.com/shorts\")], \n",
" value=\"youtube.com/watch\", description='Type:'))\n",
"def show_watched_videos(date, content_type): \n",
" tz = tzlocal.get_localzone().key\n",
" start_dt = pd.to_datetime(date).tz_localize(tz)\n",
" end_dt = start_dt + pd.Timedelta(days=1)\n",
" \n",
" df = visits_urls.query(\"url.str.contains(@content_type) and visit_time_dt >= @start_dt and visit_time_dt < @end_dt\").copy()\n",
" df['YouTube video'] = df.apply(lambda x: make_clickable(x['url'], x['title']), axis=1)\n",
" if df.size:\n",
" display(df[['YouTube video']].style.hide(axis='index'))\n",
" else:\n",
" print(\"No YouTube URLs found for this type of content.\")"
]
},
{
"cell_type": "markdown",
"id": "575fdd50-1289-49c5-8982-5ede091fa8be",
"metadata": {},
"source": [
"## When am I most active?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5826046-8455-4c74-b9d9-4fdfec6fe4e2",
"metadata": {},
"outputs": [],
"source": [
"dates = pd.date_range(start_date, end_date, freq='W-MON')\n",
"options = [(\"{} - {}\".format(date.strftime('%d %b %Y'), (date + pd.Timedelta(days=7)).strftime('%d %b %Y')), (date, date + pd.Timedelta(days=7))) for date in dates]\n",
"options.insert(0, ('All weeks', (pd.to_datetime(start_date), pd.to_datetime(end_date))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97bc4319-689d-4bf4-862a-2f0b418830a8",
"metadata": {},
"outputs": [],
"source": [
"@interact(date_range=widgets.Dropdown(options=options, description='Week:'))\n",
"def plot_activity(date_range):\n",
" tz = tzlocal.get_localzone().key\n",
" start_dt = pd.to_datetime(date_range[0]).tz_localize(tz)\n",
" end_dt = pd.to_datetime(date_range[1]).tz_localize(tz) + pd.Timedelta(days=1)\n",
" \n",
" df = (visits_urls.query(\"visit_time_dt >= @start_dt and visit_time_dt < @end_dt\")\n",
" .groupby([visits_urls.visit_time_dt.dt.day_of_week, visits_urls.visit_time_dt.dt.hour])\n",
" .size().unstack().reset_index(drop=True)\n",
" .reindex(list(range(0,7)), fill_value=0)\n",
" .reindex(list(range(0,24)), axis='columns', fill_value=0)\n",
" )\n",
" vmax = df.max(axis=None)\n",
" df.columns.name = None\n",
" display(df.rename(mapper=lambda x: list(calendar.day_name)[x])\n",
" .rename(columns=lambda y: str(y if y else 12) + \" am\" if y < 12 else str(y if y==12 else y-12) + \" pm\")\n",
" .style.format(na_rep=0, precision=0).background_gradient(cmap='Blues', vmin=0, vmax=vmax)\n",
" .applymap(lambda x: 'background-color: white; color: black;' if pd.isnull(x) or x==0 else '')\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "b39789e6",
"metadata": {},
"source": [
"## Downloads"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20b4e683",
"metadata": {},
"outputs": [],
"source": [
"downloads['start_time_dt'] = pd.to_datetime(downloads.start_time.map(date_from_webkit), utc=True).dt.tz_convert(tzlocal.get_localzone().key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff2e295a",
"metadata": {},
"outputs": [],
"source": [
"downloads.loc[downloads.end_time > 0, \"end_time_dt\"] = (pd.to_datetime(downloads.loc[downloads.end_time > 0, \"end_time\"]\n",
" .map(date_from_webkit), utc=True).dt.tz_convert(tzlocal.get_localzone().key)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1046242c",
"metadata": {},
"outputs": [],
"source": [
"downloads.loc[downloads.last_access_time > 0, \"last_access_time_dt\"] = (pd.to_datetime(downloads.loc[downloads.last_access_time > 0, \"last_access_time\"]\n",
" .map(date_from_webkit), utc=True).dt.tz_convert(tzlocal.get_localzone().key)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "dc1f563b-a3d8-48bb-80f0-0cc0f82f6cce",
"metadata": {},
"source": [
"### Downloads per month"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96634c88",
"metadata": {},
"outputs": [],
"source": [
"xticks = [\"{}, {}\".format(y, m) for y, _, m in downloads.groupby([downloads.start_time_dt.dt.year, downloads.start_time_dt.dt.month, downloads.start_time_dt.dt.month_name()]).size().index]\n",
"\n",
"ax = downloads.groupby([downloads.start_time_dt.dt.year, downloads.start_time_dt.dt.month]).received_bytes.sum().plot(kind='bar', xlabel='', title=\"Downloads in GB by month\")\n",
"ax.set_xticklabels(xticks)\n",
"\n",
"ticks_loc = ax.get_yticks().tolist()\n",
"ax.yaxis.set_major_locator(FixedLocator(ticks_loc))\n",
"ax.set_yticklabels(['{:,.0f}GB'.format(x/1e9) for x in ticks_loc])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "1d90eec4-a463-4536-80f7-366fc8b82910",
"metadata": {},
"source": [
"### Downloads by file type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa8d0ed2-9cc5-4f08-bdf8-4efe2b200d68",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(10,8))\n",
"(downloads.target_path.apply(lambda p: Path(p).suffix.lower())\n",
" .replace('', np.nan).dropna()\n",
" .value_counts().sort_values()\n",
" .plot(kind='barh', ax=ax, ylabel='', title=\"Most downloaded file types\"))\n",
"ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "f8e98c58-9d86-4931-8eee-f7586088ee42",
"metadata": {},
"source": [
"### Where I download most files from"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7754f6e-7b37-4135-85d2-17c8b607132d",
"metadata": {},
"outputs": [],
"source": [
"downloads['netloc'] = downloads.tab_url.map(lambda r: urlparse(r).netloc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8065ccda-cb3a-4574-adbf-3e1e2e0e8747",
"metadata": {},
"outputs": [],
"source": [
"(downloads.query('netloc.str.len() > 0')\n",
" .groupby('netloc').size().sort_values(ascending=False)\n",
" .to_frame().head(10).reset_index().rename({0: 'count'}, axis=1)\n",
" .style.hide(axis='index')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4577b121-69a8-4c9d-b19e-0dbd9f6cb52e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment