{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cannibalization\n",
"<br>\n",
"This notebook identifies possible <strong>keyword cannibalization issues</strong> using Google Search Console data. <br> \n",
"<br>\n",
" @author: Natzir Turrado: Technical SEO / Data Scientist. <a href=\"https://twitter.com/natzir9\">Twitter > @natzir9</a>\n",
"<img src='https://www.analistaseo.es/wp-content/uploads/2019/11/keyword-cannibalization.png' alt='keyword-cannibalization'>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "VvfcpByztqkf"
},
"source": [
"Importing libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DljFdN6rtcyu"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import datetime\n",
"import qgrid\n",
"from collections import defaultdict\n",
"from dateutil import relativedelta\n",
"import httplib2\n",
"from apiclient import errors\n",
"from apiclient.discovery import build\n",
"from oauth2client.client import OAuth2WebServerFlow\n",
"import requests\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "JAVsB-I2t_AZ"
},
"source": [
"Insert here your Google **CLIENT_ID**, **CLIENT_SECRET** & your Search Console **SITE PROPERTY**. \n",
"<ul><li><a href=\"https://console.developers.google.com/flows/enableapi?apiid=webmasters&credential=client_key\">Create your API credentials</a></li></ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "OCLNELb0tvrb"
},
"outputs": [],
"source": [
"CLIENT_ID = ''\n",
"CLIENT_SECRET = ''\n",
"site = ''"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YEI-GX8Iu00E"
},
"source": [
"Insert here the **date range** (last 3 months of Search Console data by default)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "inb3cm4Bu1N4"
},
"outputs": [],
"source": [
"end_date = datetime.date.today()\n",
"start_date = end_date - relativedelta.relativedelta(months=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "8H3oHaYgv6FC"
},
"source": [
"Google Search Console API call. \n",
"<ul><li><a href=\"https://developers.google.com/webmaster-tools/search-console-api-original/v3/quickstart/quickstart-python\">Quickstart: Run a Search Console App in Python</a></li>\n",
"<li><a href=\"https://developers.google.com/apis-explorer/#p/webmasters/v3/\">Search Console API Explorer</a></li>\n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"OAUTH_SCOPE = 'https://www.googleapis.com/auth/webmasters.readonly'\n",
"REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'\n",
"\n",
"# Run through the OAuth flow and retrieve credentials\n",
"flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, OAUTH_SCOPE, redirect_uri=REDIRECT_URI)\n",
"authorize_url = flow.step1_get_authorize_url()\n",
"print('Go to the following link in your browser: ' + authorize_url)\n",
"code = input('Enter verification code: ').strip()\n",
"credentials = flow.step2_exchange(code)\n",
"\n",
"# Create an httplib2.Http object and authorize it with our credentials\n",
"http = httplib2.Http()\n",
"http = credentials.authorize(http)\n",
"\n",
"webmasters_service = build('webmasters', 'v3', http=http)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 73
},
"colab_type": "code",
"id": "OlSOcxRsvqFZ",
"outputId": "9dff2389-3d69-499b-f4b2-abcfb70a82b8"
},
"outputs": [],
"source": [
"def execute_request(service, property_uri, request):\n",
" return service.searchanalytics().query(siteUrl=property_uri, body=request).execute()\n",
"\n",
"request = {\n",
" 'startDate': start_date.strftime('%Y-%m-%d'),\n",
" 'endDate': end_date.strftime('%Y-%m-%d'),\n",
" 'dimensions': ['page', 'query'],\n",
" 'rowLimit': 25000 # the API maximum of 25,000 rows per request\n",
"}\n",
"\n",
"# Optionally filter the request by device\n",
"device_category = input('Enter device category: MOBILE, DESKTOP or TABLET (leave blank for all devices): ').strip()\n",
"if device_category:\n",
" request['dimensionFilterGroups'] = [{'filters': [{'dimension': 'device', 'expression': device_category}]}]\n",
"\n",
"# Query the Search Console API\n",
"response = execute_request(webmasters_service, site, request)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nWGKp7SIwTIN"
},
"source": [
"<strong>Parsing the JSON returned</strong>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SE-VkmsfwXMA"
},
"outputs": [],
"source": [
"scDict = defaultdict(list)\n",
"\n",
"for row in response.get('rows', []):\n",
" scDict['page'].append(row['keys'][0])\n",
" scDict['query'].append(row['keys'][1])\n",
" scDict['clicks'].append(row.get('clicks', 0))\n",
" scDict['ctr'].append(row.get('ctr', 0))\n",
" scDict['impressions'].append(row.get('impressions', 0))\n",
" scDict['position'].append(row.get('position', 0))"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "i1iD74RjwhMS"
},
"source": [
"<strong>DataFrame of Search Console data</strong>\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
},
"colab_type": "code",
"id": "q8JlB1bWwhRG",
"outputId": "2cc1cc78-41fa-47c9-b4a8-01e339cf7354"
},
"outputs": [],
"source": [
"df = pd.DataFrame(data = scDict)\n",
"\n",
"df['clicks'] = df['clicks'].astype('int')\n",
"df['ctr'] = df['ctr']*100\n",
"df['impressions'] = df['impressions'].astype('int')\n",
"df['position'] = df['position'].round(2)\n",
"df.sort_values('clicks',inplace=True,ascending=False)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<strong>Cleaning the DataFrame and sorting it by query</strong>\n",
"<ul>\n",
" <li>Excludes first-page results. According to MozCast, the average first page has only 8 listings, so we only look for URLs ranking beyond that position (this behaviour can be changed via 'SERP_results').</li>\n",
" <li>Excludes branded queries (set yours in 'branded_queries')</li>\n",
" <li>Excludes unique queries</li>\n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SERP_results = 8 # insert here your preferred number of first-page SERP listings\n",
"branded_queries = 'brand|vrand|b rand...' # insert here your branded queries as a regex alternation\n",
"\n",
"df_canibalized = df[df['position'] > SERP_results] \n",
"df_canibalized = df_canibalized[~df_canibalized['query'].str.contains(branded_queries, regex=True)]\n",
"df_canibalized = df_canibalized[df_canibalized.duplicated(subset=['query'], keep=False)]\n",
"df_canibalized.set_index(['query'],inplace=True)\n",
"df_canibalized.sort_index(inplace=True)\n",
"df_canibalized.reset_index(inplace=True)\n",
"df_canibalized"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<strong>Scraping URLs and Adding Titles and Meta Descriptions to the DataFrame</strong>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_meta(url):\n",
" page = requests.get(url, timeout=10)\n",
" soup = BeautifulSoup(page.content, 'html.parser')\n",
" title = soup.title.get_text() if soup.title else ''\n",
" description_tag = soup.select_one('meta[name=\"description\"]')\n",
" meta = description_tag['content'] if description_tag else ''\n",
" return title, meta\n",
"\n",
"df_canibalized['title'],df_canibalized['meta'] = zip(*df_canibalized['page'].apply(get_meta))\n",
"df_canibalized"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<strong>Creating a dynamic grid to analyse the data</strong>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"grid = qgrid.show_grid(df_canibalized, show_toolbar=True)\n",
"grid"
]
}
],
"metadata": {
"colab": {
"name": "bubbles.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}