Skip to content

Instantly share code, notes, and snippets.

@pshapiro
Created January 29, 2019 21:07
Show Gist options
  • Save pshapiro/a86dc340f57c38fc22d0545ddec1fc9e to your computer and use it in GitHub Desktop.
Save pshapiro/a86dc340f57c38fc22d0545ddec1fc9e to your computer and use it in GitHub Desktop.
Jupyter Notebook that takes the outlinks from a Screaming Frog crawl as input, grabs PA & DA from the Moz API, and uses the WHOIS API to determine domain availability.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Expired Domain Finder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Change the `client` variable to include your Moz API *Access ID* and *Secret Key*. You'll need access to the Moz API."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from mozscape import Mozscape\n",
"import pandas as pd\n",
"import numpy as np\n",
"import requests\n",
"import json\n",
"import time\n",
"\n",
"def divide_chunks(l, n): \n",
" for i in range(0, len(l), n): \n",
" yield l[i:i + n] \n",
" \n",
"client = Mozscape('my_access_id', 'my_secret_key')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `csv` variable is equal to a an *All Outlinks* report from [Screaming Frog](https://www.screamingfrog.co.uk/seo-spider/)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"csv = pd.read_csv('./all_outlinks.csv', skiprows=1)\n",
"\n",
"links = csv[csv['Type'] == 'AHREF']\n",
"links = csv[~csv['Destination'].str.match('https?://boardgamegeek.com/.*|https?://rpggeek.com/.*|https?://boardgamegeekstore.com/.*|https?://.*.\\.geekdo-.*.com/.*|https?://videogamegeek.com/.*|https?://.*\\.amazon-.*.com.*')]\n",
"\n",
"Domains = links['Destination'].replace(to_replace=\"(.*://)?([^/?]+).*\", value=r\"\\1\\2\", regex=True)\n",
"\n",
"x = list(divide_chunks(Domains.unique().tolist(), 5)) \n",
"\n",
"df = pd.DataFrame(columns=['pda','upa','url','status'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`headers` is set up spoof the Googlebot user agent to avoid the servers from blocking the status code checks. It is sleeping for 5 seconds for every 5 domains checked with the Moz API."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'upa': 20, 'pda': 13, 'url': 'http://www.qmlogistics.com', 'status': 200}\n",
"{'upa': 100, 'pda': 100, 'url': 'https://www.youtube.com', 'status': 200}\n",
"{'upa': 37, 'pda': 73, 'url': 'https://moedaseco.lojaintegrada.com.br', 'status': 200}\n",
"{'upa': 22, 'pda': 19, 'url': 'https://www.eggertspiele.com', 'status': 200}\n",
"{'upa': 80, 'pda': 94, 'url': 'https://www.amazon.co.uk', 'status': 200}\n",
"{'upa': 30, 'pda': 23, 'url': 'https://boardgameprices.co.uk', 'status': 200}\n",
"{'upa': 22, 'pda': 22, 'url': 'http://firestormcards.co.uk', 'status': 200}\n",
"{'upa': 65, 'pda': 83, 'url': 'http://www.boardgamegeek.com', 'status': 200}\n",
"{'upa': 56, 'pda': 68, 'url': 'https://challonge.com', 'status': 403}\n",
"{'upa': 31, 'pda': 28, 'url': 'https://www.gamenerdz.com', 'status': 200}\n",
"{'upa': 40, 'pda': 36, 'url': 'https://www.thebrokentoken.com', 'status': 200}\n",
"{'upa': 50, 'pda': 49, 'url': 'https://www.plaidhatgames.com', 'status': 200}\n",
"{'upa': 1, 'pda': 0, 'url': 'http://www.moedaseco.com.br', 'status': 200}\n",
"{'upa': 41, 'pda': 43, 'url': 'https://www.maydaygames.com', 'status': 200}\n",
"{'upa': 37, 'pda': 35, 'url': 'http://www.summoner.nl', 'status': 200}\n",
"{'upa': 66, 'pda': 94, 'url': 'https://cdn.shopify.com', 'status': 403}\n",
"{'upa': 59, 'pda': 71, 'url': 'https://www.fantasyflightgames.com', 'status': 403}\n",
"{'upa': 59, 'pda': 92, 'url': 'https://media.giphy.com', 'status': 403}\n",
"{'upa': 63, 'pda': 76, 'url': 'https://memegenerator.net', 'status': 200}\n",
"{'upa': 32, 'pda': 30, 'url': 'https://www.planbgames.com', 'status': 200}\n",
"{'upa': 42, 'pda': 37, 'url': 'https://strongholdgames.com', 'status': 200}\n",
"{'upa': 52, 'pda': 58, 'url': 'https://www.yourlogicalfallacyis.com', 'status': 200}\n",
"{'upa': 36, 'pda': 33, 'url': 'http://www.bordspelmania.eu', 'status': 200}\n",
"{'upa': 30, 'pda': 28, 'url': 'http://bordspeler.nl', 'status': 200}\n",
"{'upa': 100, 'pda': 100, 'url': 'https://twitter.com', 'status': 200}\n",
"{'upa': 88, 'pda': 97, 'url': 'https://en.wikipedia.org', 'status': 200}\n",
"{'upa': 52, 'pda': 54, 'url': 'http://www.coolstuffinc.com', 'status': 200}\n",
"{'upa': 65, 'pda': 92, 'url': 'https://i.ytimg.com', 'status': 404}\n",
"{'upa': 91, 'pda': 97, 'url': 'https://www.amazon.com', 'status': 200}\n",
"{'upa': 71, 'pda': 91, 'url': 'https://www.amazon.ca', 'status': 200}\n",
"{'upa': 27, 'pda': 26, 'url': 'http://www.apttogame.com', 'status': 200}\n",
"{'upa': 40, 'pda': 34, 'url': 'http://www.eggertspiele.de', 'status': 200}\n",
"{'upa': 65, 'pda': 93, 'url': 'https://s-media-cache-ak0.pinimg.com', 'status': 403}\n",
"{'upa': 51, 'pda': 70, 'url': 'https://tshaonline.org', 'status': 200}\n",
"{'upa': 85, 'pda': 95, 'url': 'https://www.etsy.com', 'status': 200}\n",
"{'upa': 24, 'pda': 20, 'url': 'https://boardgameinnovation.com', 'status': 200}\n",
"{'upa': 37, 'pda': 36, 'url': 'http://www.boardgamebliss.com', 'status': 200}\n",
"{'upa': 37, 'pda': 42, 'url': 'http://frpgames.com', 'status': 200}\n",
"{'upa': 50, 'pda': 52, 'url': 'http://www.philibertnet.com', 'status': 200}\n",
"{'upa': 39, 'pda': 34, 'url': 'http://www.thirstymeeples.co.uk', 'status': 200}\n",
"{'upa': 54, 'pda': 57, 'url': 'http://www.artscow.com', 'status': 200}\n",
"{'upa': 81, 'pda': 97, 'url': 'https://itunes.apple.com', 'status': 200}\n",
"{'upa': 6, 'pda': 7, 'url': 'http://boardgames.bplaced.net', 'status': 200}\n",
"{'upa': 51, 'pda': 95, 'url': 'https://opinionatedgamers.files.wordpress.com', 'status': 200}\n",
"{'upa': 7, 'pda': 9, 'url': 'http://eggertspiele.bplaced.net', 'status': 403}\n",
"{'upa': 37, 'pda': 37, 'url': 'http://www.strongholdgames.com', 'status': 200}\n",
"{'upa': 62, 'pda': 93, 'url': 'https://i.pinimg.com', 'status': 403}\n",
"{'upa': 26, 'pda': 20, 'url': 'http://www.athenagames.com', 'status': 200}\n",
"{'upa': 28, 'pda': 23, 'url': 'http://boardgamesinsider.com', 'status': 200}\n",
"{'upa': 37, 'pda': 33, 'url': 'http://store.401games.ca', 'status': 200}\n",
"{'upa': 41, 'pda': 46, 'url': 'http://www.boardgamequest.com', 'status': 200}\n",
"{'upa': 33, 'pda': 35, 'url': 'http://brettspielbox.de', 'status': 200}\n",
"{'upa': 25, 'pda': 26, 'url': 'http://www.brettspiel-news.de', 'status': 200}\n",
"{'upa': 68, 'pda': 92, 'url': 'https://pbs.twimg.com', 'status': 400}\n",
"{'upa': 25, 'pda': 36, 'url': 'https://www.cpforbes.net', 'status': 403}\n",
"{'upa': 85, 'pda': 97, 'url': 'http://goo.gl', 'status': 200}\n",
"{'upa': 65, 'pda': 83, 'url': 'https://www.boardgamegeek.com', 'status': 200}\n",
"{'upa': 22, 'pda': 17, 'url': 'http://www.argfx.at', 'status': 200}\n",
"{'upa': 43, 'pda': 42, 'url': 'https://www.blend4web.com', 'status': 200}\n",
"{'upa': 37, 'pda': 32, 'url': 'http://www.plato-magazine.com', 'status': 200}\n",
"{'upa': 35, 'pda': 37, 'url': 'http://www.vindjeu.eu', 'status': 200}\n",
"{'upa': 52, 'pda': 54, 'url': 'https://www.coolstuffinc.com', 'status': 200}\n",
"{'upa': 41, 'pda': 37, 'url': 'http://www.cardhaus.com', 'status': 200}\n",
"{'upa': 53, 'pda': 56, 'url': 'http://www.webhallen.com', 'status': 200}\n",
"{'upa': 48, 'pda': 53, 'url': 'http://www.cowcow.com', 'status': 200}\n",
"{'upa': 67, 'pda': 80, 'url': 'https://www.rotary.org', 'status': 200}\n",
"{'upa': 15, 'pda': 12, 'url': 'http://controlledareagaming.com', 'status': 200}\n",
"{'upa': 75, 'pda': 92, 'url': 'https://www.twitch.tv', 'status': 200}\n",
"{'upa': 78, 'pda': 93, 'url': 'https://www.amazon.de', 'status': 200}\n",
"{'upa': 68, 'pda': 83, 'url': 'http://www.thingiverse.com', 'status': 200}\n",
"{'upa': 23, 'pda': 20, 'url': 'http://www.boardgameinnovation.com', 'status': 200}\n",
"{'upa': 67, 'pda': 95, 'url': 'https://m.imgur.com', 'status': 200}\n",
"{'upa': 82, 'pda': 96, 'url': 'https://play.google.com', 'status': 200}\n",
"{'upa': 1, 'pda': 0, 'url': 'http://concordiascore.azurewebsites.net', 'status': -1}\n",
"{'upa': 48, 'pda': 95, 'url': 'https://thevirginiantv.files.wordpress.com', 'status': 200}\n",
"{'upa': 51, 'pda': 50, 'url': 'http://www.miniaturemarket.com', 'status': 200}\n",
"{'upa': 31, 'pda': 28, 'url': 'http://www.greatboardgames.ca', 'status': 200}\n",
"{'upa': 89, 'pda': 98, 'url': 'https://www.reddit.com', 'status': 429}\n",
"{'upa': 51, 'pda': 52, 'url': 'http://www.pegasus.de', 'status': 200}\n",
"{'upa': 30, 'pda': 25, 'url': 'https://www.topshelfgamer.com', 'status': 200}\n",
"{'upa': 19, 'pda': 16, 'url': 'http://fatcatgaming.co.uk', 'status': 200}\n",
"{'upa': 40, 'pda': 36, 'url': 'http://www.thebrokentoken.com', 'status': 200}\n",
"{'upa': 41, 'pda': 38, 'url': 'http://www.meeplesource.com', 'status': 200}\n",
"{'upa': 79, 'pda': 93, 'url': 'https://www.kickstarter.com', 'status': 200}\n",
"{'upa': 43, 'pda': 39, 'url': 'http://www.eaglegames.net', 'status': 200}\n",
"{'upa': 81, 'pda': 96, 'url': 'https://youtu.be', 'status': 200}\n",
"{'upa': 37, 'pda': 36, 'url': 'https://www.boardgamebliss.com', 'status': 200}\n",
"{'upa': 45, 'pda': 44, 'url': 'http://1856.com', 'status': 200}\n",
"{'upa': 23, 'pda': 18, 'url': 'http://www.unhalfbricking.com', 'status': -1}\n",
"{'upa': 27, 'pda': 18, 'url': 'http://www.boardgamesearch.com.au', 'status': 200}\n",
"{'upa': 46, 'pda': 74, 'url': 'https://m.media-amazon.com', 'status': 400}\n",
"{'upa': 58, 'pda': 91, 'url': 'https://images-na.ssl-images-amazon.com', 'status': 400}\n",
"{'upa': 30, 'pda': 34, 'url': 'http://eggertspiele.de', 'status': 200}\n",
"{'upa': 82, 'pda': 95, 'url': 'https://imgur.com', 'status': 200}\n"
]
}
],
"source": [
"headers = {'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}\n",
"\n",
"for vals in x:\n",
" da_pa = client.urlMetrics(vals, Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority)\n",
" i = 0\n",
" for y in da_pa:\n",
" y['url'] = vals[i]\n",
" try:\n",
" r = requests.get(vals[i], headers=headers)\n",
" y['status'] = r.status_code\n",
" except requests.exceptions.ConnectionError:\n",
" y['status'] = -1\n",
" i = i+1\n",
" df = df.append(y, ignore_index=True)\n",
" print(y) \n",
" time.sleep(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking at every domain that shows a status code >= 400 with the `status_code_threshold` variable."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"status_code_threshold = 400\n",
"da_threshold = 25\n",
"error_urls = df[(df['status'] >= status_code_threshold) & (df['pda'] >= da_threshold)]['url'].tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Change the `whois_api_key` variable to be equal to the [Whois XML API](https://main.whoisxmlapi.com/) API key. 500 credits are available for free."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"amazon.co.uk status: UNAVAILABLE\n",
"shopify.com status: UNAVAILABLE\n",
"giphy.com status: UNAVAILABLE\n",
"coolstuffinc.com status: UNAVAILABLE\n",
"ytimg.com status: UNAVAILABLE\n",
"amazon.com status: UNAVAILABLE\n",
"pinimg.com status: AVAILABLE\n",
"pinimg.com status: AVAILABLE\n",
"twimg.com status: UNAVAILABLE\n",
"coolstuffinc.com status: UNAVAILABLE\n",
"challonge.com status: AVAILABLE\n",
"shopify.com status: UNAVAILABLE\n",
"fantasyflightgames.com status: AVAILABLE\n",
"giphy.com status: UNAVAILABLE\n",
"ytimg.com status: UNAVAILABLE\n",
"pinimg.com status: AVAILABLE\n",
"pinimg.com status: UNAVAILABLE\n",
"twimg.com status: UNAVAILABLE\n",
"cpforbes.net status: UNAVAILABLE\n",
"reddit.com status: UNAVAILABLE\n",
"media-amazon.com status: UNAVAILABLE\n",
"ssl-images-amazon.com status: UNAVAILABLE\n"
]
}
],
"source": [
"whois_api_key = \"your_key\"\n",
"\n",
"for x in error_urls:\n",
" dnsapi = \"https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey=\" + whois_api_key + \"&outputFormat=JSON&cmd=GET_DN_AVAILABILITY&domainName=\" + x\n",
" r = requests.get(dnsapi) \n",
" parsed_json = json.loads(r.text)\n",
" print(parsed_json['DomainInfo']['domainName'] + \" status: \" + parsed_json['DomainInfo']['domainAvailability'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment