Created
January 28, 2019 22:08
-
-
Save voltek62/1f3eb995d443f1d515835797af821bd1 to your computer and use it in GitHub Desktop.
Jupyter Notebook that takes the outlinks from a Screaming Frog crawl as input, grabs PA & DA from the Moz API, and uses a WHOIS API to determine domain availability.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 148, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from mozscape import Mozscape\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import requests\n", | |
"import json\n", | |
"import time\n", | |
"\n", | |
"def divide_chunks(l, n): \n", | |
" for i in range(0, len(l), n): \n", | |
" yield l[i:i + n] \n", | |
"\n", | |
"client = Mozscape('my_access_id', 'my_secret_key')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"csv = pd.read_csv('./all_outlinks.csv', skiprows=1)\n", | |
"\n", | |
"links = csv[csv['Type'] == 'AHREF']\n", | |
"links = csv[~csv['Destination'].str.match('https?://boardgamegeek.com/.*|https?://rpggeek.com/.*|https?://boardgamegeekstore.com/.*|https?://.*.\\.geekdo-.*.com/.*|https?://videogamegeek.com/.*|https?://.*\\.amazon-.*.com.*')]\n", | |
"\n", | |
"Domains = links['Destination'].replace(to_replace=\"(.*://)?([^/?]+).*\", value=r\"\\1\\2\", regex=True)\n", | |
"\n", | |
"x = list(divide_chunks(Domains.unique().tolist(), 5)) \n", | |
"\n", | |
"df = pd.DataFrame(columns=['pda','upa','url','status'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'upa': 20, 'pda': 13, 'url': 'http://www.qmlogistics.com', 'status': 403}\n", | |
"{'upa': 100, 'pda': 100, 'url': 'https://www.youtube.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 73, 'url': 'https://moedaseco.lojaintegrada.com.br', 'status': 200}\n", | |
"{'upa': 22, 'pda': 19, 'url': 'https://www.eggertspiele.com', 'status': 503}\n", | |
"{'upa': 80, 'pda': 94, 'url': 'https://www.amazon.co.uk', 'status': 503}\n", | |
"{'upa': 29, 'pda': 23, 'url': 'https://boardgameprices.co.uk', 'status': 200}\n", | |
"{'upa': 22, 'pda': 22, 'url': 'http://firestormcards.co.uk', 'status': 200}\n", | |
"{'upa': 65, 'pda': 83, 'url': 'http://www.boardgamegeek.com', 'status': 200}\n", | |
"{'upa': 56, 'pda': 68, 'url': 'https://challonge.com', 'status': 200}\n", | |
"{'upa': 30, 'pda': 28, 'url': 'https://www.gamenerdz.com', 'status': -1}\n", | |
"{'upa': 40, 'pda': 36, 'url': 'https://www.thebrokentoken.com', 'status': 200}\n", | |
"{'upa': 50, 'pda': 49, 'url': 'https://www.plaidhatgames.com', 'status': 200}\n", | |
"{'upa': 1, 'pda': 0, 'url': 'http://www.moedaseco.com.br', 'status': 200}\n", | |
"{'upa': 41, 'pda': 42, 'url': 'https://www.maydaygames.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 35, 'url': 'http://www.summoner.nl', 'status': 200}\n", | |
"{'upa': 66, 'pda': 94, 'url': 'https://cdn.shopify.com', 'status': 403}\n", | |
"{'upa': 59, 'pda': 71, 'url': 'https://www.fantasyflightgames.com', 'status': 200}\n", | |
"{'upa': 59, 'pda': 92, 'url': 'https://media.giphy.com', 'status': 403}\n", | |
"{'upa': 63, 'pda': 76, 'url': 'https://memegenerator.net', 'status': 200}\n", | |
"{'upa': 32, 'pda': 30, 'url': 'https://www.planbgames.com', 'status': 200}\n", | |
"{'upa': 42, 'pda': 37, 'url': 'https://strongholdgames.com', 'status': 200}\n", | |
"{'upa': 52, 'pda': 58, 'url': 'https://www.yourlogicalfallacyis.com', 'status': 200}\n", | |
"{'upa': 36, 'pda': 33, 'url': 'http://www.bordspelmania.eu', 'status': 200}\n", | |
"{'upa': 29, 'pda': 28, 'url': 'http://bordspeler.nl', 'status': 200}\n", | |
"{'upa': 100, 'pda': 100, 'url': 'https://twitter.com', 'status': 200}\n", | |
"{'upa': 88, 'pda': 97, 'url': 'https://en.wikipedia.org', 'status': 200}\n", | |
"{'upa': 52, 'pda': 54, 'url': 'http://www.coolstuffinc.com', 'status': 403}\n", | |
"{'upa': 65, 'pda': 92, 'url': 'https://i.ytimg.com', 'status': 404}\n", | |
"{'upa': 91, 'pda': 97, 'url': 'https://www.amazon.com', 'status': 503}\n", | |
"{'upa': 71, 'pda': 91, 'url': 'https://www.amazon.ca', 'status': 200}\n", | |
"{'upa': 27, 'pda': 26, 'url': 'http://www.apttogame.com', 'status': 200}\n", | |
"{'upa': 40, 'pda': 34, 'url': 'http://www.eggertspiele.de', 'status': 200}\n", | |
"{'upa': 65, 'pda': 93, 'url': 'https://s-media-cache-ak0.pinimg.com', 'status': 403}\n", | |
"{'upa': 51, 'pda': 70, 'url': 'https://tshaonline.org', 'status': 200}\n", | |
"{'upa': 85, 'pda': 95, 'url': 'https://www.etsy.com', 'status': 200}\n", | |
"{'upa': 24, 'pda': 19, 'url': 'https://boardgameinnovation.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 36, 'url': 'http://www.boardgamebliss.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 42, 'url': 'http://frpgames.com', 'status': 200}\n", | |
"{'upa': 49, 'pda': 52, 'url': 'http://www.philibertnet.com', 'status': 200}\n", | |
"{'upa': 39, 'pda': 34, 'url': 'http://www.thirstymeeples.co.uk', 'status': 200}\n", | |
"{'upa': 53, 'pda': 57, 'url': 'http://www.artscow.com', 'status': 200}\n", | |
"{'upa': 81, 'pda': 97, 'url': 'https://itunes.apple.com', 'status': 200}\n", | |
"{'upa': 6, 'pda': 7, 'url': 'http://boardgames.bplaced.net', 'status': 200}\n", | |
"{'upa': 51, 'pda': 95, 'url': 'https://opinionatedgamers.files.wordpress.com', 'status': 200}\n", | |
"{'upa': 7, 'pda': 9, 'url': 'http://eggertspiele.bplaced.net', 'status': 403}\n", | |
"{'upa': 37, 'pda': 37, 'url': 'http://www.strongholdgames.com', 'status': 200}\n", | |
"{'upa': 62, 'pda': 93, 'url': 'https://i.pinimg.com', 'status': 403}\n", | |
"{'upa': 26, 'pda': 20, 'url': 'http://www.athenagames.com', 'status': 200}\n", | |
"{'upa': 28, 'pda': 23, 'url': 'http://boardgamesinsider.com', 'status': 200}\n", | |
"{'upa': 36, 'pda': 33, 'url': 'http://store.401games.ca', 'status': 200}\n", | |
"{'upa': 41, 'pda': 46, 'url': 'http://www.boardgamequest.com', 'status': 200}\n", | |
"{'upa': 32, 'pda': 34, 'url': 'http://brettspielbox.de', 'status': 200}\n", | |
"{'upa': 25, 'pda': 25, 'url': 'http://www.brettspiel-news.de', 'status': 200}\n", | |
"{'upa': 68, 'pda': 92, 'url': 'https://pbs.twimg.com', 'status': 400}\n", | |
"{'upa': 25, 'pda': 36, 'url': 'https://www.cpforbes.net', 'status': 200}\n", | |
"{'upa': 85, 'pda': 97, 'url': 'http://goo.gl', 'status': 200}\n", | |
"{'upa': 65, 'pda': 83, 'url': 'https://www.boardgamegeek.com', 'status': 200}\n", | |
"{'upa': 22, 'pda': 17, 'url': 'http://www.argfx.at', 'status': 200}\n", | |
"{'upa': 43, 'pda': 42, 'url': 'https://www.blend4web.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 32, 'url': 'http://www.plato-magazine.com', 'status': 200}\n", | |
"{'upa': 35, 'pda': 37, 'url': 'http://www.vindjeu.eu', 'status': 200}\n", | |
"{'upa': 52, 'pda': 54, 'url': 'https://www.coolstuffinc.com', 'status': 403}\n", | |
"{'upa': 41, 'pda': 37, 'url': 'http://www.cardhaus.com', 'status': 200}\n", | |
"{'upa': 53, 'pda': 56, 'url': 'http://www.webhallen.com', 'status': 200}\n", | |
"{'upa': 48, 'pda': 53, 'url': 'http://www.cowcow.com', 'status': 200}\n", | |
"{'upa': 67, 'pda': 80, 'url': 'https://www.rotary.org', 'status': 200}\n", | |
"{'upa': 15, 'pda': 12, 'url': 'http://controlledareagaming.com', 'status': 200}\n", | |
"{'upa': 75, 'pda': 92, 'url': 'https://www.twitch.tv', 'status': 200}\n", | |
"{'upa': 78, 'pda': 93, 'url': 'https://www.amazon.de', 'status': 503}\n", | |
"{'upa': 68, 'pda': 83, 'url': 'http://www.thingiverse.com', 'status': 200}\n", | |
"{'upa': 23, 'pda': 19, 'url': 'http://www.boardgameinnovation.com', 'status': 200}\n", | |
"{'upa': 67, 'pda': 95, 'url': 'https://m.imgur.com', 'status': 200}\n", | |
"{'upa': 82, 'pda': 96, 'url': 'https://play.google.com', 'status': 200}\n", | |
"{'upa': 1, 'pda': 0, 'url': 'http://concordiascore.azurewebsites.net', 'status': -1}\n", | |
"{'upa': 48, 'pda': 95, 'url': 'https://thevirginiantv.files.wordpress.com', 'status': 200}\n", | |
"{'upa': 51, 'pda': 50, 'url': 'http://www.miniaturemarket.com', 'status': 200}\n", | |
"{'upa': 31, 'pda': 28, 'url': 'http://www.greatboardgames.ca', 'status': 200}\n", | |
"{'upa': 89, 'pda': 98, 'url': 'https://www.reddit.com', 'status': 429}\n", | |
"{'upa': 51, 'pda': 52, 'url': 'http://www.pegasus.de', 'status': 200}\n", | |
"{'upa': 30, 'pda': 25, 'url': 'https://www.topshelfgamer.com', 'status': 200}\n", | |
"{'upa': 19, 'pda': 16, 'url': 'http://fatcatgaming.co.uk', 'status': 200}\n", | |
"{'upa': 40, 'pda': 36, 'url': 'http://www.thebrokentoken.com', 'status': 200}\n", | |
"{'upa': 41, 'pda': 38, 'url': 'http://www.meeplesource.com', 'status': 200}\n", | |
"{'upa': 79, 'pda': 93, 'url': 'https://www.kickstarter.com', 'status': 200}\n", | |
"{'upa': 43, 'pda': 39, 'url': 'http://www.eaglegames.net', 'status': 200}\n", | |
"{'upa': 81, 'pda': 96, 'url': 'https://youtu.be', 'status': 200}\n", | |
"{'upa': 37, 'pda': 36, 'url': 'https://www.boardgamebliss.com', 'status': 200}\n", | |
"{'upa': 45, 'pda': 44, 'url': 'http://1856.com', 'status': 403}\n", | |
"{'upa': 23, 'pda': 18, 'url': 'http://www.unhalfbricking.com', 'status': 200}\n", | |
"{'upa': 27, 'pda': 18, 'url': 'http://www.boardgamesearch.com.au', 'status': 200}\n", | |
"{'upa': 46, 'pda': 74, 'url': 'https://m.media-amazon.com', 'status': 400}\n", | |
"{'upa': 58, 'pda': 91, 'url': 'https://images-na.ssl-images-amazon.com', 'status': 400}\n", | |
"{'upa': 30, 'pda': 34, 'url': 'http://eggertspiele.de', 'status': 200}\n", | |
"{'upa': 82, 'pda': 95, 'url': 'https://imgur.com', 'status': 200}\n" | |
] | |
} | |
], | |
"source": [ | |
"for vals in x:\n", | |
" da_pa = client.urlMetrics(vals, Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority)\n", | |
" i = 0\n", | |
" for y in da_pa:\n", | |
" y['url'] = vals[i]\n", | |
" try:\n", | |
" r = requests.get(vals[i])\n", | |
" y['status'] = r.status_code\n", | |
" except requests.exceptions.ConnectionError:\n", | |
" y['status'] = -1\n", | |
" i = i+1\n", | |
" df = df.append(y, ignore_index=True)\n", | |
" print(y) \n", | |
" time.sleep(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 151, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"status_code_threshold = 400\n", | |
"da_threshold = 25\n", | |
"error_urls = df[(df['status'] >= status_code_threshold) & (df['pda'] >= da_threshold)]['url'].tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 152, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"amazon.co.uk status: UNAVAILABLE\n", | |
"shopify.com status: UNAVAILABLE\n", | |
"giphy.com status: UNAVAILABLE\n", | |
"coolstuffinc.com status: UNAVAILABLE\n", | |
"ytimg.com status: UNAVAILABLE\n", | |
"amazon.com status: UNAVAILABLE\n", | |
"pinimg.com status: UNAVAILABLE\n", | |
"pinimg.com status: UNAVAILABLE\n", | |
"twimg.com status: UNAVAILABLE\n", | |
"coolstuffinc.com status: UNAVAILABLE\n", | |
"amazon.de status: UNAVAILABLE\n", | |
"reddit.com status: UNAVAILABLE\n", | |
"1856.com status: UNAVAILABLE\n", | |
"media-amazon.com status: UNAVAILABLE\n", | |
"ssl-images-amazon.com status: UNAVAILABLE\n" | |
] | |
} | |
], | |
"source": [ | |
"whois_api_key = \"foo\"\n", | |
"\n", | |
"for x in error_urls:\n", | |
" dnsapi = \"https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey=\" + whois_api_key + \"&outputFormat=JSON&cmd=GET_DN_AVAILABILITY&domainName=\" + x\n", | |
" r = requests.get(dnsapi) \n", | |
" parsed_json = json.loads(r.text)\n", | |
" print(parsed_json['DomainInfo']['domainName'] + \" status: \" + parsed_json['DomainInfo']['domainAvailability'])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment