Skip to content

Instantly share code, notes, and snippets.

@baderdean
Last active September 6, 2023 11:37
Show Gist options
  • Save baderdean/cc4643ecd95d3ccde31dee80ebdbea28 to your computer and use it in GitHub Desktop.
Save baderdean/cc4643ecd95d3ccde31dee80ebdbea28 to your computer and use it in GitHub Desktop.
Benchmark the quality and performance of 4 Python WHOIS libraries on the registrant name/organization field, using Google's domain dataset
#!/bin/env python3
"""
Benchmark various whois libraries against the "registrant organization" value using Google's domain dataset
- whoisdomain uses whois command line
- whoisit uses rdap (but requires bootstrap)
- asyncwhois uses async
- pythonwhois(alt) older one
"""
import asyncio
import requests
import time
from pprint import pprint
import whoisdomain
import whoisit
import asyncwhois
import pythonwhois
GOOGLE_DOMAINS_URL = "https://www.google.com/supported_domains"
REGISTRANT_ORGANIZATION = "GOOGLE"
def google_domains(url=GOOGLE_DOMAINS_URL, timeout=30):
    """Download the domain list published at ``url``.

    The response body is split on whitespace and each entry has its leading
    "." removed (the original code dropped the first character of every
    entry unconditionally, so entries are presumably of the form
    ".google.com" -- verify against the live list).

    Args:
        url: Location of the whitespace-separated domain list.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the benchmark forever.

    Returns:
        A list of domain names as strings.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as domains.
    response.raise_for_status()
    return [
        # Only strip a real leading dot; never eat the first letter of a name.
        entry[1:] if entry.startswith('.') else entry
        for entry in response.text.split()
    ]
async def whoisdomain_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``whoisdomain`` reports ``org`` as registrant of ``domain``.

    Note that ``whoisdomain.query`` is called without ``await``: the lookup is
    synchronous and this coroutine does not yield while it runs.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the upper-cased registrant value.

    Returns:
        True if the registrant value contains ``org``; False if the lookup
        raised, returned no usable ``registrant`` attribute, or did not match.
    """
    try:
        record = whoisdomain.query(domain)
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and made the benchmark impossible to
        # interrupt cleanly; the sibling parsers already use ``Exception``.
        return False
    registrant = getattr(record, 'registrant', None)
    if not registrant:
        return False
    return org in registrant.upper()
async def whoisit_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``whoisit`` reports ``org`` as registrant of ``domain``.

    For every registrant entity the ``organization`` value is used when it is
    non-empty, otherwise the ``name`` value; the first entity whose upper-cased
    value contains ``org`` decides the result.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        True on the first matching registrant entity, False when the lookup
        raises, no registrant entities are present, or none of them match.
    """
    # whoisit has to be bootstrapped once before it can answer queries.
    if not whoisit.is_bootstrapped():
        whoisit.bootstrap()

    try:
        record = whoisit.domain(domain)
    except Exception:
        return False

    entities = record.get('entities', {})
    if 'registrant' not in entities:
        return False

    for contact in entities['registrant']:
        label = contact.get('organization') or contact.get('name')
        if label and org in label.upper():
            return True
    return False
async def asyncwhois_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``asyncwhois`` reports ``org`` as registrant of ``domain``.

    The result's ``parser_output`` mapping is consulted: the
    ``registrant_organization`` entry wins when non-empty, otherwise the
    ``registrant`` entry is used.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the upper-cased field value.

    Returns:
        True if the selected field contains ``org``; False if the lookup
        raised, the result has no ``parser_output``, or both fields are empty.
    """
    try:
        response = await asyncwhois.aio_whois_domain(domain)
    except Exception:
        return False

    if not hasattr(response, 'parser_output'):
        return False

    parsed = response.parser_output
    # The first non-empty field decides the outcome; later ones are ignored.
    for key in ('registrant_organization', 'registrant'):
        value = parsed.get(key)
        if value:
            return org in value.upper()
    return False
async def pythonwhoisalt_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``pythonwhois`` reports ``org`` as registrant of ``domain``.

    The raw WHOIS text is fetched with ``pythonwhois.net.get_whois_raw`` and
    parsed with ``pythonwhois.parse.parse_registrants``. Both calls are
    synchronous, so this coroutine does not yield while they run.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant contact.

    Returns:
        True if the registrant's ``organization`` or ``name`` contains
        ``org``; False if fetching/parsing raised, no registrant contact was
        parsed, or neither field matches.
    """
    try:
        raw = pythonwhois.net.get_whois_raw(domain)
        contacts = pythonwhois.parse.parse_registrants(raw)
    except Exception:
        return False

    registrant = contacts.get('registrant')
    if not registrant:
        return False

    # The other parsers in this benchmark look at the organization as well as
    # the name; checking only ``name`` under-counted this library. Falsy
    # values (missing key, None, "") are skipped so ``.upper()`` cannot fail.
    candidates = (registrant.get('organization'), registrant.get('name'))
    return any(org in value.upper() for value in candidates if value)
async def benchmark_parser(parser, domains):
    """Run ``parser`` over every domain sequentially and summarise the outcome.

    Args:
        parser: Coroutine function taking a domain and returning a truthy
            value when the expected registrant was found.
        domains: Sized collection of domain names (``len()`` is taken).

    Returns:
        A dict with, in this order:
            'count': number of domains for which ``parser`` returned True,
            'duration': elapsed wall-clock seconds from ``time.perf_counter``,
            'percentage': ``count / len(domains)`` rounded and formatted as
                e.g. "42%"; "0%" when ``domains`` is empty.
    """
    start = time.perf_counter()
    outcomes = [await parser(domain) for domain in domains]
    results = {'count': outcomes.count(True)}
    results['duration'] = time.perf_counter() - start
    total = len(domains)
    # An empty domain list used to raise ZeroDivisionError here.
    ratio = results['count'] / total * 100 if total else 0
    results['percentage'] = f"{round(ratio)}%"
    return results
if __name__ == "__main__":
    # Usage: script.py [PARSER ...] -- with no arguments every parser is run.
    import sys

    domains = google_domains()
    parsers = {
        'whoisdomain': whoisdomain_registrant,
        'whoisit': whoisit_registrant,
        'asyncwhois': asyncwhois_registrant,
        'pythonwhoisalt': pythonwhoisalt_registrant,
    }
    if len(sys.argv) > 1:
        requested = sys.argv[1:]
        unknown = [arg for arg in requested if arg not in parsers]
        if unknown:
            # A typo used to be dropped silently, yielding an empty benchmark.
            print(f"Ignoring unknown parsers: {unknown}", file=sys.stderr)
        parsers = {arg: parsers[arg] for arg in requested if arg in parsers}
    print(f"Parsing with parsers: {list(parsers.keys())}")

    async def run_benchmarks():
        """Benchmark the selected parsers one after another on a single loop."""
        return {
            name: await benchmark_parser(parser, domains)
            for name, parser in parsers.items()
        }

    # asyncio.run() replaces get_event_loop()/run_until_complete():
    # get_event_loop() without a running loop is deprecated, and run() also
    # closes the loop when finished.
    results = asyncio.run(run_benchmarks())
    pprint(results)
#!/bin/env python3
"""
Benchmark various whois libraries against the "registrant organization" value using Google's domain dataset
- whoisdomain uses whois command line
- whoisit uses rdap (but requires bootstrap)
- asyncwhois uses async (while not used here)
- pythonwhois(alt) older one
"""
import requests
import time
from pprint import pprint
import whoisdomain
import whoisit
import asyncwhois
import pythonwhois
GOOGLE_DOMAINS_URL = "https://www.google.com/supported_domains"
REGISTRANT_ORGANIZATION = "GOOGLE"
def google_domains(url=GOOGLE_DOMAINS_URL, timeout=30):
    """Download the domain list published at ``url``.

    The response body is split on whitespace and each entry has its leading
    "." removed (the original code dropped the first character of every
    entry unconditionally, so entries are presumably of the form
    ".google.com" -- verify against the live list).

    Args:
        url: Location of the whitespace-separated domain list.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the benchmark forever.

    Returns:
        A list of domain names as strings.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as domains.
    response.raise_for_status()
    return [
        # Only strip a real leading dot; never eat the first letter of a name.
        entry[1:] if entry.startswith('.') else entry
        for entry in response.text.split()
    ]
def whoisdomain_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``whoisdomain`` reports ``org`` as registrant of ``domain``.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the upper-cased registrant value.

    Returns:
        True if the registrant value contains ``org``; False if the lookup
        raised, returned no usable ``registrant`` attribute, or did not match.
    """
    try:
        record = whoisdomain.query(domain)
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and made the benchmark impossible to
        # interrupt cleanly; the sibling parsers already use ``Exception``.
        return False
    registrant = getattr(record, 'registrant', None)
    if not registrant:
        return False
    return org in registrant.upper()
def whoisit_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``whoisit`` reports ``org`` as registrant of ``domain``.

    Each registrant entity contributes its ``organization`` value, or its
    ``name`` value when the organization is empty; entities with neither are
    skipped. The search stops at the first value containing ``org``.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the upper-cased value.

    Returns:
        True if any registrant entity matches; False when the lookup raises,
        there are no registrant entities, or nothing matches.
    """
    # whoisit has to be bootstrapped once before it can answer queries.
    if not whoisit.is_bootstrapped():
        whoisit.bootstrap()

    try:
        answer = whoisit.domain(domain)
    except Exception:
        return False

    entities = answer.get('entities', {})
    if 'registrant' in entities:
        labels = (
            contact.get('organization') or contact.get('name')
            for contact in entities['registrant']
        )
        # any() short-circuits on the first match, like the original loop.
        return any(org in label.upper() for label in labels if label)
    return False
def asyncwhois_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``asyncwhois`` reports ``org`` as registrant of ``domain``.

    Uses the synchronous ``asyncwhois.whois_domain`` entry point. From the
    result's ``parser_output`` mapping, ``registrant_organization`` is used
    when non-empty, otherwise ``registrant``.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the upper-cased field value.

    Returns:
        True if the selected field contains ``org``; False if the lookup
        raised, the result has no ``parser_output``, or both fields are empty.
    """
    try:
        reply = asyncwhois.whois_domain(domain)
    except Exception:
        return False

    if hasattr(reply, 'parser_output'):
        fields = reply.parser_output
        owner = fields.get('registrant_organization') or fields.get('registrant')
        if owner:
            return org in owner.upper()
    return False
def pythonwhoisalt_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Check whether ``pythonwhois`` reports ``org`` as registrant of ``domain``.

    The raw WHOIS text is fetched with ``pythonwhois.net.get_whois_raw`` and
    parsed with ``pythonwhois.parse.parse_registrants``.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant contact.

    Returns:
        True if the registrant's ``organization`` or ``name`` contains
        ``org``; False if fetching/parsing raised, no registrant contact was
        parsed, or neither field matches.
    """
    try:
        raw = pythonwhois.net.get_whois_raw(domain)
        contacts = pythonwhois.parse.parse_registrants(raw)
    except Exception:
        return False

    registrant = contacts.get('registrant')
    if not registrant:
        return False

    # The other parsers in this benchmark look at the organization as well as
    # the name; checking only ``name`` under-counted this library. Falsy
    # values (missing key, None, "") are skipped so ``.upper()`` cannot fail.
    candidates = (registrant.get('organization'), registrant.get('name'))
    return any(org in value.upper() for value in candidates if value)
def benchmark_parser(parser, domains):
    """Run ``parser`` over every domain sequentially and summarise the outcome.

    Args:
        parser: Callable taking a domain and returning True when the expected
            registrant was found.
        domains: Sized collection of domain names (``len()`` is taken).

    Returns:
        A dict with, in this order:
            'count': number of domains for which ``parser`` returned True,
            'duration': elapsed wall-clock seconds from ``time.perf_counter``,
            'percentage': ``count / len(domains)`` rounded and formatted as
                e.g. "42%"; "0%" when ``domains`` is empty.
    """
    start = time.perf_counter()
    outcomes = [parser(domain) for domain in domains]
    results = {'count': outcomes.count(True)}
    results['duration'] = time.perf_counter() - start
    total = len(domains)
    # An empty domain list used to raise ZeroDivisionError here.
    ratio = results['count'] / total * 100 if total else 0
    results['percentage'] = f"{round(ratio)}%"
    return results
if __name__ == "__main__":
    # Usage: script.py [PARSER ...] -- with no arguments every parser is run.
    import sys

    domains = google_domains()
    parsers = {
        'whoisdomain': whoisdomain_registrant,
        'whoisit': whoisit_registrant,
        'asyncwhois': asyncwhois_registrant,
        'pythonwhoisalt': pythonwhoisalt_registrant,
    }
    if len(sys.argv) > 1:
        requested = sys.argv[1:]
        unknown = [arg for arg in requested if arg not in parsers]
        if unknown:
            # A typo used to be dropped silently, yielding an empty benchmark.
            print(f"Ignoring unknown parsers: {unknown}", file=sys.stderr)
        parsers = {arg: parsers[arg] for arg in requested if arg in parsers}
    print(f"Parsing with parsers: {list(parsers.keys())}")
    # Parsers run one after another, in the order selected above.
    results = {
        name: benchmark_parser(parser, domains)
        for name, parser in parsers.items()
    }
    pprint(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment