Last active
September 6, 2023 11:37
-
-
Save baderdean/cc4643ecd95d3ccde31dee80ebdbea28 to your computer and use it in GitHub Desktop.
Benchmark quality and performance of 3 python libraries regarding the registrant name/organization field quality using Google's domain dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python3 | |
""" | |
Benchmark various whois library against the "registrant organization" value using Google's domain dataset | |
- whoisdomain uses whois command line | |
- whoisit uses rdap (but requires bootstrap) | |
- asyncwhois uses async | |
- pythonwhois(alt) older one | |
""" | |
import asyncio | |
import requests | |
import time | |
from pprint import pprint | |
import whoisdomain | |
import whoisit | |
import asyncwhois | |
import pythonwhois | |
GOOGLE_DOMAINS_URL = "https://www.google.com/supported_domains" | |
REGISTRANT_ORGANIZATION = "GOOGLE" | |
def google_domains(url=GOOGLE_DOMAINS_URL, timeout=10):
    """Download the domain list at *url* and return it as a list of names.

    The response body is split on whitespace and a leading "." on an entry
    is dropped, so each item is a bare domain name.

    Args:
        url: Address of the whitespace-separated domain list.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: One entry per whitespace-separated token in the response.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Without a timeout a stalled server would hang the benchmark forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of treating an error page as domains.
    response.raise_for_status()
    # Only strip the first character when it really is the leading dot.
    return [d[1:] if d.startswith(".") else d for d in response.text.split()]
async def whoisdomain_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant returned by whoisdomain.

    The registrant value is upper-cased before the substring test, so *org*
    is expected to be given in upper case. A failed lookup, or a result
    without a non-empty ``registrant`` attribute, counts as "not found".

    Note: ``whoisdomain.query`` is not awaited, so this coroutine does not
    yield to the event loop while the lookup runs.

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True when *org* is a substring of the upper-cased registrant.
    """
    try:
        record = whoisdomain.query(domain)
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except:`` so
        # that KeyboardInterrupt / SystemExit still stop a long benchmark.
        return False
    registrant = getattr(record, 'registrant', None)
    if not registrant:
        return False
    return org in registrant.upper()
async def whoisit_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in a registrant entity returned by whoisit.

    whoisit is bootstrapped on demand before the lookup. For each registrant
    entity the ``organization`` value is tested when it is non-empty,
    otherwise the ``name`` value; the chosen value is upper-cased before the
    substring test. A failed lookup or a response without registrant
    entities counts as "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True as soon as one registrant entity matches, else False.
    """
    if not whoisit.is_bootstrapped():
        whoisit.bootstrap()
    try:
        response = whoisit.domain(domain)
    except Exception:
        return False
    entities = response.get('entities', {})
    if 'registrant' not in entities:
        return False
    for entity in response['entities']['registrant']:
        # Prefer the organization; fall back to the name only when the
        # organization is missing or empty.
        label = entity.get('organization') or entity.get('name')
        if label and org in label.upper():
            return True
    return False
async def asyncwhois_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant parsed by asyncwhois.

    Awaits ``asyncwhois.aio_whois_domain`` and inspects the result's
    ``parser_output`` mapping: ``registrant_organization`` is tested when it
    is non-empty, otherwise ``registrant``. The chosen value is upper-cased
    before the substring test. A failed lookup or a result without
    ``parser_output`` counts as "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True when *org* is a substring of the chosen value.
    """
    try:
        result = await asyncwhois.aio_whois_domain(domain)
    except Exception:
        return False
    if not hasattr(result, 'parser_output'):
        return False
    parsed = result.parser_output
    value = parsed.get('registrant_organization') or parsed.get('registrant')
    if not value:
        return False
    return org in value.upper()
async def pythonwhoisalt_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant name parsed by pythonwhois.

    The raw whois data is fetched and handed to
    ``pythonwhois.parse.parse_registrants``. Only the ``name`` entry of the
    ``registrant`` contact is consulted; it is upper-cased before the
    substring test. A failed fetch/parse or a missing registrant counts as
    "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant name.

    Returns:
        bool: True when *org* is a substring of the upper-cased name.
    """
    try:
        raw = pythonwhois.net.get_whois_raw(domain)
        contacts = pythonwhois.parse.parse_registrants(raw)
    except Exception:
        return False
    registrant = contacts.get('registrant')
    if not registrant:
        return False
    return org in registrant.get('name', '').upper()
async def benchmark_parser(parser, domains):
    """Run *parser* over every domain, one at a time, and summarise the outcome.

    Args:
        parser: Coroutine function that takes a domain and returns a bool.
        domains: Sized collection of domain names.

    Returns:
        dict: ``'count'`` is the number of True results, ``'duration'`` the
        elapsed seconds measured with ``time.perf_counter`` and
        ``'percentage'`` the rounded share of True results formatted like
        ``"50%"`` (``"0%"`` when *domains* is empty).
    """
    start = time.perf_counter()
    outcomes = [await parser(domain) for domain in domains]
    results = {'count': outcomes.count(True)}
    results['duration'] = time.perf_counter() - start
    total = len(domains)
    # An empty domain list would otherwise raise ZeroDivisionError.
    ratio = results['count'] / total if total else 0
    results['percentage'] = f"{round(ratio * 100)}%"
    return results
if __name__ == "__main__":
    # Usage: pass parser names as arguments to benchmark only those;
    # with no arguments every parser is benchmarked. Unknown names are
    # ignored.
    import sys

    domains = google_domains()
    parsers = {
        'whoisdomain': whoisdomain_registrant,
        'whoisit': whoisit_registrant,
        'asyncwhois': asyncwhois_registrant,
        'pythonwhoisalt': pythonwhoisalt_registrant,
    }
    if len(sys.argv) > 1:
        _parsers = {}
        for arg in sys.argv[1:]:
            if arg in parsers:
                _parsers[arg] = parsers[arg]
        parsers = _parsers
    print(f"Parsing with parsers: {list(parsers.keys())}")

    async def _run_benchmarks():
        """Benchmark each selected parser in turn on a single event loop."""
        return {
            name: await benchmark_parser(parser, domains)
            for name, parser in parsers.items()
        }

    # asyncio.run() replaces get_event_loop()/run_until_complete():
    # calling get_event_loop() without a running loop is deprecated.
    results = asyncio.run(_run_benchmarks())
    pprint(results)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python3 | |
""" | |
Benchmark various whois library against the "registrant organization" value using Google's domain dataset | |
- whoisdomain uses whois command line | |
- whoisit uses rdap (but requires bootstrap) | |
- asyncwhois uses async (while not used here) | |
- pythonwhois(alt) older one | |
""" | |
import requests | |
import time | |
from pprint import pprint | |
import whoisdomain | |
import whoisit | |
import asyncwhois | |
import pythonwhois | |
GOOGLE_DOMAINS_URL = "https://www.google.com/supported_domains" | |
REGISTRANT_ORGANIZATION = "GOOGLE" | |
def google_domains(url=GOOGLE_DOMAINS_URL, timeout=10):
    """Fetch the whitespace-separated domain list at *url*.

    Each token of the response body becomes one list entry; a leading "."
    on a token is removed so the entry is a bare domain name.

    Args:
        url: Address of the domain list.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: The domain names, in the order they appear in the body.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps a stalled server from hanging the benchmark forever.
    response = requests.get(url, timeout=timeout)
    # Reject 4xx/5xx responses rather than parsing an error page as domains.
    response.raise_for_status()
    domains = []
    for token in response.text.split():
        # Strip the first character only when it really is the leading dot.
        domains.append(token[1:] if token.startswith(".") else token)
    return domains
def whoisdomain_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant returned by whoisdomain.

    The registrant value is upper-cased before the substring test, so *org*
    is expected to be given in upper case. A failed lookup, or a result
    without a non-empty ``registrant`` attribute, counts as "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True when *org* is a substring of the upper-cased registrant.
    """
    try:
        record = whoisdomain.query(domain)
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except:`` so
        # that KeyboardInterrupt / SystemExit still stop a long benchmark.
        return False
    registrant = getattr(record, 'registrant', None)
    if not registrant:
        return False
    return org in registrant.upper()
def whoisit_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in a registrant entity returned by whoisit.

    whoisit is bootstrapped on demand before the lookup. For each registrant
    entity the ``organization`` value is tested when it is non-empty,
    otherwise the ``name`` value; the chosen value is upper-cased before the
    substring test. A failed lookup or a response without registrant
    entities counts as "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True as soon as one registrant entity matches, else False.
    """
    if not whoisit.is_bootstrapped():
        whoisit.bootstrap()
    try:
        response = whoisit.domain(domain)
    except Exception:
        return False
    entities = response.get('entities', {})
    if 'registrant' not in entities:
        return False
    for entity in response['entities']['registrant']:
        # Prefer the organization; fall back to the name only when the
        # organization is missing or empty.
        label = entity.get('organization') or entity.get('name')
        if label and org in label.upper():
            return True
    return False
def asyncwhois_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant parsed by asyncwhois.

    Calls ``asyncwhois.whois_domain`` and inspects the result's
    ``parser_output`` mapping: ``registrant_organization`` is tested when it
    is non-empty, otherwise ``registrant``. The chosen value is upper-cased
    before the substring test. A failed lookup or a result without
    ``parser_output`` counts as "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant value.

    Returns:
        bool: True when *org* is a substring of the chosen value.
    """
    try:
        result = asyncwhois.whois_domain(domain)
    except Exception:
        return False
    if not hasattr(result, 'parser_output'):
        return False
    parsed = result.parser_output
    value = parsed.get('registrant_organization') or parsed.get('registrant')
    if not value:
        return False
    return org in value.upper()
def pythonwhoisalt_registrant(domain, org=REGISTRANT_ORGANIZATION):
    """Report whether *org* occurs in the registrant name parsed by pythonwhois.

    The raw whois data is fetched and handed to
    ``pythonwhois.parse.parse_registrants``. Only the ``name`` entry of the
    ``registrant`` contact is consulted; it is upper-cased before the
    substring test. A failed fetch/parse or a missing registrant counts as
    "not found".

    Args:
        domain: Domain name to look up.
        org: Upper-case text searched for in the registrant name.

    Returns:
        bool: True when *org* is a substring of the upper-cased name.
    """
    try:
        raw = pythonwhois.net.get_whois_raw(domain)
        contacts = pythonwhois.parse.parse_registrants(raw)
    except Exception:
        return False
    registrant = contacts.get('registrant')
    if not registrant:
        return False
    return org in registrant.get('name', '').upper()
def benchmark_parser(parser, domains):
    """Run *parser* over every domain and summarise the outcome.

    Args:
        parser: Callable that takes a domain and returns a bool.
        domains: Sized collection of domain names.

    Returns:
        dict: ``'count'`` is the number of True results, ``'duration'`` the
        elapsed seconds measured with ``time.perf_counter`` and
        ``'percentage'`` the rounded share of True results formatted like
        ``"50%"`` (``"0%"`` when *domains* is empty).
    """
    start = time.perf_counter()
    outcomes = [parser(domain) for domain in domains]
    results = {'count': outcomes.count(True)}
    results['duration'] = time.perf_counter() - start
    total = len(domains)
    # An empty domain list would otherwise raise ZeroDivisionError.
    ratio = results['count'] / total if total else 0
    results['percentage'] = f"{round(ratio * 100)}%"
    return results
if __name__ == "__main__":
    # Usage: pass parser names as arguments to benchmark only those;
    # with no arguments every parser is benchmarked. Unknown names are
    # ignored.
    import sys

    domains = google_domains()
    # Every available benchmark target, keyed by the name accepted on the CLI.
    available = {
        'whoisdomain': whoisdomain_registrant,
        'whoisit': whoisit_registrant,
        'asyncwhois': asyncwhois_registrant,
        'pythonwhoisalt': pythonwhoisalt_registrant,
    }
    requested = sys.argv[1:]
    if requested:
        parsers = {name: available[name] for name in requested if name in available}
    else:
        parsers = available
    print(f"Parsing with parsers: {list(parsers)}")
    results = {}
    for name, parser in parsers.items():
        results[name] = benchmark_parser(parser, domains)
    pprint(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment