Skip to content

Instantly share code, notes, and snippets.

@nuno-andre
Created February 22, 2021 11:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nuno-andre/52d19a3665c6ef2c67d81811ffb16bd0 to your computer and use it in GitHub Desktop.
Save nuno-andre/52d19a3665c6ef2c67d81811ffb16bd0 to your computer and use it in GitHub Desktop.
Retrieve a URL's eTLD (effective top-level domain) and its operator from the Public Suffix List
"""
Retrieve a URL's eTLD (effective top-level domain) and its operator
from the Public Suffix List.

See https://wiki.mozilla.org/Public_Suffix_List
"""
from functools import cached_property
from urllib.parse import urlparse
from typing import Optional
from io import StringIO
import requests
import re
# URL of the raw Public Suffix List data file; fetched by ``EtldParse.etlds``.
SOURCE = 'https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat'
# Matches a comment line of the form ``// <root> : [<digits/dashes> ]<operator>``.
# The named group ``operator`` (group 2) is the value ``EtldParse.etlds``
# records for the suffix lines that follow; ``root`` is captured but unused
# in this file. The optional ``[\-0-9]*`` run presumably skips a leading
# date such as ``2013-11-07`` -- TODO confirm against the list format.
REOPER = re.compile(r'^//\s(?P<root>.*?)\s:\s(?:[\-0-9]*\s)?(?P<operator>.*?)\s*?$')
class EtldParse:
    '''Split a URL's hostname into host and eTLD using the Public Suffix List.

    The list is downloaded from ``SOURCE`` the first time ``etlds`` is
    accessed and then cached on the instance.
    '''

    # Seconds to wait for the list download before giving up; without a
    # timeout ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    @cached_property
    def etlds(self) -> dict[str, Optional[str]]:
        '''Dictionary {suffix: operator}.

        Each non-comment, non-blank line of the list becomes a key. Its
        value is the operator captured from the most recent comment line
        matching ``REOPER``, or ``None`` if no such comment preceded it.
        Lines are stored verbatim (stripped), so rules containing ``*`` or
        ``!`` are kept as literal keys.

        Raises:
            requests.RequestException: if the download fails, times out or
                returns an HTTP error status.
        '''
        response = requests.get(SOURCE, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly rather than parsing an HTTP error page as suffixes.
        response.raise_for_status()

        table: dict[str, Optional[str]] = {}
        operator = None
        for raw in response.text.splitlines():
            line = raw.strip()
            if not line:
                continue
            if line.startswith('/'):
                # Comment line: it may announce the operator of the
                # suffixes that follow; other comments are ignored.
                if (match := REOPER.match(line)):
                    operator = match.group('operator')
            else:
                table[line] = operator
        return table

    def parse(self, url: str) -> dict[str, Optional[str]]:
        '''Returns host, eTLD and operator of ``url``.

        The eTLD is the longest suffix of the hostname that is a key of
        ``etlds``; ``host`` is whatever precedes it. Keys are compared
        literally, so rules containing ``*`` or ``!`` never match.

        Args:
            url: URL to analyse. It needs a network location (for example
                ``http://example.com/``); a bare ``example.com`` has no
                hostname according to ``urlparse``.

        Returns:
            A dict with keys ``host``, ``etld`` and ``operator``. Any of
            them is ``None`` when it is empty or unknown; all three are
            ``None`` when the URL has no hostname.
        '''
        hostname = urlparse(url).hostname
        if not hostname:
            return dict(host=None, etld=None, operator=None)

        # A fully-qualified name may end with the root dot; drop it so the
        # last label is the TLD rather than an empty string.
        labels = hostname.rstrip('.').split('.')
        table = self.etlds

        # Try suffixes from longest to shortest so that the rule matching
        # the most labels wins, even if a shorter intermediate suffix is
        # not itself listed.
        for start in range(len(labels)):
            candidate = '.'.join(labels[start:])
            # Membership test, not truthiness: a listed suffix may have no
            # known operator.
            if candidate in table:
                return dict(host='.'.join(labels[:start]) or None,
                            etld=candidate,
                            operator=table[candidate])

        return dict(host='.'.join(labels) or None,
                    etld=None,
                    operator=None)
if __name__ == '__main__':
    # Demo: parse a few sample URLs and print each result. The expected
    # output noted beside each URL is what the original author recorded;
    # operators depend on the list contents at download time.
    parser = EtldParse()
    samples = (
        # {'host': 'foo.bar', 'etld': 'gal', 'operator': 'Asociación puntoGAL'}
        'http://foo.bar.gal/whatever',
        # {'host': 'foo.bar', 'etld': 'kouzushima.tokyo.jp', 'operator': 'https://en.wikipedia.org/wiki/.jp'}
        'http://foo.bar.kouzushima.tokyo.jp/whatever',
        # {'host': None, 'etld': 'kouzushima.tokyo.jp', 'operator': 'https://en.wikipedia.org/wiki/.jp'}
        'http://kouzushima.tokyo.jp/whatever',
    )
    for sample in samples:
        print(parser.parse(sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment