Skip to content

Instantly share code, notes, and snippets.

Last active December 25, 2022 00:43
What would you like to do?
import string
import sys
import requests
import whois
from nltk import tokenize
# Find "domain hacks" hidden in a book: three consecutive words whose last
# word is a real top-level domain (e.g. "amazing travel agency" ->
# amazing.travel.agency), then check via WHOIS whether the registrable part
# (travel.agency) is already taken.
#
# Usage: python <this script> BOOKFILE
# Output: BOOKFILE.possible-domains.txt, one candidate per line, prefixed
# with an emoji for the WHOIS verdict.

# NOTE(review): the URL was blank in the copy this was restored from. The
# IANA list matches the parsing below ('#' header line, upper-case entries,
# 'XN--' punycode entries) -- confirm this is the intended source.
TLD_LIST_URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"

# TLDs that are skipped even though they appear in the TLD list.
# NOTE(review): presumably brand TLDs that are closed to public registration;
# the list looks truncated in the copy this was restored from -- extend it.
known_unavailable = ['smile', 'windows', 'active', 'amazon', 'apple', 'audible']

# Output prefix for each WHOIS verdict.
emoji_prefix = {'reg': '❌', 'unreg': '✔️', 'unknown': '❔'}


def fetch_tld_listing(url=TLD_LIST_URL, timeout=30):
    """Download the raw TLD listing and return it as text.

    Raises requests.RequestException on network errors, timeouts or a
    non-2xx response, instead of silently parsing an error page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_tlds(listing, excluded=known_unavailable):
    """Return the set of usable lower-case TLDs found in *listing*.

    Lines starting with '#' (comments) or 'XN--' (punycode TLDs) are
    skipped, as are blank lines and any TLD named in *excluded*.
    """
    skip = set(excluded)
    tlds = set()
    for line in listing.splitlines():
        # The prefix test runs before lower-casing, so it matches the
        # upper-case form used in the listing.
        if not line or line.startswith(('#', 'XN--')):
            continue
        tld = line.lower()
        if tld not in skip:
            tlds.add(tld)
    return tlds


def clean_words(words):
    """Lower-case *words*, keep only the letters a-z, and drop empty results."""
    letters = set(string.ascii_lowercase)
    stripped = (''.join(c for c in w.lower() if c in letters) for w in words)
    return [w for w in stripped if w]


def find_domain_hacks(words, tlds):
    """Return every 'word.word.tld' candidate in a cleaned word list.

    A candidate is three consecutive words where the last one is a TLD
    longer than 3 characters and the two preceding words are each longer
    than 5 characters.
    """
    return [
        '.'.join(words[i - 2:i + 1])
        for i, w in enumerate(words)
        if i > 1 and len(w) > 3 and w in tlds
        and len(words[i - 1]) > 5 and len(words[i - 2]) > 5
    ]


def check_registration(domain, cache):
    """Return 'reg', 'unreg' or 'unknown' for *domain* using WHOIS.

    Results are memoized in *cache* (a dict) so a domain that occurs many
    times in the book is only looked up once.
    """
    if domain not in cache:
        try:
            # whois.query returns a falsy value when no record exists.
            cache[domain] = 'reg' if whois.query(domain) else 'unreg'
        # NOTE(review): the original exception tuple was truncated after
        # UnknownTld; FailedParsingWhoisOutput is an assumption -- confirm.
        except (whois.exceptions.UnknownTld,
                whois.exceptions.FailedParsingWhoisOutput):
            cache[domain] = 'unknown'
    return cache[domain]


def main(book_path, output_path):
    """Scan *book_path* for domain hacks and write the verdicts to *output_path*."""
    tlds = parse_tlds(fetch_tld_listing())

    # Join the book into one string so sentences that wrap across lines are
    # tokenized as a whole. Undecodable bytes are replaced; they would be
    # discarded by clean_words anyway.
    with open(book_path, 'r', encoding='utf-8', errors='replace') as f:
        md = ' '.join(line.strip() for line in f)

    possible_domains = {}
    whois_cache = {}
    for sentence in tokenize.sent_tokenize(md):
        words = clean_words(tokenize.word_tokenize(sentence))
        for full_domain in find_domain_hacks(words, tlds):
            # Only the last two labels are registrable; the first word
            # would become a subdomain.
            registrable = full_domain.split('.', 1)[1]
            possible_domains[full_domain] = check_registration(
                registrable, whois_cache)

    # Explicit UTF-8: the emoji prefixes cannot be encoded by some default
    # locale encodings (e.g. cp1252).
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f'{emoji_prefix[status]} {domain}\n'
            for domain, status in possible_domains.items()
        )


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} BOOKFILE')
    BOOKFILE = sys.argv[1]
    OUTPUTFILE = BOOKFILE + '.possible-domains.txt'
    main(BOOKFILE, OUTPUTFILE)
Copy link

nemobis commented Nov 4, 2022

Thanks for sharing! For Italian there aren't that many suitable TLDs so I got a first crop simply by filtering for TLDs which end with an Italian vowel (works well because domains like .be .li .re are very cheap):

Copy link

This script was expanded into a better and more robust package, now available on GitHub.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment