Skip to content

Instantly share code, notes, and snippets.

@rfong
Forked from stepchowfun/domain_finder.py
Last active December 3, 2019 23:45
Show Gist options
  • Save rfong/7220527 to your computer and use it in GitHub Desktop.
Save rfong/7220527 to your computer and use it in GitHub Desktop.
Fork of boyers's domain name n-gram generator that lets the user specify an initial prefix and fall back on other domain extensions
#!/usr/bin/python -O
from optparse import OptionParser
import random
import string
import subprocess
import threading
import time
import re
# get a list of words with only ASCII characters, and surround them with ^ and $ to demarcate the word boundaries
# (the context manager closes the dictionary file as soon as it has been read)
with open("/usr/share/dict/words") as word_file:
    words = [line.strip().lower() for line in word_file]
# keep only non-empty words made purely of a-z; "^" marks the start of a word and "$" marks its end
words = ["^" + w + "$" for w in words if w != "" and all(c in string.ascii_lowercase for c in w)]
# construct a discrete-time markov chain of n-grams
n = 5 # this is the "n" in n-grams, try adjusting this for different results
transitions = {} # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
frequencies = {} # keys are n-grams, values are normalized frequencies [0, 1] of occurrence in the wordlist
for word in words:
    # count every n-gram that occurs in the word (boundary markers included)
    for i in range(len(word) + 1 - n):
        gram = word[i : i + n]
        frequencies[gram] = frequencies.get(gram, 0) + 1
    # count every transition from an n-gram to the n-gram that starts one character later
    for i in range(len(word) - n):
        gram = word[i : i + n]
        successor = word[i + 1 : i + n + 1]
        successors = transitions.setdefault(gram, {})
        successors[successor] = successors.get(successor, 0) + 1
# turn the raw counts into a distribution: divide by the total number of n-gram
# occurrences (not by the number of distinct n-grams) so the values sum to 1
total_occurrences = float(sum(frequencies.values()))
for gram in frequencies:
    frequencies[gram] /= total_occurrences
# normalize each n-gram's outgoing transition counts so they sum to 1
for gram in transitions:
    successors = transitions[gram]
    total = float(sum(successors.values()))
    for successor in successors:
        successors[successor] /= total
# sample a probability mass function
def sample(pmf):
    """Draw one element at random from a probability mass function.

    pmf: dict mapping elements to probabilities

    Returns a key of pmf. Walks the cumulative distribution until it reaches
    a uniformly drawn threshold. If the probabilities add up to less than
    the threshold (e.g. because of floating-point rounding), a key is picked
    uniformly at random instead. Raises IndexError if pmf is empty.
    """
    threshold = random.random()
    cdf = 0.0
    for element in pmf:
        cdf += pmf[element]
        if cdf >= threshold:
            return element
    # the total mass fell short of the threshold; fall back to a uniform pick
    # (list() because dict key views cannot be indexed by random.choice on Python 3)
    return random.choice(list(pmf))
# compute a conditional probability mass function
def conditional(pmf, condition):
    """Restrict a probability mass function to the elements satisfying a condition.

    pmf: dict mapping elements to probabilities
    condition: boolean-valued function to condition on

    Returns a new dict holding only the elements for which condition(element)
    is true, with their probabilities divided by the total mass of the kept
    elements. The input dict is left untouched.
    """
    kept = {}
    kept_mass = 0.0
    for element, probability in pmf.items():
        if not condition(element):
            continue
        kept[element] = probability
        kept_mass += probability
    # rescale the surviving probabilities by the mass that was kept
    return {element: probability / kept_mass for element, probability in kept.items()}
# compute the prefix frequencies
# prefixes are the n-grams that open a word, i.e. those whose first character is the "^" boundary marker
def _begins_word(gram):
    return gram[0] == "^"

prefix_frequencies = conditional(frequencies, _begins_word)
# generate a new letter according to the markov chain (make sure len(word) >= n)
def evolve(word):
    """Return the next character to append to word.

    Looks up the last n characters of word in the transition table and
    samples one of the n-grams that can follow; the final character of the
    sampled n-gram is the new letter. May return "$", the end-of-word marker.
    """
    successors = transitions.get(word[-n:])
    if successors is None:
        # the trailing n-gram is not in our model: emit a random letter
        # (or a terminator) to keep things moving
        return random.choice(string.ascii_lowercase + "$")
    # sample the n-grams that we can transition to and keep the newly added character
    return sample(successors)[-1:]
# generate a word according to the markov chain
def gen_word(word=None):
    """Generate a word-like string from the markov chain.

    word: optional starting text, including the leading "^" boundary marker.
          When omitted (None) or empty, a starting n-gram is sampled from
          prefix_frequencies.

    Returns the generated text with all "^" and "$" boundary markers removed.
    """
    # start with a prefix; an empty string is treated like "no prefix" because
    # the loop below needs at least one character to inspect
    if not word:
        word = sample(prefix_frequencies)
    # wait until the markov chain adds a terminator to the word
    while word[-1] != "$":
        # generate a new letter and append it to the word
        word += evolve(word)
        # optional: sometimes domains are multiple word-like lexemes concatenated together,
        # so a short word that just terminated occasionally gets a fresh prefix appended
        # (which makes the loop continue)
        if word[-1] == "$" and random.random() > 0.7 and len(word) < 8:
            word += sample(prefix_frequencies)
    # remove the boundary markers and return the word
    return word.replace("^", "").replace("$", "")
# collapse runs of whitespace and lower-case the text so that phrase matching
# is insensitive to case and line wrapping
def _normalize_whois_text(text):
    return re.sub(r'\s+', ' ', text.lower())

# phrases that appear in whois output when a domain is NOT registered,
# normalized once here instead of on every call to check()
_UNREGISTERED_MARKERS = [_normalize_whois_text(marker) for marker in [
    "No match",
    "Not found",
    "no entries found",
    "not registered",
    "status: available",
    "status: free",
    "is available for", #io,ac
    "query_status: 200 Available", #co.nz
    "No Data Found", #ae,com.au
    "no data was found", #il
    "was not found", #im
    "nothing found", #at
    "no domain entries were found to match", #coop
    "This query returned 0 objects", #int
    "incorrect domain name", #al
    "no records matching", #ax
    "does not exist in database!", #bg
    "invalid query or domain name not known", #cf,ga
    "we do not have an entry in our database", #ch
    "no existe", #cl
    "no matching record", #cn
    "domain has not been registered", #hk
]]

# check whether a domain is taken (e.g., "example.com")
def check(domain):
    """Look a domain up with the "whois" command.

    domain: full domain name, e.g. "example.com"

    Returns True if the domain is taken, False if it is available, or None
    if the lookup did not finish within 4 seconds. A domain counts as
    available when the whois output contains any of the known "not
    registered" phrases. Errors from launching "whois" (e.g. the command is
    not installed) propagate to the caller.
    """
    process = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timed_out = threading.Event()

    def kill_on_timeout():
        timed_out.set()
        try:
            process.kill()
        except OSError:
            pass # the process already exited on its own

    # a wall-clock timer kills the lookup if it hangs (a CPU-time clock barely
    # advances while we wait, so it cannot be used for this timeout)
    timer = threading.Timer(4.0, kill_on_timeout)
    timer.start()
    try:
        # communicate() drains both pipes while waiting, so a chatty whois
        # server cannot stall the child on a full pipe buffer
        stdout, _ = process.communicate()
    finally:
        timer.cancel()
    if timed_out.is_set():
        return None
    # the pipe yields bytes; decode before matching against the text markers
    output = _normalize_whois_text(stdout.decode("utf-8", "replace"))
    return all(marker not in output for marker in _UNREGISTERED_MARKERS)
def main():
    """Generate candidate domain names forever and print their availability.

    Command-line options:
      -w  initial prefix every generated name should start with
      -x  comma-delimited list of extra domain extensions (no dots) to try
          whenever the .com name is not available

    Never returns; stop it with Ctrl-C.
    """
    # allow user to specify initial prefix
    parser = OptionParser()
    parser.add_option("-w", dest="word",
                      help="specify initial prefix")
    parser.add_option("-x", dest="extensions",
                      help="comma-delimited list of allowed domain extensions, no spaces or leading dots")
    options, _ = parser.parse_args()

    # mutate/slice user specified word to get valid prefix
    word = options.word
    if word:
        # the model only contains lower-case words, so match its alphabet
        word = "^" + word.lower()
        if len(word) > n:
            # NOTE(review): prefixes longer than n characters (marker included) are cut
            # to n + 1 characters and are not validated against the model
            word = word[:n + 1]
        elif len(word) < n:
            print("Prefix too short; ignoring")
            word = None
        elif word not in prefix_frequencies:
            print("Prefix invalid; ignoring")
            word = None

    # format domain extensions: drop empty entries and the ones not supported by whois,
    # then add the leading dot; built as a real list because it is iterated once per domain
    unsupported = ("hn", "bo")
    requested = (options.extensions or "").split(",")
    extensions = ["." + ext for ext in requested if ext and ext not in unsupported]

    # remember previously generated names (a set keeps the membership test cheap)
    visited = set()

    # generate domain names forever
    while True:
        # generate a few domains and pick the smallest (the first one wins ties)
        domain = min((gen_word(word) for _ in range(3)), key=len)

        # avoid redundancy if prefix is specified
        if word:
            if domain in visited:
                continue
            visited.add(domain)

        # report whether the domain is available
        if check(domain + ".com") is False: # could be True, False, or None
            print(domain + ".com <-- Available!")
        else:
            print(domain + ".com")
            # fall back to other extensions
            for ext in extensions:
                if check(domain + ext) is False:
                    print(domain + ext + " <-- Available!")


# run the generator only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment