Created
October 30, 2013 10:31
-
-
Save adamneilson/7230370 to your computer and use it in GitHub Desktop.
Generates available domain names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -O
import random
import string
import subprocess
import threading
import time
# Load the system word list, keep only words made purely of ASCII lowercase
# letters, and surround each with ^ and $ to demarcate the word boundaries.
# The file is opened in a `with` block so the handle is closed promptly.
with open("/usr/share/dict/words") as word_file:
    words = [line.strip().lower() for line in word_file]
words = [
    "^" + w + "$"
    for w in words
    if w and all(c in string.ascii_lowercase for c in w)
]
# Construct a discrete-time Markov chain of n-grams from the word list.
n = 5  # this is the "n" in n-grams, try adjusting this for different results
transitions = {}  # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
frequencies = {}  # keys are n-grams, values are normalized frequencies [0, 1] of occurrence in the wordlist

# First pass: raw counts. Words shorter than n (including the boundary
# markers) produce empty ranges and therefore contribute nothing.
for word in words:
    for i in range(len(word) + 1 - n):
        gram = word[i:i + n]
        frequencies[gram] = frequencies.get(gram, 0) + 1
        # Every n-gram except the last one in the word has a successor.
        if i + n < len(word):
            successor = word[i + 1:i + n + 1]
            followers = transitions.setdefault(gram, {})
            followers[successor] = followers.get(successor, 0) + 1

# Second pass: turn counts into probabilities.
# Divide by the total number of n-gram occurrences (not the number of
# distinct n-grams) so the values form a distribution that sums to 1.
total_occurrences = float(sum(frequencies.values()))
for gram in frequencies:
    frequencies[gram] /= total_occurrences

# Each row of the transition table is normalized independently.
for gram in transitions:
    followers = transitions[gram]
    row_total = float(sum(followers.values()))
    for successor in followers:
        followers[successor] /= row_total
def sample(pmf):
    """Draw one element at random from a probability mass function.

    pmf: dict mapping elements to probabilities.

    Returns one key of `pmf`. Keys are visited in the dict's iteration order
    while their probabilities are accumulated; the first key whose cumulative
    probability exceeds a uniform draw from [0, 1) is returned.

    If the probabilities add up to less than the draw (for example because of
    floating-point rounding), a key is chosen uniformly at random instead.

    Raises IndexError if `pmf` is empty.
    """
    threshold = random.random()
    cumulative = 0.0
    for element in pmf:
        cumulative += pmf[element]
        # Strict comparison: an element with probability 0 must never be
        # picked, even when the draw is exactly 0.0.
        if cumulative > threshold:
            return element
    # list(pmf) rather than pmf.keys(): on Python 3 keys() is a view that
    # random.choice cannot index.
    return random.choice(list(pmf))
def conditional(pmf, condition):
    """Compute a conditional probability mass function.

    pmf: dict mapping elements to probabilities.
    condition: boolean-valued function to condition on; called once per key.

    Returns a new dict containing only the elements for which `condition`
    is true, with their probabilities rescaled to sum to 1. `pmf` itself is
    not modified. If no element satisfies the condition the result is empty.

    Raises ZeroDivisionError if elements satisfy the condition but their
    total probability is zero.
    """
    selected = {}
    total_mass = 0.0
    for element in pmf:
        if condition(element):
            selected[element] = pmf[element]
            total_mass += pmf[element]
    # Renormalize so the surviving probabilities sum to 1.
    for element in selected:
        selected[element] /= total_mass
    return selected
# Compute the prefix frequencies.
# Prefixes are n-grams that appear at the beginning of words, i.e. those that
# start with the "^" boundary marker; conditional() renormalizes them.
prefix_frequencies = conditional(frequencies, lambda gram: gram.startswith("^"))
def evolve(word):
    """Generate the next character for `word` according to the Markov chain.

    word: the string built so far; callers should make sure len(word) >= n.

    Returns a one-character string: the last character of an n-gram sampled
    from the transitions of the word's final n-gram. If that final n-gram is
    not in the model, a random lowercase letter or "$" is returned instead so
    generation keeps moving.
    """
    # Grab the last n characters and make sure the n-gram is in our model.
    gram = word[-n:]
    if gram not in transitions:
        # Unknown state: fall back to a uniformly random symbol.
        return random.choice(string.ascii_lowercase + "$")
    # Successor n-grams overlap the current one in all but their last
    # character, so only that last character is new.
    return sample(transitions[gram])[-1:]
def gen_word():
    """Generate a word according to the Markov chain.

    Starts from a sampled word-initial n-gram and appends one character at a
    time until the "$" terminator is produced. Each time a terminator is
    produced while the string is still shorter than 8 characters, there is a
    chance (random.random() > 0.7) that another prefix is appended, so the
    result can be several word-like lexemes concatenated together.

    Returns the generated string with all "^" and "$" markers removed.
    """
    # Start with a prefix.
    word = sample(prefix_frequencies)
    # Keep going until the Markov chain adds a terminator to the word.
    while word[-1] != "$":
        # Generate a new letter and append it to the word.
        word += evolve(word)
        # Optional: sometimes domains are multiple word-like lexemes
        # concatenated together. Appending a prefix that does not itself end
        # in "$" makes the loop continue with a second lexeme.
        if word[-1] == "$" and random.random() > 0.7 and len(word) < 8:
            word += sample(prefix_frequencies)
    # Remove the boundary markers and return the word.
    return word.replace("^", "").replace("$", "")
def check(domain, timeout=4.0):
    """Check whether a domain is taken (e.g., "example.com").

    domain: the domain name passed to the "whois" command.
    timeout: wall-clock seconds to wait for "whois" before giving up.

    Returns True if the domain is taken, False if it is available (the whois
    output contains the text "No match for"), or None if the request timed
    out.

    Raises OSError if the "whois" command cannot be started.
    """
    process = subprocess.Popen(
        ["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    try:
        # Use wall-clock time: time.clock() counts CPU time on Unix, which
        # barely advances while this loop sleeps, so the timeout would
        # effectively never expire (and time.clock() is gone in Python 3.8+).
        deadline = time.time() + timeout
        while time.time() < deadline:
            if process.poll() is not None:
                output = process.stdout.read()
                # On Python 3 the pipe yields bytes; decode before matching.
                if not isinstance(output, str):
                    output = output.decode("utf-8", "replace")
                return "No match for" not in output
            time.sleep(0.1)
        # Timed out: stop the child and reap it so no zombie is left behind.
        try:
            process.kill()
        except OSError:
            # The process exited on its own between the last poll and kill().
            pass
        else:
            process.wait()
        return None
    finally:
        # Release the pipe file descriptors on every path.
        process.stdout.close()
        process.stderr.close()
# Generate domain names forever (only when run as a script, so importing this
# module does not start an endless loop).
if __name__ == "__main__":
    while True:
        # Generate a few candidates and pick the shortest; min() returns the
        # first shortest one, the same element a stable sort would put first.
        domain = min([gen_word() for _ in range(3)], key=len) + ".com"
        # check() could return True, False, or None; only an explicit False
        # means available. Taken domains and timed-out lookups are skipped
        # silently.
        if check(domain) is False:
            print(domain + " <-- Available!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment