adamneilson/gist:7230370

## gistfile1.txt
#!/usr/bin/python -O
import random
import string
import subprocess
import threading
import time

# Load the system dictionary and keep only words made purely of ASCII letters.
# Every kept word is lower-cased and wrapped as "^word$": "^" marks the start
# of a word and "$" marks its end, so n-grams can encode word boundaries.
with open("/usr/share/dict/words") as wordlist: # the with-block closes the file handle
  words = [line.strip().lower() for line in wordlist]
words = ["^" + w + "$" for w in words if w and all(c in string.ascii_lowercase for c in w)]

# construct a discrete-time markov chain of n-grams
n = 5 # this is the "n" in n-grams, try adjusting this for different results
transitions = {} # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
frequencies = {} # keys are n-grams, values are normalized frequencies [0, 1] of occurrence in the wordlist
for word in words:
  # every window of n characters counts as one occurrence of that n-gram
  for i in range(len(word) + 1 - n):
    gram = word[i : i + n]
    frequencies[gram] = frequencies.get(gram, 0) + 1
  # every pair of overlapping windows (shifted by one character) counts as one transition
  for i in range(len(word) - n):
    gram = word[i : i + n]
    successor = word[i + 1 : i + n + 1]
    counts = transitions.setdefault(gram, {})
    counts[successor] = counts.get(successor, 0) + 1

# Turn occurrence counts into a real distribution: divide by the total number
# of n-gram occurrences (not by the number of distinct n-grams), so that the
# values lie in [0, 1] and sum to one.
total_occurrences = float(sum(frequencies.values()))
for gram in frequencies:
  frequencies[gram] /= total_occurrences

# Turn each n-gram's successor counts into transition probabilities.
for gram in transitions:
  successors = transitions[gram]
  total = float(sum(successors.values()))
  for successor in successors:
    successors[successor] /= total

# sample a probability mass function
#   pmf: dict mapping elements to probabilities
#   returns one key of pmf, drawn by walking the cumulative distribution
#   raises IndexError if pmf is empty
def sample(pmf):
  threshold = random.random()
  cdf = 0.0
  for e in pmf:
    cdf += pmf[e]
    if cdf >= threshold:
      return e
  # The accumulated mass never reached the threshold (e.g. rounding left the
  # total slightly below 1), so fall back to a uniform pick. list() is needed
  # because dict views cannot be indexed by random.choice on Python 3.
  return random.choice(list(pmf))

# restrict a probability mass function to the elements accepted by a predicate
# and rescale the surviving probabilities by their combined mass
#   pmf:       dict mapping elements to probabilities
#   condition: boolean-valued function to condition on
#   returns a new dict; an empty dict if no element satisfies the condition
def conditional(pmf, condition):
  kept = {}
  mass = 0.0
  for element, probability in pmf.items():
    if not condition(element):
      continue
    kept[element] = probability
    mass += probability
  return {element: probability / mass for element, probability in kept.items()}

# distribution over word-initial n-grams, i.e. the n-grams whose first
# character is the "^" start-of-word marker
prefix_frequencies = conditional(frequencies, lambda gram: gram[0] == "^")

# produce the next character for a partially generated word
#   word: the text generated so far (callers must make sure len(word) >= n)
#   returns a single character taken from the sampled successor n-gram, or a
#   random lowercase letter / "$" when the model does not know the current n-gram
def evolve(word):
  tail = word[-n:]
  try:
    successors = transitions[tail]
  except KeyError:
    # the model has never seen this n-gram; emit a uniformly random character
    # so that generation keeps moving
    return random.choice(string.ascii_lowercase + "$")

  # draw the following n-gram and keep only its last (newly added) character
  following = sample(successors)
  return following[-1:]

# build one pseudo-word by walking the markov chain
#   returns the generated text with every "^" and "$" boundary marker removed
def gen_word():
  # seed the walk with an n-gram that begins a word
  text = sample(prefix_frequencies)

  # keep extending until the text ends with the "$" terminator
  while text[-1] != "$":
    text += evolve(text)

    if text[-1] != "$":
      continue

    # optional: domains are sometimes several word-like lexemes glued together,
    # so a short finished word is occasionally continued with a fresh prefix
    wants_more = random.random() > 0.7
    if wants_more and len(text) < 8:
      text += sample(prefix_frequencies)

  # drop the boundary markers before handing the word back
  return "".join(ch for ch in text if ch not in "^$")

# check whether a domain is taken (e.g., "example.com")
#   domain:  the domain name passed to the "whois" command
#   timeout: wall-clock seconds to wait for "whois" before giving up
# returns True if the domain is taken, False if it is available, or None if the request timed out
# raises OSError if the "whois" command cannot be started
def check(domain, timeout=4.0):
  process = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  # A watchdog timer enforces the timeout in wall-clock time. (time.clock()
  # measured CPU time on Unix, which barely advances while waiting, and it no
  # longer exists in Python 3.8+.)
  expired = threading.Event()
  def abort():
    expired.set()
    try:
      process.kill()
    except OSError:
      pass # the process exited on its own just before the timer fired
  watchdog = threading.Timer(timeout, abort)
  watchdog.start()
  try:
    # communicate() drains stdout and stderr while waiting, so a chatty whois
    # cannot block forever on a full pipe
    output, _ = process.communicate()
  finally:
    watchdog.cancel()

  if expired.is_set():
    return None
  if not isinstance(output, str):
    # Python 3 hands back bytes; decode leniently since only an ASCII marker is searched for
    output = output.decode("utf-8", "replace")
  return "No match for" not in output

# generate domain names forever, printing each one that "whois" reports as available
def main():
  while True:
    # generate a few candidate words and keep the shortest (the first one wins ties)
    domain = min([gen_word() for _ in range(3)], key=len) + ".com"

    # check() returns True (taken), False (available) or None (timed out);
    # only a definite False is reported, everything else is skipped silently
    if check(domain) is False:
      print(domain + " <-- Available!")

if __name__ == "__main__":
  main()
	#!/usr/bin/python -O
	import random
	import string
	import subprocess
	import threading
	import time

	# Load the system dictionary and keep only words made purely of ASCII letters.
	# Every kept word is lower-cased and wrapped as "^word$": "^" marks the start
	# of a word and "$" marks its end, so n-grams can encode word boundaries.
	with open("/usr/share/dict/words") as wordlist: # the with-block closes the file handle
	  words = [line.strip().lower() for line in wordlist]
	words = ["^" + w + "$" for w in words if w and all(c in string.ascii_lowercase for c in w)]

	# construct a discrete-time markov chain of n-grams
	n = 5 # this is the "n" in n-grams, try adjusting this for different results
	transitions = {} # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
	frequencies = {} # keys are n-grams, values are normalized frequencies [0, 1] of occurrence in the wordlist
	for word in words:
	  # every window of n characters counts as one occurrence of that n-gram
	  for i in range(len(word) + 1 - n):
	    gram = word[i : i + n]
	    frequencies[gram] = frequencies.get(gram, 0) + 1
	  # every pair of overlapping windows (shifted by one character) counts as one transition
	  for i in range(len(word) - n):
	    gram = word[i : i + n]
	    successor = word[i + 1 : i + n + 1]
	    counts = transitions.setdefault(gram, {})
	    counts[successor] = counts.get(successor, 0) + 1

	# Turn occurrence counts into a real distribution: divide by the total number
	# of n-gram occurrences (not by the number of distinct n-grams), so that the
	# values lie in [0, 1] and sum to one.
	total_occurrences = float(sum(frequencies.values()))
	for gram in frequencies:
	  frequencies[gram] /= total_occurrences

	# Turn each n-gram's successor counts into transition probabilities.
	for gram in transitions:
	  successors = transitions[gram]
	  total = float(sum(successors.values()))
	  for successor in successors:
	    successors[successor] /= total

	# sample a probability mass function
	#   pmf: dict mapping elements to probabilities
	#   returns one key of pmf, drawn by walking the cumulative distribution
	#   raises IndexError if pmf is empty
	def sample(pmf):
	  threshold = random.random()
	  cdf = 0.0
	  for e in pmf:
	    cdf += pmf[e]
	    if cdf >= threshold:
	      return e
	  # The accumulated mass never reached the threshold (e.g. rounding left the
	  # total slightly below 1), so fall back to a uniform pick. list() is needed
	  # because dict views cannot be indexed by random.choice on Python 3.
	  return random.choice(list(pmf))

	# restrict a probability mass function to the elements accepted by a predicate
	# and rescale the surviving probabilities by their combined mass
	#   pmf:       dict mapping elements to probabilities
	#   condition: boolean-valued function to condition on
	#   returns a new dict; an empty dict if no element satisfies the condition
	def conditional(pmf, condition):
	  total_mass = 0.0
	  cond = {}
	  for e in pmf:
	    if condition(e):
	      cond[e] = pmf[e]
	      total_mass += pmf[e]
	  # rescale so that the kept probabilities sum to one
	  for e in cond:
	    cond[e] /= total_mass
	  return cond

	# distribution over word-initial n-grams, i.e. the n-grams whose first
	# character is the "^" start-of-word marker
	prefix_frequencies = conditional(frequencies, lambda gram: gram[0] == "^")

	# generate a new letter according to the markov chain
	#   word: the text generated so far (callers must make sure len(word) >= n)
	#   returns a single character taken from the sampled successor n-gram, or a
	#   random lowercase letter / "$" when the model does not know the current n-gram
	def evolve(word):
	  # grab the last n characters and make sure the n-gram is in our model
	  gram = word[-n:]
	  if gram not in transitions:
	    # unknown n-gram: return a random character to keep things moving
	    return random.choice(string.ascii_lowercase + "$")

	  # sample the n-grams that we can transition to and keep only the new last character
	  return sample(transitions[gram])[-1:]

	# generate a word according to the markov chain
	#   returns the generated text with every "^" and "$" boundary marker removed
	def gen_word():
	  # start with a prefix (an n-gram that begins a word)
	  word = sample(prefix_frequencies)

	  # wait until the markov chain adds a terminator to the word
	  while word[-1] != "$":
	    # generate a new letter and append it to the word
	    word += evolve(word)

	    # optional: sometimes domains are multiple word-like lexemes concatenated together,
	    # so a short finished word is occasionally continued with a fresh prefix
	    if word[-1] == "$" and random.random() > 0.7 and len(word) < 8:
	      word += sample(prefix_frequencies)

	  # remove the boundary markers and return the word
	  return word.replace("^", "").replace("$", "")

	# check whether a domain is taken (e.g., "example.com")
	#   domain:  the domain name passed to the "whois" command
	#   timeout: wall-clock seconds to wait for "whois" before giving up
	# returns True if the domain is taken, False if it is available, or None if the request timed out
	# raises OSError if the "whois" command cannot be started
	def check(domain, timeout=4.0):
	  process = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	  # A watchdog timer enforces the timeout in wall-clock time. (time.clock()
	  # measured CPU time on Unix, which barely advances while waiting, and it no
	  # longer exists in Python 3.8+.)
	  expired = threading.Event()
	  def abort():
	    expired.set()
	    try:
	      process.kill()
	    except OSError:
	      pass # the process exited on its own just before the timer fired
	  watchdog = threading.Timer(timeout, abort)
	  watchdog.start()
	  try:
	    # communicate() drains stdout and stderr while waiting, so a chatty whois
	    # cannot block forever on a full pipe
	    output, _ = process.communicate()
	  finally:
	    watchdog.cancel()

	  if expired.is_set():
	    return None
	  if not isinstance(output, str):
	    # Python 3 hands back bytes; decode leniently since only an ASCII marker is searched for
	    output = output.decode("utf-8", "replace")
	  return "No match for" not in output

	# generate domain names forever, printing each one that "whois" reports as available
	def main():
	  while True:
	    # generate a few candidate words and keep the shortest (the first one wins ties)
	    domain = min([gen_word() for _ in range(3)], key=len) + ".com"

	    # check() returns True (taken), False (available) or None (timed out);
	    # only a definite False is reported, everything else is skipped silently
	    if check(domain) is False:
	      print(domain + " <-- Available!")

	if __name__ == "__main__":
	  main()