Skip to content

Instantly share code, notes, and snippets.

@backupbrain
Created November 14, 2018 18:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save backupbrain/17a97ff87981afa655f6f9fafee935c2 to your computer and use it in GitHub Desktop.
Save backupbrain/17a97ff87981afa655f6f9fafee935c2 to your computer and use it in GitHub Desktop.
import re

import nltk
import requests
from lxml import etree
from lxml import html
from nltk.corpus import stopwords
# English stop words; used below to discard candidates such as a lone "the"
stop_words = set(stopwords.words("english"))

# The company whose website we are looking for
company_name = "The Boring Company"
# Terms expected to appear somewhere in the company's website text
search_terms = ["flamethrower"]  # The Boring Company sells flamethrowers

# Tokenize the lower-cased company name into individual words
company_words = nltk.word_tokenize(company_name.lower())
num_words = len(company_words)

# Glue every contiguous run of words together: each run is a candidate
# domain label ("the", "theboring", "boring", "theboringcompany", ...)
blended_words = [
    "".join(company_words[first:last])
    for last in range(num_words + 1)
    for first in range(last)
]

# Discard candidates that are nothing more than a stop word
blended_words = [label for label in blended_words if label not in stop_words]

# Longest candidates first (stable, so ties keep their generation order)
blended_words.sort(key=len, reverse=True)
# Top-level domains to try; earlier entries are probed first
domain_suffixes = ['com', 'co', 'io', 'ca', 'net', 'org']
# Prefixes to put in front of each candidate label ('' means no prefix)
popular_prefixes = ['', 'get']

# Combine every suffix, candidate label and prefix into a domain name.
# Iteration order: suffix (outermost), then label, then prefix (innermost).
possible_domains = [
    "{}{}.{}".format(prefix, blended_word, domain_suffix)
    for domain_suffix in domain_suffixes
    for blended_word in blended_words
    for prefix in popular_prefixes
]
# Test domain connectivity
# Start from the requests default headers but send a desktop Chrome
# User-Agent string instead of the library default.
headers = requests.utils.default_headers()
headers.update({
    "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
})

# Maps each candidate domain to its probe result:
#   "url"      - the URL that was requested
#   "is_live"  - True when the request succeeded with a status below 400
#   "response" - the requests response, or None when the request failed
domains = {}
for possible_domain in possible_domains:
    url = "http://{}".format(possible_domain)
    print("Testing {}".format(url))
    response = None
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=3.0
        )
    except requests.exceptions.RequestException:
        # Most candidates will not resolve or answer; that is expected and
        # simply means "not live". Only request failures are swallowed, so
        # Ctrl-C and genuine programming errors still surface.
        pass
    is_live = response is not None and response.status_code < 400
    domains[possible_domain] = {
        "url": url,
        "is_live": is_live,
        "response": response
    }
# Test if <title> tag matches company name
# For every live domain, parse the response body and record the parsed tree
# under "dom" and whether its <title> contains the company name under
# "do_titles_match". Domains that are not live get dom=None and False.
for domain, data in domains.items():
    do_titles_match = False
    dom = None
    if data["is_live"]:
        try:
            dom = html.fromstring(data["response"].text)
        except (etree.ParserError, ValueError):
            # An empty body raises ParserError and a str carrying an XML
            # encoding declaration raises ValueError. Fall back to an empty
            # document so one odd site does not abort the whole run and the
            # "dom" entry of a live domain is always a usable tree.
            dom = html.fromstring("<html><head></head><body></body></html>")
        titles = dom.xpath("/html/head/title")
        if titles:
            # .text is None for an empty <title></title>
            title = titles[0].text or ""
            do_titles_match = company_name in title
    data["dom"] = dom
    data["do_titles_match"] = do_titles_match
# Remove unwanted elements from HTML DOM
def strip_elements(dom_tree, xpaths):
    """Remove every element matched by *xpaths* from *dom_tree*, in place.

    Args:
        dom_tree: An lxml element; each XPath is evaluated against it.
        xpaths: Iterable of XPath expressions selecting elements to drop.

    Returns:
        The same ``dom_tree`` object, after the matched elements (and their
        children) have been removed.
    """
    for xpath in xpaths:
        for element in dom_tree.xpath(xpath):
            parent = element.getparent()
            if parent is None:
                # A root element has no parent to remove it from; skip it
                # rather than fail with AttributeError.
                continue
            if element.tail:
                # lxml's remove() discards the element's tail text, i.e. the
                # page text that follows the closing tag. Re-attach it to the
                # preceding sibling, or to the parent when there is none, so
                # the surrounding text survives the removal.
                previous = element.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + element.tail
                else:
                    parent.text = (parent.text or "") + element.tail
            parent.remove(element)
    return dom_tree
# Search for context-sensitive words in website text
# The page text is lower-cased before searching, so the terms must be
# lower-cased too; otherwise a term containing uppercase could never match.
lowered_terms = [search_term.lower() for search_term in search_terms]
for domain, data in domains.items():
    does_content_match = False
    if data["is_live"]:
        # Drop non-visible markup before extracting text (mutates the tree)
        dom_tree_stripped = strip_elements(
            data["dom"],
            ["//head", "//script", "//style", "//link"]
        )
        text_content = dom_tree_stripped.text_content().lower()
        # Normalize whitespace: collapse runs of line breaks into one newline,
        # runs of tabs/spaces into one space, then drop the space that is
        # left at the start of a line.
        text_content = re.sub("[\r\n]+", "\n", text_content)
        text_content = re.sub("[\t ]+", " ", text_content)
        text_content = re.sub("(\n )+", "\n", text_content)
        # A single matching term is enough
        does_content_match = any(
            term in text_content for term in lowered_terms
        )
    data["does_content_match"] = does_content_match
# Settle on company URL
# Pick the first domain (in probe order) that is live, whose title matched
# and whose content matched; fall back to an empty string when none did.
url = next(
    (
        candidate
        for candidate, info in domains.items()
        if info["is_live"] is True
        and info["do_titles_match"] is True
        and info["does_content_match"] is True
    ),
    "",
)
print(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment