# backupbrain/guess_url_from_company_name.py

## guess_url_from_company_name.py
import re
import requests
import nltk
from lxml import html
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

# Company to look up, plus words expected to appear on its real website
company_name = "The Boring Company"
search_terms = ["flamethrower"]  # The Boring Company sells flamethrowers

# Tokenize the lower-cased company name
company_words = nltk.word_tokenize(company_name.lower())

# Glue every contiguous run of words together into one candidate label.
# For words ["the", "boring", "company"] this yields, in order:
# "the", "theboring", "boring", "theboringcompany", "boringcompany", "company"
num_words = len(company_words)
blended_words = [
    "".join(company_words[first:last])
    for last in range(num_words + 1)
    for first in range(last)
]

# Discard candidates that are exactly an English stop word, then put the
# longest candidates first (the sort is stable, so ties keep their order)
blended_words = [label for label in blended_words if label not in stop_words]
blended_words.sort(key=len, reverse=True)

# Top-level domains and common name prefixes to try
domain_suffixes = ['com', 'co', 'io', 'ca', 'net', 'org']
popular_prefixes = ['', 'get']

# Candidate host names, ordered by suffix, then word blend, then prefix
possible_domains = [
    "{}{}.{}".format(prefix, blended_word, domain_suffix)
    for domain_suffix in domain_suffixes
    for blended_word in blended_words
    for prefix in popular_prefixes
]

# Test domain connectivity
# Send a desktop-browser User-Agent on top of requests' default headers
headers = requests.utils.default_headers()
headers.update({
    "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/69.0.3497.100 Safari/537.36"
})

# Maps each candidate domain to its URL, a liveness flag and the response
# object (None when the request itself failed)
domains = {}
for possible_domain in possible_domains:
    is_live = False
    url = "http://{}".format(possible_domain)
    response = None
    print("Testing {}".format(url))
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=3.0
        )
    except requests.exceptions.RequestException:
        # Connection failure, timeout, invalid URL, ...: the candidate is
        # simply not live. Catching only requests' own errors (instead of
        # a bare ``except``) lets Ctrl-C and programming errors propagate.
        response = None
    else:
        # Any status code below 400 counts as a live site
        is_live = response.status_code < 400
    domains[possible_domain] = {
        "url": url,
        "is_live": is_live,
        "response": response
    }

# Test if <title> tag matches company name
from lxml import etree  # local import: only needed for ParserError below

for domain, data in domains.items():
    do_titles_match = False
    if data["is_live"] is True:
        response = data["response"]
        try:
            try:
                dom = html.fromstring(response.text)
            except ValueError:
                # lxml rejects str input that carries an XML encoding
                # declaration; hand it the raw bytes instead.
                dom = html.fromstring(response.content)
        except etree.ParserError:
            # Empty body: fall back to an empty document so the later
            # content check still receives a usable tree.
            dom = html.fromstring("<html><head></head><body></body></html>")
        titles = dom.xpath("/html/head/title")
        # A <title> element without text has ``text`` of None, which
        # cannot be searched with ``in``.
        if titles and titles[0].text is not None:
            do_titles_match = company_name in titles[0].text
        # The parsed tree is stored only for live domains
        data["dom"] = dom
    data["do_titles_match"] = do_titles_match

# Remove unwanted elements from HTML DOM
def strip_elements(dom_tree, xpaths):
    """Remove every element matched by ``xpaths`` from ``dom_tree`` in place.

    Text that directly follows a removed element (its ``tail``) is kept and
    re-attached to the preceding sibling or to the parent.

    Args:
        dom_tree: Parsed lxml element to prune.
        xpaths: Iterable of XPath expressions; each matching element is
            removed together with its children.

    Returns:
        The same ``dom_tree`` object, after pruning.
    """
    for xpath in xpaths:
        for element in dom_tree.xpath(xpath):
            parent = element.getparent()
            if parent is None:
                # The match has no parent (e.g. it is the tree root), so
                # there is nothing to remove it from.
                continue
            # lxml keeps the text following an element in its ``tail`` and
            # drops it together with the element, so move that text onto
            # whatever precedes the element before removing it.
            if element.tail:
                previous = element.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + element.tail
                else:
                    parent.text = (parent.text or "") + element.tail
            parent.remove(element)
    return dom_tree

# Search for context-sensitive words in website text
for domain, data in domains.items():
    does_content_match = False
    if data["is_live"] is True:
        dom = data["dom"]
        # Drop <head>, <script>, <style> and <link> elements before
        # extracting the page text
        dom_tree_stripped = strip_elements(
            dom,
            ["//head", "//script", "//style", "//link"]
        )
        text_content = dom_tree_stripped.text_content().lower()
        # Collapse runs of line breaks, then runs of tabs/spaces, then
        # remove the single space left at the start of a line
        text_content = re.sub("[\r\n]+", "\n", text_content)
        text_content = re.sub("[\t ]+", " ", text_content)
        text_content = re.sub("(\n )+", "\n", text_content)
        # The page text was lower-cased above, so lower-case each term as
        # well; otherwise a term containing capitals could never match.
        does_content_match = any(
            search_term.lower() in text_content
            for search_term in search_terms
        )
    data["does_content_match"] = does_content_match

# Pick the first candidate (in insertion order) that passed all three
# checks; ``url`` ends up holding its bare domain name, or "" if none did
url = next(
    (
        candidate
        for candidate, checks in domains.items()
        if checks["is_live"] is True
        and checks["do_titles_match"] is True
        and checks["does_content_match"] is True
    ),
    ""
)
print(url)
	import re
	import requests
	import nltk
	from lxml import html
	from nltk.corpus import stopwords
	stop_words = set(stopwords.words("english"))

	# Company to look up, plus words expected to appear on its real website
	company_name = "The Boring Company"
	search_terms = ["flamethrower"]  # The Boring Company sells flamethrowers

	# Tokenize the lower-cased company name
	company_words = nltk.word_tokenize(company_name.lower())

	# Glue every contiguous run of words together into one candidate label.
	# For words ["the", "boring", "company"] this yields, in order:
	# "the", "theboring", "boring", "theboringcompany", "boringcompany",
	# "company". (The nested loops that did this had lost their indentation.)
	num_words = len(company_words)
	blended_words = [
	    "".join(company_words[first:last])
	    for last in range(num_words + 1)
	    for first in range(last)
	]

	# Discard candidates that are exactly an English stop word
	blended_words = [
	    label
	    for label in blended_words
	    if label not in stop_words
	]

	# Order by longest to shortest (stable, so ties keep their order)
	blended_words = sorted(blended_words, key=len, reverse=True)

	# Top-level domains and common name prefixes to try
	domain_suffixes = [
	    'com', 'co', 'io', 'ca', 'net', 'org'
	]
	popular_prefixes = [
	    '', 'get'
	]

	# Candidate host names, ordered by suffix, then word blend, then prefix
	# (written as one comprehension; the nested loops had lost their bodies)
	possible_domains = [
	    "{}{}.{}".format(prefix, blended_word, domain_suffix)
	    for domain_suffix in domain_suffixes
	    for blended_word in blended_words
	    for prefix in popular_prefixes
	]

	# Test domain connectivity
	# Send a desktop-browser User-Agent on top of requests' default headers
	headers = requests.utils.default_headers()
	headers.update({
	    "User-Agent":
	    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) "
	    "AppleWebKit/537.36 (KHTML, like Gecko) "
	    "Chrome/69.0.3497.100 Safari/537.36"
	})

	# Maps each candidate domain to its URL, a liveness flag and the response
	# object (None when the request itself failed)
	domains = {}
	for possible_domain in possible_domains:
	    is_live = False
	    url = "http://{}".format(possible_domain)
	    response = None
	    print("Testing {}".format(url))
	    try:
	        response = requests.get(
	            url,
	            headers=headers,
	            timeout=3.0
	        )
	    except requests.exceptions.RequestException:
	        # Connection failure, timeout, invalid URL, ...: the candidate is
	        # simply not live. Catching only requests' own errors (instead of
	        # a bare ``except``) lets Ctrl-C and programming errors propagate.
	        response = None
	    else:
	        # Any status code below 400 counts as a live site
	        is_live = response.status_code < 400
	    domains[possible_domain] = {
	        "url": url,
	        "is_live": is_live,
	        "response": response
	    }

	# Test if <title> tag matches company name
	from lxml import etree  # local import: only needed for ParserError below

	for domain, data in domains.items():
	    do_titles_match = False
	    if data["is_live"] is True:
	        response = data["response"]
	        try:
	            try:
	                dom = html.fromstring(response.text)
	            except ValueError:
	                # lxml rejects str input that carries an XML encoding
	                # declaration; hand it the raw bytes instead.
	                dom = html.fromstring(response.content)
	        except etree.ParserError:
	            # Empty body: fall back to an empty document so the later
	            # content check still receives a usable tree.
	            dom = html.fromstring("<html><head></head><body></body></html>")
	        titles = dom.xpath("/html/head/title")
	        # A <title> element without text has ``text`` of None, which
	        # cannot be searched with ``in``.
	        if titles and titles[0].text is not None:
	            do_titles_match = company_name in titles[0].text
	        # The parsed tree is stored only for live domains
	        data["dom"] = dom
	    data["do_titles_match"] = do_titles_match

	# Remove unwanted elements from HTML DOM
	def strip_elements(dom_tree, xpaths):
	    """Remove every element matched by ``xpaths`` from ``dom_tree`` in place.

	    Text that directly follows a removed element (its ``tail``) is kept and
	    re-attached to the preceding sibling or to the parent.

	    Args:
	        dom_tree: Parsed lxml element to prune.
	        xpaths: Iterable of XPath expressions; each matching element is
	            removed together with its children.

	    Returns:
	        The same ``dom_tree`` object, after pruning.
	    """
	    for xpath in xpaths:
	        for element in dom_tree.xpath(xpath):
	            parent = element.getparent()
	            if parent is None:
	                # The match has no parent (e.g. it is the tree root), so
	                # there is nothing to remove it from.
	                continue
	            # lxml keeps the text following an element in its ``tail`` and
	            # drops it together with the element, so move that text onto
	            # whatever precedes the element before removing it.
	            if element.tail:
	                previous = element.getprevious()
	                if previous is not None:
	                    previous.tail = (previous.tail or "") + element.tail
	                else:
	                    parent.text = (parent.text or "") + element.tail
	            parent.remove(element)
	    return dom_tree

	# Search for context-sensitive words in website text
	for domain, data in domains.items():
	    does_content_match = False
	    if data["is_live"] is True:
	        dom = data["dom"]
	        # Drop <head>, <script>, <style> and <link> elements before
	        # extracting the page text
	        dom_tree_stripped = strip_elements(
	            dom,
	            ["//head", "//script", "//style", "//link"]
	        )
	        text_content = dom_tree_stripped.text_content().lower()
	        # Collapse runs of line breaks, then runs of tabs/spaces, then
	        # remove the single space left at the start of a line
	        text_content = re.sub("[\r\n]+", "\n", text_content)
	        text_content = re.sub("[\t ]+", " ", text_content)
	        text_content = re.sub("(\n )+", "\n", text_content)
	        # The page text was lower-cased above, so lower-case each term as
	        # well; otherwise a term containing capitals could never match.
	        does_content_match = any(
	            search_term.lower() in text_content
	            for search_term in search_terms
	        )
	    data["does_content_match"] = does_content_match

	# Settle on company URL: take the first candidate (in insertion order)
	# that passed all three checks; ``url`` ends up holding its bare domain
	# name, or "" if none did
	url = ""
	for domain, data in domains.items():
	    if data["is_live"] is True and \
	            data["do_titles_match"] is True and \
	            data["does_content_match"] is True:
	        url = domain
	        break
	print(url)