# hcgatewood/shoutkey-comb.py

## shoutkey-comb.py
# Comb through plausible shoutkey.com URLs looking for redirections.
#
# Prints out URLs with valid redirects. Will also write them to a file
# if KNOWN_WORDS_LOC is not None.
#
# NOTES
# - Words list: https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt
# - Kill the process with Ctrl+Z since the requests module improperly
#   catches exceptions

import os
import requests

# Relevant URLs, file locations, and HTML strings
BASE_URL = 'http://shoutkey.com/{}'  # '{}' is filled with the candidate key
WORDS_LOC = os.path.expanduser('~/Desktop/common-words.txt')
# Resolved against the current user's home directory, like WORDS_LOC.
# expanduser() only expands a leading '~', so an absolute per-user path here
# would make the call a no-op and tie the script to a single machine.
KNOWN_WORDS_LOC = os.path.expanduser('~/Desktop/known-words.txt')
FAIL_STR = 'is not an active ShoutKey!'  # text marking an unused key's page

# Numeric constants
MIN_KEY_LEN = 3  # shortest word kept from the words list
MAX_KEY_LEN = 12  # longest word kept from the words list
UPDATE_INTERVAL = 100  # print a progress banner every this many keys
TIMEOUT = 1  # seconds
FIRST_WORDS_IDX = 0  # skip initial words in words list
MAX_RES_LEN = 1 * 10**6  # longer responses count as hits without a scan
VERBOSE = 1  # { 0, 1, 2 }

include_known_words = False  # include words we already know shoutkey uses
write_known = False  # write newly discovered shoutkey words to file

# Known words file exists?
# Hits are only appended to the known-words file when it already exists, and
# without that file there are no known words to include. Only the existence
# check is needed here, so no file handle is opened (and none is leaked).
if KNOWN_WORDS_LOC is not None and os.path.isfile(KNOWN_WORDS_LOC):
    write_known = True
else:
    include_known_words = False

# Get words file
# The context manager closes the handle as soon as the contents are read.
with open(WORDS_LOC) as words_file:
    words = words_file.read().split()
# Keep words whose length lies in [MIN_KEY_LEN, MAX_KEY_LEN], lower-cased.
words = [
    val.lower() for val in words
    if MIN_KEY_LEN <= len(val) <= MAX_KEY_LEN]
words = words[FIRST_WORDS_IDX:]  # chop off beginning words

# Optionally try the already-confirmed keys first (de-duplicated and sorted).
if include_known_words:
    with open(KNOWN_WORDS_LOC, 'r') as known:
        known_words = sorted(set(known.read().split()))
    words = known_words + words

if VERBOSE >= 1:
    print('KEYS TO CONSIDER:', len(words))
    print('FIRST WORDS:', words[:10])

# Browse shoutkey words
# Request BASE_URL/<key> for every candidate word and report the keys whose
# HTML page does not contain FAIL_STR.
# enumerate() advances the progress counter for every key, including the
# ones skipped below, so the '-----N-----' banner tracks the position in the
# words list and is printed once per interval.
for count, key in enumerate(words, start=FIRST_WORDS_IDX):
    # Handle verbosity
    if VERBOSE >= 1 and count % UPDATE_INTERVAL == 0:
        print('-----{}-----'.format(count))
    if VERBOSE >= 2:
        print(key)

    # Get the url's response. Only request-level failures (timeouts,
    # connection errors, invalid URLs, ...) are skipped; KeyboardInterrupt
    # and SystemExit propagate, so Ctrl+C stops the run.
    url = BASE_URL.format(key)
    try:
        res = requests.get(url, timeout=TIMEOUT, stream=True)
    except requests.exceptions.RequestException:
        continue

    try:
        # Continue early if the res isn't html; a response without a
        # content-type header is treated as "not html" instead of crashing.
        if 'text/html' not in res.headers.get('content-type', ''):
            continue
        # With stream=True the body is downloaded on this access, so the
        # read itself can fail with a request error.
        res_str = res.text
    except requests.exceptions.RequestException:
        continue
    finally:
        # Release the streamed connection on every path.
        res.close()

    # Oversized pages count as hits without being scanned; otherwise the key
    # is a hit when the failure text is absent.
    success = len(res_str) > MAX_RES_LEN or FAIL_STR not in res_str
    if not success:
        continue

    # Found a link!
    if write_known:
        # Write the word to file
        with open(KNOWN_WORDS_LOC, 'a') as known:
            known.write(key + '\n')
        if VERBOSE >= 1:
            print(url, '(key written to file)')
        else:
            print(url)
    else:
        # Just print to console
        print(url)
	# Comb through plausible shoutkey.com URLs looking for redirections.
	#
	# Prints out URLs with valid redirects. Will also write them to a file
	# if KNOWN_WORDS_LOC is not None.
	#
	# NOTES
	# - Words list: https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt
	# - Kill the process with Ctrl+Z since the requests module improperly
	# catches exceptions

	import os
	import requests

	# Relevant URLs, file locations, and HTML strings
	BASE_URL = 'http://shoutkey.com/{}'  # '{}' is filled with the candidate key
	WORDS_LOC = os.path.expanduser('~/Desktop/common-words.txt')
	# Resolved against the current user's home directory, like WORDS_LOC.
	# expanduser() only expands a leading '~', so an absolute per-user path
	# here would make the call a no-op and tie the script to a single machine.
	KNOWN_WORDS_LOC = os.path.expanduser('~/Desktop/known-words.txt')
	FAIL_STR = 'is not an active ShoutKey!'  # text marking an unused key's page

	# Numeric constants
	MIN_KEY_LEN = 3  # shortest word kept from the words list
	MAX_KEY_LEN = 12  # longest word kept from the words list
	UPDATE_INTERVAL = 100  # print a progress banner every this many keys
	TIMEOUT = 1  # seconds
	FIRST_WORDS_IDX = 0  # skip initial words in words list
	MAX_RES_LEN = 1 * 10**6  # longer responses count as hits without a scan
	VERBOSE = 1  # { 0, 1, 2 }

	include_known_words = False  # include words we already know shoutkey uses
	write_known = False  # write newly discovered shoutkey words to file

	# Known words file exists?
	# Hits are only appended to the known-words file when it already exists,
	# and without that file there are no known words to include. Only the
	# existence check is needed here, so no file handle is opened or leaked.
	if KNOWN_WORDS_LOC is not None and os.path.isfile(KNOWN_WORDS_LOC):
		write_known = True
	else:
		include_known_words = False

	# Get words file
	# The context manager closes the handle as soon as the contents are read.
	with open(WORDS_LOC) as words_file:
		words = words_file.read().split()
	# Keep words whose length lies in [MIN_KEY_LEN, MAX_KEY_LEN], lower-cased.
	words = [
		val.lower() for val in words
		if MIN_KEY_LEN <= len(val) <= MAX_KEY_LEN]
	words = words[FIRST_WORDS_IDX:]  # chop off beginning words

	# Optionally try the already-confirmed keys first (de-duplicated, sorted).
	if include_known_words:
		with open(KNOWN_WORDS_LOC, 'r') as known:
			known_words = sorted(set(known.read().split()))
		words = known_words + words

	if VERBOSE >= 1:
		print('KEYS TO CONSIDER:', len(words))
		print('FIRST WORDS:', words[:10])

	# Browse shoutkey words
	# Request BASE_URL/<key> for every candidate word and report the keys
	# whose HTML page does not contain FAIL_STR.
	# enumerate() advances the progress counter for every key, including the
	# ones skipped below, so the '-----N-----' banner tracks the position in
	# the words list and is printed once per interval.
	for count, key in enumerate(words, start=FIRST_WORDS_IDX):
		# Handle verbosity
		if VERBOSE >= 1 and count % UPDATE_INTERVAL == 0:
			print('-----{}-----'.format(count))
		if VERBOSE >= 2:
			print(key)

		# Get the url's response. Only request-level failures (timeouts,
		# connection errors, invalid URLs, ...) are skipped; KeyboardInterrupt
		# and SystemExit propagate, so Ctrl+C stops the run.
		url = BASE_URL.format(key)
		try:
			res = requests.get(url, timeout=TIMEOUT, stream=True)
		except requests.exceptions.RequestException:
			continue

		try:
			# Continue early if the res isn't html; a response without a
			# content-type header is treated as "not html", not a crash.
			if 'text/html' not in res.headers.get('content-type', ''):
				continue
			# With stream=True the body is downloaded on this access, so the
			# read itself can fail with a request error.
			res_str = res.text
		except requests.exceptions.RequestException:
			continue
		finally:
			# Release the streamed connection on every path.
			res.close()

		# Oversized pages count as hits without being scanned; otherwise the
		# key is a hit when the failure text is absent.
		success = len(res_str) > MAX_RES_LEN or FAIL_STR not in res_str
		if not success:
			continue

		# Found a link!
		if write_known:
			# Write the word to file
			with open(KNOWN_WORDS_LOC, 'a') as known:
				known.write(key + '\n')
			if VERBOSE >= 1:
				print(url, '(key written to file)')
			else:
				print(url)
		else:
			# Just print to console
			print(url)