Skip to content

Instantly share code, notes, and snippets.

@hcgatewood
Last active February 11, 2017 06:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hcgatewood/a939d2bb1f5ad22535b0afe90ae79e9b to your computer and use it in GitHub Desktop.
Save hcgatewood/a939d2bb1f5ad22535b0afe90ae79e9b to your computer and use it in GitHub Desktop.
# Comb through plausible shoutkey.com URLs looking for redirections.
#
# Prints out URLs with valid redirects. Will also append them to a file
# if KNOWN_WORDS_LOC is not None and that file already exists.
#
# NOTES
# - Words list: https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt
# - If Ctrl+C does not stop the process, suspend it with Ctrl+Z: an
# interrupt raised while a request is in flight may be swallowed
import os
import requests
# Relevant URLs, file locations, and HTML strings
BASE_URL = 'http://shoutkey.com/{}'  # '{}' is replaced by the candidate key
# Whitespace-separated list of candidate words to try as keys
WORDS_LOC = os.path.expanduser('~/Desktop/common-words.txt')
# File of keys already confirmed to redirect. Resolved against the current
# user's home directory (like WORDS_LOC) instead of one hard-coded account.
# Set to None to disable reading from / writing to it.
KNOWN_WORDS_LOC = os.path.expanduser('~/Desktop/known-words.txt')
# Text searched for in a response body; its presence marks the key as unused
FAIL_STR = 'is not an active ShoutKey!'
# Numeric constants
MIN_KEY_LEN = 3  # shortest word considered as a key
MAX_KEY_LEN = 12  # longest word considered as a key
UPDATE_INTERVAL = 100  # print a progress marker every this many keys
TIMEOUT = 1  # seconds
FIRST_WORDS_IDX = 0  # skip initial words in words list
MAX_RES_LEN = 1 * 10**6  # longer response bodies are counted as a hit
VERBOSE = 1  # { 0, 1, 2 }
include_known_words = False  # include words we already know shoutkey uses
write_known = False  # write newly discovered shoutkey words to file
# Known words file exists?
# Newly found keys are appended only when the known-words file is already
# present; when it is missing (or disabled with None) there is nothing to
# preload either, so including known words is switched off.
# No file handle is opened here: the file is reopened where it is needed.
if KNOWN_WORDS_LOC is not None and os.path.isfile(KNOWN_WORDS_LOC):
    write_known = True
else:
    include_known_words = False
# Get words file (closed as soon as its contents have been read)
with open(WORDS_LOC) as words_file:
    words = words_file.read().split()
# Lowercase every word and drop those outside the allowed key length
words = [
    val.lower() for val in words
    if MIN_KEY_LEN <= len(val) <= MAX_KEY_LEN]
words = words[FIRST_WORDS_IDX:]  # chop off beginning words
# Optionally try the already-confirmed keys first, de-duplicated and sorted
if include_known_words:
    with open(KNOWN_WORDS_LOC, 'r') as known:
        known_words = sorted(set(known.read().split()))
    words = known_words + words
if VERBOSE >= 1:
    print('KEYS TO CONSIDER:', len(words))
    print('FIRST WORDS:', words[:10])
# Browse shoutkey words
# `count` comes from enumerate so it advances for every key tried, including
# the ones skipped below; progress markers therefore never repeat or stall.
for count, key in enumerate(words, start=FIRST_WORDS_IDX):
    # Handle verbosity
    if VERBOSE >= 1 and count % UPDATE_INTERVAL == 0:
        print('-----{}-----'.format(count))
    if VERBOSE >= 2:
        print(key)

    # Get the url's response
    url = BASE_URL.format(key)
    try:
        # stream=True defers the body download until res.text is read, so
        # non-HTML targets can be rejected from the headers alone
        res = requests.get(url, timeout=TIMEOUT, stream=True)
    except requests.RequestException:
        # Only request failures are skipped; KeyboardInterrupt and SystemExit
        # propagate, so Ctrl+C stops the script
        continue

    try:
        # Continue early if the res isn't html (a missing header counts as
        # "not html" instead of raising KeyError)
        if 'text/html' not in res.headers.get('content-type', ''):
            continue
        res_str = res.text
    except requests.RequestException:
        # The body download itself failed (e.g. dropped connection)
        continue
    finally:
        # Streamed responses hold their connection until closed
        res.close()

    # A body longer than MAX_RES_LEN is counted as a hit without searching
    # it; otherwise the key is live when the failure text is absent
    success = len(res_str) > MAX_RES_LEN or FAIL_STR not in res_str
    if not success:
        continue

    # Found a link!
    if write_known:
        # Write the word to file
        with open(KNOWN_WORDS_LOC, 'a') as known:
            known.write(key + '\n')
        if VERBOSE >= 1:
            print(url, '(key written to file)')
        else:
            print(url)
    else:
        # Just print to console
        print(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment