This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Comb through plausible shoutkey.com URLs looking for redirections.
#
# Prints out URLs with valid redirects. Will also append them to the file
# at KNOWN_WORDS_LOC if it is not None and that file already exists.
#
# NOTES
# - Words list: https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt
# - If Ctrl+C does not stop the process, suspend it with Ctrl+Z and kill
#   the job: an interrupt raised while a request is in flight can be
#   swallowed and treated as a failed request
import os

import requests
# Relevant URLs, file locations, and HTML strings
BASE_URL = 'http://shoutkey.com/{}'
WORDS_LOC = os.path.expanduser('~/Desktop/common-words.txt')
# Anchored at '~' like WORDS_LOC so expanduser() resolves it for whoever runs
# the script instead of pointing at one hard-coded home directory.
KNOWN_WORDS_LOC = os.path.expanduser('~/Desktop/known-words.txt')
FAIL_STR = 'is not an active ShoutKey!'  # a response containing this is a miss

# Numeric constants
MIN_KEY_LEN = 3
MAX_KEY_LEN = 12
UPDATE_INTERVAL = 100  # print a progress banner every this many keys
TIMEOUT = 1  # seconds
FIRST_WORDS_IDX = 0  # skip initial words in words list
MAX_RES_LEN = 1 * 10**6  # longer responses count as hits without a FAIL_STR search
VERBOSE = 1  # { 0, 1, 2 }

include_known_words = False  # include words we already know shoutkey uses
write_known = False  # write newly discovered shoutkey words to file
# Known words file exists?
# Discoveries are only appended when the file is already present; when it is
# missing (or disabled with None) there is nothing to pre-load either.
# The file is deliberately not opened here: every later use opens it in its
# own `with` block, so holding a handle would only leak it.
if KNOWN_WORDS_LOC is not None and os.path.isfile(KNOWN_WORDS_LOC):
    write_known = True
else:
    include_known_words = False
# Get words file
# Read the whitespace-separated candidate list, closing the file as soon as
# it has been consumed.
with open(WORDS_LOC) as words_file:
    words = words_file.read().split()

# Keep only candidates whose length is within the configured key bounds,
# lower-cased.
words = [
    val.lower() for val in words
    if MIN_KEY_LEN <= len(val) <= MAX_KEY_LEN]
words = words[FIRST_WORDS_IDX:]  # chop off beginning words

# Optionally try previously confirmed keys first (de-duplicated and sorted).
if include_known_words:
    with open(KNOWN_WORDS_LOC, 'r') as known:
        known_words = sorted(set(known.read().split()))
    words = known_words + words

if VERBOSE >= 1:
    print('KEYS TO CONSIDER:', len(words))
    print('FIRST WORDS:', words[:10])
# Browse shoutkey words
# Request BASE_URL/<key> for every candidate and report the keys whose
# response does not look like the "not an active ShoutKey" page.
count = FIRST_WORDS_IDX  # keeps `count` defined even when `words` is empty
# enumerate() advances the progress counter for every key, including the
# ones skipped with `continue` below.
for count, key in enumerate(words, start=FIRST_WORDS_IDX):
    # Handle verbosity
    if VERBOSE >= 1:
        if count % UPDATE_INTERVAL == 0:
            print('-----{}-----'.format(count))
        if VERBOSE >= 2:
            print(key)

    # Get the url's response
    url = BASE_URL.format(key)
    try:
        res = requests.get(url, timeout=TIMEOUT, stream=True)
    except requests.RequestException:
        # Only request failures are skipped; KeyboardInterrupt and
        # SystemExit propagate so Ctrl+C can stop the script.
        continue

    try:
        # Continue early if the res isn't html (a missing header counts
        # as "not html" rather than raising KeyError)
        if 'text/html' not in res.headers.get('content-type', ''):
            continue
        # With stream=True the body is downloaded here, so this access can
        # fail with a request error as well.
        res_str = res.text
    except requests.RequestException:
        continue
    finally:
        # Streamed responses hold their connection until closed.
        res.close()

    # An oversized body is counted as a hit without searching it; otherwise
    # the key is a hit when the failure marker is absent.
    success = len(res_str) > MAX_RES_LEN or FAIL_STR not in res_str

    # Found a link!
    if success:
        # Write the word to file
        if write_known:
            with open(KNOWN_WORDS_LOC, 'a') as known:
                known.write(key + '\n')
            if VERBOSE >= 1:
                print(url, '(key written to file)')
            else:
                print(url)
        # Just print to console
        else:
            print(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment