import sys, json, time, re
import urllib.request, urllib.parse
import http.cookiejar

# limit on the number of posts to attempt to inspect each pass
POST_LIMIT = 100
# wait 5 minutes between checking
DELAY_SECONDS = 5 * 60
# API docs request no more than 1 request per 2 seconds
REDDIT_DELAY = 2
# don't link to a post older than 3 days
AGE_RANGE = 3 * 24 * 60 * 60
# formatting for a time object
TIME_FORMAT = "%a, %d %b %Y %I:%M:%S %p %Z"

from credentials import username, password
# allow the credentials to be overridden on the command line
if len(sys.argv) > 2: prog, username, password = sys.argv[:3]

LOGIN_DATA = {'user': username, 'passwd': password, 'api_type': 'json'}
SEARCH_URL = 'http://www.reddit.com/search.json'
# note: the reddit API guidelines ask clients to send a descriptive User-Agent
HTTP_HEADERS = {}

def compact_permalink(data):
    fmt = '/r/{subreddit:s}/comments/{id:s}'
    return fmt.format(**data)
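# Example (hypothetical values):
#   compact_permalink({'subreddit': 'pics', 'id': '2a3b4c'}) -> '/r/pics/comments/2a3b4c'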

def edit_distance(a, b):
    """Restricted Damerau-Levenshtein (optimal string alignment) distance."""
    m, n = len(a), len(b)
    # each row keeps its j=0 boundary cell in the final (index -1) slot
    prev, curr = None, list(range(1, n + 1)) + [0]
    for x in range(m):
        z, prev, curr = prev, curr, [0] * n + [x + 1]
        for y in range(n):
            delcost, addcost = prev[y] + 1, curr[y - 1] + 1
            subcost = prev[y - 1] + (a[x] != b[y])
            curr[y] = min(delcost, addcost, subcost)
            # adjacent transposition (needs the row from two iterations back)
            if (x > 0 and y > 0 and a[x] == b[y - 1] and a[x - 1] == b[y] and a[x] != b[y]):
                curr[y] = min(curr[y], z[y - 2] + 1)
    return curr[n - 1]
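# Sanity checks for the recurrence above:
#   edit_distance('kitten', 'sitting') == 3
#   edit_distance('abcd', 'abdc') == 1   # one adjacent transposition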

# this is just to help clean up the logic: normalize all bracket styles to []
replace_dict = {
    '(': '[', '{': '[', '[[': '[',
    ')': ']', '}': ']', ']]': ']',
}

def cleanup_tags(orig_title):
    title = orig_title
    while True:
        fnd = False
        for f, r in replace_dict.items():
            if f not in title: continue
            title, fnd = title.replace(f, r), True
        if not fnd: break
    return title.strip()
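# Example: cleanup_tags('Broken lamp {fixed}') -> 'Broken lamp [fixed]'
# (it loops until no variant remains, so '[[tag]]' also collapses to '[tag]')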

def clean(t): return cleanup_tags(t).strip()

keep_alphanum = re.compile(r"[^a-z0-9]", re.I)
r_whitespace = re.compile(r"\s")

def title_compare(a, b):
    cleaner = lambda x: r_whitespace.sub('', clean(x))
    return edit_distance(cleaner(a), cleaner(b))
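# Example: title_compare('Broken lamp [FIXED]', 'Broken  lamp') compares
# 'Brokenlamp[FIXED]' with 'Brokenlamp' -> 7 (the leftover tag characters)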

def search(data):
    query = data['title_clean']  # '%s reddit:%s' % (data['title_clean'], data['subreddit'])
    paramdict = {'q': query, 'sort': 'new', 'limit': 100}
    url = '{url:s}?{params:s}'.format(url=SEARCH_URL, params=urllib.parse.urlencode(paramdict))
    fixed_id, fixed_ctime = data['id'], data['created']
    req = urllib.request.Request(url, None, HTTP_HEADERS)
    with urllib.request.urlopen(req) as f:
        searchjson = json.loads(f.read().decode('utf8'))
    # keep only candidates older than the [fixed] post, but not too old
    related_links = [c['data'] for c in searchjson['data']['children']]
    related_links = [c for c in related_links if c['id'] != fixed_id and
                     fixed_ctime > c['created'] and fixed_ctime - c['created'] < AGE_RANGE]
    if not related_links: return None
    title = data['title_clean']
    original, min_edist = None, 10000
    for link in related_links:
        # m(x) returns True if property x differs between the [fixed] post and this link
        m = lambda x: link[x] != data[x]
        diffsubr, diffnsfw = 5 * m('subreddit_id'), 2 * m('over_18')
        edist = title_compare(link['title'], title) + diffsubr + diffnsfw
        if edist < min_edist: original, min_edist = link, edist
    return original

TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', 31)]
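# Each pair is (unit label, how many of it make the next unit); the cascade in
# build_comment walks this list, e.g. 90000 s -> 1500 m -> 25 h -> '1d'.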

def build_comment(fixdsubm, origsubm):
    notes, flags = [], []
    origlink = 'http://www.reddit.com' + compact_permalink(origsubm)
    ups, downs = origsubm['ups'], origsubm['downs']
    time.sleep(REDDIT_DELAY)
    url = 'http://www.reddit.com/r/{subreddit:s}/about.json'.format(**origsubm)
    with urllib.request.urlopen(url) as f:
        aboutjson = json.loads(f.read().decode('utf8'))['data']
    ftitle = fixdsubm['title_clean']
    otitle = origsubm['title']
    edist = title_compare(ftitle.lower(), otitle.lower())
    if origsubm['subreddit_id'] != fixdsubm['subreddit_id']:
        notes.append('*Link not posted to same subreddit: [{origsubr:s}](/r/{origsubr:s}) -> [{newsubr:s}](/r/{newsubr:s})*'.format(
            origsubr=origsubm['subreddit'], newsubr=fixdsubm['subreddit']))
        edist *= 1.5  # this lowers our confidence somewhat
    if origsubm['name'] != fixdsubm['name'] and origsubm['author'] == fixdsubm['author']:
        notes.append('*Fixed by original author!*')
        edist *= 0.2  # this actually raises our confidence greatly
    if 'true' in ftitle.lower():
        notes.append('*Based on a true story! `[citation needed]`*')
    if origsubm['over_18']: flags.append('NSFW')
    # if the original entry is liked and popular, tag it with a popular tag!
    vote_threshold = aboutjson['subscribers'] * 0.0005  # 0.05%
    # check the vote count first so an unvoted post can't divide by zero
    if ups + downs > vote_threshold and ups / (ups + downs) >= 0.75:
        flags.append('Popular')
    flen, olen = len(ftitle), len(otitle)
    conf = 100 - int(100 * edist / max(1, min(olen, flen)))
    if conf < 0: conf = 0
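    # e.g. an edit distance of 3 against a 30-character title gives
    # conf = 100 - int(100 * 3 / 30) = 90, which still rates as 'Strong' below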
    time_diff = int(fixdsubm['created'] - origsubm['created'])
    # cascade the raw seconds up to the largest sensible unit
    for unit, conv in TIME_UNITS:
        if time_diff < conv: break
        time_diff = int(time_diff / conv)
    confdesc = 'Strong'
    if conf < 90: confdesc = 'Moderate'
    if conf < 75: confdesc = 'Slight'
    if conf < 50: return None
    notifymsg = '**Incorrect match:** [{id:s}]({permalink:s}) *by {author:s}*'
    orig_is_self = bool(origsubm['selftext'] != '')
    message_data = {
        'botuser': username,
        'origtime': time.strftime(TIME_FORMAT, time.localtime(origsubm['created_utc'])),
        'direct': '' if orig_is_self else '[[Direct Link]({url:s})]'.format(**origsubm),
        'difftime': '{:d}{:s}'.format(time_diff, unit),
        'confdesc': confdesc,
        'confidence': conf,
        'flags': '**[{:s}]**'.format(','.join(flags)) if flags else '',
        'notes': '\n'.join(notes),
        'cparams': urllib.parse.urlencode({
            'subject': 'Error: ' + fixdsubm['id'],
            'message': notifymsg.format(**fixdsubm),
            'to': username,
        }),
        'notifyhelp': 'Please provide as much information as possible to help improve these results.',
    }
    # add in keywords from the original submission
    for k, v in origsubm.items():
        message_data[k] = v
    message = """
**Original Submission:** [{title:s}]({permalink:s} "Posted by {author:s} to /r/{subreddit:s} on {origtime:s}") {flags:s}
**Posted:** {difftime:s} before this post *by [{author:s}](/user/{author:s})*
**{confdesc:s} Confidence:** {confidence:d}% certain that these submissions are related.
{notes:s}
---
*Providing source for [FIXED] posts, linking to the original in comments*
*Confidence based on factors including title similarity and subreddit id.*
*This comment generated by an automated bot.* **[Is this match wrong?](/message/compose/?{cparams:s} "{notifyhelp:s}")**"""
    return '\n\n'.join(x for x in message.strip().splitlines() if x).format(**message_data)

ratelimit_rgx = re.compile(r'you are doing that too much. try again in (\d+) (\w+).', re.I)
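# Example match: 'you are doing that too much. try again in 7 minutes.'
# -> group(1) = '7', group(2) = 'minutes', converted to 420 seconds below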

def post_comment(opener, mhash, fixdsubm, origsubm):
    fixdsubm['permalink'] = compact_permalink(fixdsubm)
    origsubm['permalink'] = compact_permalink(origsubm)
    POST_DATA = {
        'r': fixdsubm['subreddit'],
        'text': build_comment(fixdsubm, origsubm),
        'thing_id': fixdsubm['name'],
        'uh': mhash,
    }
    # lack the confidence to associate these posts
    if POST_DATA['text'] is None: return -1
    time.sleep(REDDIT_DELAY)
    post_params = urllib.parse.urlencode(POST_DATA).encode('utf8')
    # use the opener that was passed in (it carries the login cookies)
    with opener.open('http://www.reddit.com/api/comment', post_params) as f:
        json_data = f.read().decode('utf8')
    match = ratelimit_rgx.search(json_data)
    if match is not None:
        unit = 60 if match.group(2).startswith('min') else 1
        return unit * int(match.group(1))
    return 0

def original_posted(fixedsub, origsubm):
    time.sleep(REDDIT_DELAY)
    comments_url = 'http://www.reddit.com' + compact_permalink(fixedsub) + '.json'
    with urllib.request.urlopen(comments_url, None) as f:
        comment_data = json.loads(f.read().decode('utf8'))
    fposter = comment_data[0]['data']['children'][0]['data']['author']
    origid = origsubm['id']
    for comment in comment_data[1]['data']['children']:
        # 'more' stubs in the listing carry no author/body, so use .get()
        cauth = comment['data'].get('author', '')
        ctext = comment['data'].get('body', '')
        if cauth in (username, fposter) and ('orig' in ctext.lower() or origid in ctext):
            return cauth
    return None

class SmartRedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result
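# (Follows the redirect as usual but stashes the original 301/302 code on the
#  response, presumably so a caller could tell a login bounce from a plain 200.)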

try:
    cj = http.cookiejar.CookieJar()
    cj_opener = urllib.request.build_opener(SmartRedirectHandler(),
                                            urllib.request.HTTPCookieProcessor(cj))
    login_params = urllib.parse.urlencode(LOGIN_DATA).encode('utf8')
    with cj_opener.open('http://www.reddit.com/api/login/' + username, login_params) as f:
        login_response = json.loads(f.read().decode('utf8'))
    modhash = login_response['json']['data']['modhash']
    print('modhash: ' + modhash)
except Exception as e:
    print('Login failed...')
    raise

if __name__ == '__main__':
    last_check = None
    ratelimit = 0  # so the sleep below works even if the first pass raises early
    time.sleep(REDDIT_DELAY)
    while True:
        try:
            get_fixed = urllib.parse.urlencode({'q': 'fixed', 'sort': 'new', 'limit': POST_LIMIT})
            req = urllib.request.Request('{:s}?{:s}'.format(SEARCH_URL, get_fixed), None, HTTP_HEADERS)
            with urllib.request.urlopen(req) as f:
                fixed = json.loads(f.read().decode('utf8'))
            submissions = [c['data'] for c in fixed['data']['children']]
            # no ratelimit set
            ratelimit = 0
            # check all submissions, stop when we get to where we left off
            for fixdsubm in submissions:
                if fixdsubm['id'] == last_check: break
                time.sleep(REDDIT_DELAY)
                # strip the [...fixed...] tag from the title to recover the original title
                clean_title = cleanup_tags(fixdsubm['title'])
                fixed_ndx = clean_title.lower().rfind('fixed')
                if fixed_ndx == -1: continue
                bracketL = clean_title.rfind('[', 0, fixed_ndx)
                bracketR = clean_title.find(']', fixed_ndx)
                if -1 in (bracketL, bracketR): continue
                fixdsubm['title_clean'] = (clean_title[:bracketL] + clean_title[bracketR + 1:]).strip()
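                # e.g. 'Broken lamp [fixed]' -> '[' at 12, ']' at 18 -> 'Broken lamp'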
                original = search(fixdsubm)
                if original is None: continue
                # skip if someone (or this bot) already pointed out the original
                oposter = original_posted(fixdsubm, original)
                if oposter == username: break
                if oposter is not None: continue
                # let's post this thing!
                ratelimit = post_comment(cj_opener, modhash, fixdsubm, original)
                if ratelimit > 0:
                    print('RATELIMITED: %d seconds...' % ratelimit)
                    break
                if ratelimit == 0: print('Posting info for [%s]' % compact_permalink(fixdsubm))
            # save the first entry of this set so that we can stop at this point next time
            if submissions: last_check = submissions[0]['id']
        except Exception as e:
            print('>>> ERROR: {:s} occurred...'.format(e.__class__.__name__))
        # sleep for no less than the requested delay between passes, longer if ratelimited
        sleeptime = max(ratelimit, REDDIT_DELAY, DELAY_SECONDS)
        time.sleep(sleeptime)