Skip to content

Instantly share code, notes, and snippets.

Created July 11, 2011 07:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/d1f72a0d1f9adfe8e60e to your computer and use it in GitHub Desktop.
Save anonymous/d1f72a0d1f9adfe8e60e to your computer and use it in GitHub Desktop.
import sys, json, time, re
import urllib.request, urllib.parse
import http.cookiejar
# limit on the number of posts to attempt to inspect each pass
POST_LIMIT = 100
# wait 5 minutes between checking
DELAY_SECONDS = 5 * 60
# API docs request no more than 1 request per 2 seconds
REDDIT_DELAY = 2
# don't link to a post older than 3 days
AGE_RANGE = 3 * 24 * 60 * 60
# formatting for a time object
TIME_FORMAT = "%a, %d %b %Y %I:%M:%S %p %Z"

# default account details come from a local `credentials` module
from credentials import username, password

# Optional command-line override: <script> USERNAME PASSWORD.
# Slice instead of unpacking all of sys.argv so that extra trailing
# arguments do not raise "too many values to unpack".
if len(sys.argv) > 2:
    prog, username, password = sys.argv[:3]

# form fields posted to the login endpoint
LOGIN_DATA = {'user': username, 'passwd': password, 'api_type': 'json'}
SEARCH_URL = 'http://www.reddit.com/search.json'
# extra headers attached to the search requests (none by default)
HTTP_HEADERS = {}
def compact_permalink(data):
    """Return the short site-relative comments path for a submission.

    `data` is unpacked as keyword arguments, so it must be a mapping with
    string keys that includes string values for 'subreddit' and 'id';
    any other keys are ignored.
    """
    return '/r/{subreddit:s}/comments/{id:s}'.format(**data)
def edit_distance(a, b):
    """Damerau-Levenshtein distance between two sequences.

    Counts insertions, deletions, substitutions and swaps of two adjacent
    items, each as one edit. Only three rows of the table are kept.

    Every row has one extra slot at the end. Indexing a row with -1 wraps
    around to that slot, which holds the cost for the empty prefix of `b`.
    """
    width = len(b)
    row_above = None
    row = list(range(1, width + 1)) + [0]
    for i, item_a in enumerate(a):
        # shift the window down: the old "above" row becomes "two above"
        row_two_above, row_above = row_above, row
        row = [0] * width + [i + 1]
        for j, item_b in enumerate(b):
            differs = item_a != item_b
            best = min(
                row_above[j] + 1,            # delete from a
                row[j - 1] + 1,              # insert into a
                row_above[j - 1] + differs,  # substitute (free when equal)
            )
            # adjacent transposition: a[i-1], a[i] matches b[j], b[j-1]
            if (differs and i > 0 and j > 0
                    and item_a == b[j - 1] and a[i - 1] == item_b):
                best = min(best, row_two_above[j - 2] + 1)
            row[j] = best
    return row[width - 1]
# Bracket normalisation table: every opening bracket style becomes '[',
# every closing style becomes ']', and doubled square brackets collapse.
replace_dict = {
    '(': '[',
    '{': '[',
    '[[': '[',
    ')': ']',
    '}': ']',
    ']]': ']',
}


def cleanup_tags(orig_title):
    """Normalise the brackets in a title and trim surrounding whitespace.

    The replacements in `replace_dict` are applied repeatedly until a full
    sweep over the table changes nothing.
    """
    title = orig_title
    changed = True
    while changed:
        changed = False
        for needle, bracket in replace_dict.items():
            if needle in title:
                title = title.replace(needle, bracket)
                changed = True
    return title.strip()


def clean(t):
    """Return `t` with brackets normalised and outer whitespace removed."""
    return cleanup_tags(t).strip()
# matches any character that is not an ASCII letter or digit
# (not referenced by title_compare itself)
keep_alphanum = re.compile(r"[^a-z0-9]", re.I)
# matches a single whitespace character; written as a raw string so the
# backslash reaches the regex engine instead of being an invalid str escape
r_whitespace = re.compile(r"[\s]", re.I)


def title_compare(a, b):
    """Return the edit distance between two titles, ignoring whitespace.

    Both titles are passed through `clean` (bracket normalisation) and
    have all whitespace removed before being compared with
    `edit_distance`.
    """
    def squash(title):
        return r_whitespace.sub('', clean(title))

    return edit_distance(squash(a), squash(b))
def search(data):
    """Search for the submission most likely to be the original of `data`.

    `data` is the submission dict of the "fixed" post; it must carry
    'title_clean', 'id', 'created', 'subreddit_id' and 'over_18'.

    The cleaned title is used as the search query. A result is a candidate
    only if it is a different submission, was created before the fixed
    post, and is less than AGE_RANGE seconds older than it. Candidates are
    scored by title edit distance, plus 5 if the subreddit differs and
    plus 2 if the NSFW flag differs. The lowest score wins, and the
    earliest result wins a tie.

    Returns the winning submission dict, or None if there is no candidate.
    """
    title = data['title_clean']
    params = urllib.parse.urlencode({'q': title, 'sort': 'new', 'limit': 100})
    url = '{url:s}?{params:s}'.format(url=SEARCH_URL, params=params)
    fixed_id, fixed_ctime = data['id'], data['created']

    req = urllib.request.Request(url, None, HTTP_HEADERS)
    with urllib.request.urlopen(req) as f:
        # UTF-8 is a superset of ASCII, so a non-ASCII byte in the
        # response no longer raises UnicodeDecodeError
        searchjson = json.loads(f.read().decode('utf-8'))

    related_links = [
        link
        for link in (child['data'] for child in searchjson['data']['children'])
        if link['id'] != fixed_id
        and fixed_ctime > link['created']
        and fixed_ctime - link['created'] < AGE_RANGE
    ]
    if not related_links:
        return None

    original, min_edist = None, 10000
    for link in related_links:
        penalty = 0
        if link['subreddit_id'] != data['subreddit_id']:
            penalty += 5
        if link['over_18'] != data['over_18']:
            penalty += 2
        edist = title_compare(link['title'], title) + penalty
        if edist < min_edist:
            original, min_edist = link, edist
    return original
# (suffix, how many of this unit make up one of the next larger unit)
TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', 31)]

# One entry per paragraph of the posted comment. The paragraphs are joined
# with blank lines and the placeholders are filled in by build_comment.
_COMMENT_TEMPLATE = '\n\n'.join((
    '**Original Submission:** [{title:s}]({permalink:s} "Posted by {author:s} to /r/{subreddit:s} on {origtime:s}") {flags:s}',
    '**Posted:** {difftime:s} before this post *by [{author:s}](/user/{author:s})*',
    '**{confdesc:s} Confidence:** {confidence:d}% certain that these submissions are related.',
    '{notes:s}',
    '---',
    '*Providing source for [FIXED] posts, linking to the original in comments*',
    '*Confidence based on factors including title similarity and subreddit id.*',
    '*This comment generated by an automated bot.* **[Is this match wrong?](/message/compose/?{cparams:s} "{notifyhelp:s}")**',
))


def _format_age(seconds):
    """Render a duration in seconds as a short string like '42s' or '3h'."""
    value = int(seconds)
    last = len(TIME_UNITS) - 1
    for pos, (unit, conv) in enumerate(TIME_UNITS):
        # Stop at the largest unit: dividing again would produce a number
        # of months that is still labelled with the day suffix.
        if value < conv or pos == last:
            break
        value //= conv
    return '{:d}{:s}'.format(value, unit)


def build_comment(fixdsubm, origsubm):
    """Build the markdown comment linking a fixed post to its original.

    `fixdsubm` is the "fixed" submission dict and must carry 'title_clean'
    and 'permalink' in addition to the API fields. `origsubm` is the
    suspected original; all of its keys are made available to the comment
    template, so its 'permalink' is the link that gets posted.

    Confidence is 100 minus the title edit distance as a percentage of
    the shorter title. The distance is scaled up by 1.5 when the
    subreddits differ and down to 0.2 when both posts share an author.

    Returns the comment text, or None when confidence is below 50% or one
    of the titles is empty. The subreddit's about.json is fetched (after
    sleeping REDDIT_DELAY seconds) only when a comment will be produced.
    """
    ftitle = fixdsubm['title_clean']
    otitle = origsubm['title']

    # an empty title gives nothing to compare and would divide by zero
    shortest = min(len(otitle), len(ftitle))
    if shortest == 0:
        return None

    notes, flags = [], []
    edist = title_compare(ftitle.lower(), otitle.lower())
    if origsubm['subreddit_id'] != fixdsubm['subreddit_id']:
        notes.append('*Link not posted to same subreddit: [{origsubr:s}](/r/{origsubr:s}) -> [{newsubr:s}](/r/{newsubr:s})*'.format(
            origsubr=origsubm['subreddit'], newsubr=fixdsubm['subreddit']))
        edist *= 1.5  # this lowers our confidence somewhat
    if origsubm['name'] != fixdsubm['name'] and origsubm['author'] == fixdsubm['author']:
        notes.append('*Fixed by original author!*')
        edist *= 0.2  # this actually raises our confidence greatly
    if 'true' in ftitle.lower():
        notes.append('*Based on a true story! `[citation needed]`*')

    conf = max(0, 100 - int(100 * edist / shortest))
    if conf < 50:
        # not confident enough to post; skip the extra API request too
        return None
    if conf >= 90:
        confdesc = 'Strong'
    elif conf >= 75:
        confdesc = 'Moderate'
    else:
        confdesc = 'Slight'

    time.sleep(REDDIT_DELAY)
    url = 'http://www.reddit.com/r/{subreddit:s}/about.json'.format(**origsubm)
    with urllib.request.urlopen(url) as f:
        aboutjson = json.loads(f.read().decode('utf-8'))['data']

    if origsubm['over_18']:
        flags.append('NSFW')
    # if the original entry is liked and popular, tag it with a popular tag!
    total_votes = origsubm['ups'] + origsubm['downs']
    vote_threshold = aboutjson['subscribers'] * 0.0005  # 0.05%
    # test the vote count first so a post with no votes cannot divide by zero
    if (total_votes > 0 and total_votes > vote_threshold
            and origsubm['ups'] / total_votes >= 0.75):
        flags.append('Popular')

    time_diff = int(fixdsubm['created'] - origsubm['created'])
    notifymsg = '**Incorrect match:** [{id:s}]({permalink:s}) *by {author:s}*'
    orig_is_self = origsubm['selftext'] != ''
    message_data = {
        'botuser': username,
        'origtime': time.strftime(TIME_FORMAT, time.localtime(origsubm['created_utc'])),
        'direct': '' if orig_is_self else '[[Direct Link]({url:s})]'.format(**origsubm),
        'difftime': _format_age(time_diff),
        'confdesc': confdesc,
        'confidence': conf,
        'flags': '**[{:s}]**'.format(','.join(flags)) if flags else '',
        'notes': '\n'.join(notes),
        'cparams': urllib.parse.urlencode({
            'subject': 'Error: ' + fixdsubm['id'],
            'message': notifymsg.format(**fixdsubm),
            'to': username,
        }),
        'notifyhelp': 'Please provide as much information as possible to help improve these results.',
    }
    # add in keywords from original submission (these win on a key clash)
    message_data.update(origsubm)
    return _COMMENT_TEMPLATE.format(**message_data)
# Matches the rate-limit notice in an API response and captures the amount
# and the unit word. Raw string so \d and \w are regex escapes, not str ones.
ratelimit_rgx = re.compile(r'you are doing that too much. try again in (\d+) (\w+).', re.I)


def post_comment(opener, mhash, fixdsubm, origsubm):
    """Post the "original submission" comment on the fixed post.

    `opener` is the logged-in URL opener used for the POST and `mhash` is
    the session modhash. Both submission dicts have their 'permalink' key
    overwritten with the compact form before the comment is built.

    Returns -1 if no comment was posted because confidence was too low,
    0 if the comment was submitted, or the number of seconds to wait when
    the response contains a rate-limit notice.
    """
    fixdsubm['permalink'] = compact_permalink(fixdsubm)
    origsubm['permalink'] = compact_permalink(origsubm)

    text = build_comment(fixdsubm, origsubm)
    # lack the confidence to associate these posts
    if text is None:
        return -1

    post_data = {
        'r': fixdsubm['subreddit'],
        'text': text,
        'thing_id': fixdsubm['name'],
        'uh': mhash,
    }
    time.sleep(REDDIT_DELAY)
    post_params = urllib.parse.urlencode(post_data).encode('utf8')
    # use the opener that was passed in rather than a module-level global
    with opener.open('http://www.reddit.com/api/comment', post_params) as f:
        json_data = f.read().decode('utf-8')

    match = ratelimit_rgx.search(json_data)
    if match is None:
        return 0
    # the regex is case-insensitive, so compare the unit word in lower case
    unit = 60 if match.group(2).lower().startswith('min') else 1
    return unit * int(match.group(1))
def original_posted(fixedsub, origsubm):
    """Check whether the original was already linked in the fixed post.

    Fetches the comment page of `fixedsub` (after sleeping REDDIT_DELAY
    seconds) and scans its top-level comments. A comment counts if it was
    written by the bot account or by the submitter of the fixed post and
    either contains 'orig' (case-insensitive) or the id of `origsubm`.

    Returns the author of the first such comment, or None if none exists.
    """
    time.sleep(REDDIT_DELAY)
    comments_url = 'http://www.reddit.com' + compact_permalink(fixedsub) + '.json'
    with urllib.request.urlopen(comments_url, None) as f:
        comment_data = json.loads(f.read().decode('utf-8'))

    # element 0 is the listing holding the submission, element 1 its comments
    fposter = comment_data[0]['data']['children'][0]['data']['author']
    origid = origsubm['id']
    for comment in comment_data[1]['data']['children']:
        cdata = comment['data']
        cauth, ctext = cdata.get('author'), cdata.get('body')
        # skip entries that are not real comments (no author or body),
        # instead of letting a KeyError abort the whole pass
        if cauth is None or ctext is None:
            continue
        if cauth in (username, fposter) and ('orig' in ctext.lower() or origid in ctext):
            return cauth
    return None
class SmartRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that records the redirect code on the response.

    After the base class has followed a 301 or 302 redirect, the response
    it returns gets a `status` attribute set to that redirect code.
    """

    @staticmethod
    def _tag_status(result, code):
        """Set `result.status = code`, unless there is no result."""
        # The base handler returns None when the redirect carries no
        # Location/URI header; there is nothing to tag in that case.
        if result is not None:
            result.status = code
        return result

    def http_error_301(self, req, fp, code, msg, headers):
        result = super().http_error_301(req, fp, code, msg, headers)
        return self._tag_status(result, code)

    def http_error_302(self, req, fp, code, msg, headers):
        result = super().http_error_302(req, fp, code, msg, headers)
        return self._tag_status(result, code)
# Log in once at start-up. The cookie jar keeps the session cookie for later
# requests made through cj_opener, and the modhash is sent with each comment.
try:
    cj = http.cookiejar.CookieJar()
    cj_opener = urllib.request.build_opener(
        SmartRedirectHandler(),
        urllib.request.HTTPCookieProcessor(cj))
    login_params = urllib.parse.urlencode(LOGIN_DATA).encode('utf8')
    with cj_opener.open('http://www.reddit.com/api/login/' + username, login_params) as f:
        # UTF-8 is a superset of ASCII, so non-ASCII bytes do not abort login
        login_response = json.loads(f.read().decode('utf-8'))
    modhash = login_response['json']['data']['modhash']
    print('modhash: ' + modhash)
except Exception:
    # report the failure, then let the original exception stop the script
    print('Login failed...')
    raise
def _strip_fixed_tag(clean_title):
    """Remove the bracketed tag around the last 'fixed' in a cleaned title.

    Returns the rest of the title with outer whitespace trimmed, or None
    when the title has no 'fixed', no '[' before it, or no ']' after it.
    """
    fixed_ndx = clean_title.lower().rfind('fixed')
    if fixed_ndx == -1:
        return None
    bracket_l = clean_title.rfind('[', 0, fixed_ndx)
    bracket_r = clean_title.find(']', fixed_ndx)
    # test the raw find() results; adding 1 first would hide the -1
    if -1 in (bracket_l, bracket_r):
        return None
    return (clean_title[:bracket_l] + clean_title[bracket_r + 1:]).strip()


if __name__ == '__main__':
    last_check = None
    time.sleep(REDDIT_DELAY)
    while True:
        # Reset on every pass so the sleep at the bottom never sees an
        # undefined value (first request failed) or a stale one.
        ratelimit = 0
        try:
            get_fixed = urllib.parse.urlencode({'q': 'fixed', 'sort': 'new', 'limit': POST_LIMIT})
            req = urllib.request.Request('{:s}?{:s}'.format(SEARCH_URL, get_fixed), None, HTTP_HEADERS)
            with urllib.request.urlopen(req) as f:
                fixed = json.loads(f.read().decode('utf-8'))
            submissions = [c['data'] for c in fixed['data']['children']]

            # check all submissions, stop when we get to where we left off
            for fixdsubm in submissions:
                if fixdsubm['id'] == last_check:
                    break
                time.sleep(REDDIT_DELAY)

                # data needed for information on the original
                title_clean = _strip_fixed_tag(cleanup_tags(fixdsubm['title']))
                # no [fixed] tag in the title, or nothing left to search for
                if not title_clean:
                    continue
                fixdsubm['title_clean'] = title_clean

                original = search(fixdsubm)
                if original is None:
                    continue

                oposter = original_posted(fixdsubm, original)
                # we already commented here, so everything older is handled
                if oposter == username:
                    break
                # the submitter already linked the original themselves
                if oposter is not None:
                    continue

                # let's post this thing!
                ratelimit = post_comment(cj_opener, modhash, fixdsubm, original)
                if ratelimit > 0:
                    print('RATELIMITED: %d seconds...' % ratelimit)
                    break
                if ratelimit == 0:
                    print('Posting info for [%s]' % compact_permalink(fixdsubm))

            # save the first entry of this set so that we can stop at this point next time
            if submissions:
                last_check = submissions[0]['id']
        except Exception as e:
            print('>>> ERROR: {:s} occurred...'.format(e.__class__.__name__))

        # sleep for no less than the requested delay, but for the amount of time between requests
        time.sleep(max(ratelimit, REDDIT_DELAY, DELAY_SECONDS))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment