gistfile1.py
import sys, json, time, re
import urllib.request, urllib.parse
import http.cookiejar
 
# limit on the number of posts to inspect in each pass
POST_LIMIT = 100

# wait 5 minutes between passes
DELAY_SECONDS = 5 * 60
 
# API docs request no more than 1 request per 2 seconds
REDDIT_DELAY = 2
 
# don't link to a post older than 3 days
AGE_RANGE = 3 * 24 * 60 * 60
 
# formatting for a time object
TIME_FORMAT = "%a, %d %b %Y %I:%M:%S %p %Z"
 
from credentials import username, password
if len(sys.argv) > 2: prog, username, password = sys.argv
LOGIN_DATA = {'user' : username, 'passwd' : password, 'api_type': 'json'}
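
# Note: credentials.py is assumed to define two module-level strings, `username` and
# `password`; they can also be supplied as command-line arguments (username, then password).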
 
SEARCH_URL = 'http://www.reddit.com/search.json'
HTTP_HEADERS = {}
 
def compact_permalink(data):
    fmt = '/r/{subreddit:s}/comments/{id:s}'
    return fmt.format(**data)
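
# For example (illustrative values, not live data): a submission in /r/pics with id 'abc123'
# produces the compact permalink '/r/pics/comments/abc123'.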
 
def edit_distance(a, b):
    """Damerau-Levenshtein distance"""
    m, n = len(a), len(b)
    prev, curr = None, list(range(1, n + 1)) + [0]
    for x in range(m):
        z, prev, curr = prev, curr, [0] * n + [x + 1]
        for y in range(n):
            delcost, addcost = prev[y] + 1, curr[y - 1] + 1
            subcost = prev[y - 1] + (a[x] != b[y])
            curr[y] = min(delcost, addcost, subcost)
            if x > 0 and y > 0 and a[x] == b[y - 1] and a[x - 1] == b[y] and a[x] != b[y]:
                curr[y] = min(curr[y], z[y - 2] + 1)
    return curr[n - 1]
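
# Quick sanity checks (illustrative, not part of the bot's flow); uncomment to verify locally:
# assert edit_distance('kitten', 'sitting') == 3   # plain insert/delete/substitute case
# assert edit_distance('abcd', 'abdc') == 1        # an adjacent transposition counts as one edit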
 
# this is just to help clean up the logic
replace_dict = {
    '(': '[', '{': '[', '[[': '[',
    ')': ']', '}': ']', ']]': ']',
}

def cleanup_tags(orig_title):
    title = orig_title
    while True:
        fnd = False
        for f, r in replace_dict.items():
            if f not in title: continue
            title, fnd = title.replace(f, r), True
        if not fnd: break
    return title.strip()

def clean(t): return cleanup_tags(t).strip()

keep_alphanum = re.compile("[^a-z0-9]", re.I)  # note: not used elsewhere in this script
r_whitespace = re.compile(r"[\s]", re.I)
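
# Illustrative example with a made-up title: cleanup_tags('Broken sign (FIXED)') returns
# 'Broken sign [FIXED]'; title_compare below additionally strips whitespace before measuring
# the edit distance, so bracket style and spacing differences do not inflate the score.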
 
def title_compare(a, b):
    cleaner = lambda x: r_whitespace.sub('', clean(x))
    return edit_distance(cleaner(a), cleaner(b))
def search(data):
    query = data['title_clean']  # '%s reddit:%s' % (data['title_clean'], data['subreddit'])
    paramdict = {'q': query, 'sort': 'new', 'limit': 100}
    url = '{url:s}?{params:s}'.format(url=SEARCH_URL, params=urllib.parse.urlencode(paramdict))
    fixed_id, fixed_ctime = data['id'], data['created']
    req = urllib.request.Request(url, None, HTTP_HEADERS)
    with urllib.request.urlopen(req) as f:
        searchjson = json.loads(f.read().decode('ascii'))
    related_links = [c['data'] for c in searchjson['data']['children']]
    related_links = [c for c in related_links if c['id'] != fixed_id and
                     fixed_ctime > c['created'] and fixed_ctime - c['created'] < AGE_RANGE]

    if not related_links: return None

    title = data['title_clean']
    original, min_edist = None, 10000
    for link in related_links:
        # m(x) returns True when property x differs between the [FIXED] post and this candidate
        m = lambda x: link[x] != data[x]

        diffsubr, diffnsfw = 5 * m('subreddit_id'), 2 * m('over_18')
        edist = title_compare(link['title'], title) + diffsubr + diffnsfw
        if edist < min_edist: original, min_edist = link, edist
    return original

TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', 31)]
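
# TIME_UNITS drives the human-readable age string in build_comment: as illustrative arithmetic,
# a gap of 7500 seconds divides down through seconds and minutes and is reported as '2h'.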

def build_comment(fixdsubm, origsubm):
    notes, flags = [], []
    origlink = 'http://www.reddit.com' + compact_permalink(origsubm)
    ups, downs = origsubm['ups'], origsubm['downs']

    time.sleep(REDDIT_DELAY)
    url = 'http://www.reddit.com/r/{subreddit:s}/about.json'.format(**origsubm)
    with urllib.request.urlopen(url) as f:
        aboutjson = json.loads(f.read().decode('ascii'))['data']

    ftitle = fixdsubm['title_clean']
    otitle = origsubm['title']
    edist = title_compare(ftitle.lower(), otitle.lower())
    if origsubm['subreddit_id'] != fixdsubm['subreddit_id']:
        notes.append('*Link not posted to same subreddit: [{origsubr:s}](/r/{origsubr:s}) -> [{newsubr:s}](/r/{newsubr:s})*'.format(
            origsubr=origsubm['subreddit'], newsubr=fixdsubm['subreddit']))
        edist *= 1.5  # this lowers our confidence somewhat
    if origsubm['name'] != fixdsubm['name'] and origsubm['author'] == fixdsubm['author']:
        notes.append('*Fixed by original author!*')
        edist *= 0.2  # this actually raises our confidence greatly

    if 'true' in ftitle.lower():
        notes.append('*Based on a true story! `[citation needed]`*')

    if origsubm['over_18']: flags.append('NSFW')

    # if the original entry is liked and popular, tag it with a popular tag!
    vote_threshold = aboutjson['subscribers'] * 0.0005  # 0.05%
    if ups + downs > vote_threshold and ups / (ups + downs) >= 0.75:
        flags.append('Popular')

    flen, olen = len(ftitle), len(otitle)
    conf = 100 - int(100 * edist / min(olen, flen))
    if conf < 0: conf = 0
    time_diff = int(fixdsubm['created'] - origsubm['created'])
    for unit, conv in TIME_UNITS:
        if time_diff < conv: break
        time_diff = int(time_diff / conv)

    confdesc = 'Strong'
    if conf < 90: confdesc = 'Moderate'
    if conf < 75: confdesc = 'Slight'
    if conf < 50: return None

    notifymsg = '**Incorrect match:** [{id:s}]({permalink:s}) *by {author:s}*'
    orig_is_self = origsubm['selftext'] != ''

    message_data = {
        'botuser': username,
        'origtime': time.strftime(TIME_FORMAT, time.localtime(origsubm['created_utc'])),
        'direct': '' if orig_is_self else '[[Direct Link]({url:s})]'.format(**origsubm),
        'difftime': '{:d}{:s}'.format(time_diff, unit),
        'confdesc': confdesc,
        'confidence': conf,

        'flags': '**[{:s}]**'.format(','.join(flags)) if len(flags) else '',
        'notes': '\n'.join(notes),

        'cparams': urllib.parse.urlencode({
            'subject': 'Error: ' + fixdsubm['id'],
            'message': notifymsg.format(**fixdsubm),
            'to': username,
        }),
        'notifyhelp': 'Please provide as much information as possible to help improve these results.',
    }
    # add in keywords from the original submission
    for k, v in origsubm.items():
        message_data[k] = v

    message = """
**Original Submission:** [{title:s}]({permalink:s} "Posted by {author:s} to /r/{subreddit:s} on {origtime:s}") {flags:s}
**Posted:** {difftime:s} before this post *by [{author:s}](/user/{author:s})*
**{confdesc:s} Confidence:** {confidence:d}% certain that these submissions are related.
{notes:s}
---
*Providing source for [FIXED] posts, linking to the original in comments*
*Confidence based on factors including title similarity and subreddit id.*
*This comment generated by an automated bot.* **[Is this match wrong?](/message/compose/?{cparams:s} "{notifyhelp:s}")**"""
    post_time = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(origsubm['created_utc']))
    return '\n\n'.join(x for x in message.strip().replace('\t', '').splitlines() if x).format(**message_data)
ratelimit_rgx = re.compile(r'you are doing that too much. try again in (\d+) (\w+).', re.I)
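# ratelimit_rgx is meant to catch reddit's throttling response, which reads roughly
# "you are doing that too much. try again in 9 minutes." (illustrative wording).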

def post_comment(opener, mhash, fixdsubm, origsubm):
    """Post the informational comment. Returns 0 on success, -1 when confidence is too
    low to comment, or the number of seconds to wait when reddit ratelimits us."""
    fixdsubm['permalink'] = compact_permalink(fixdsubm)
    origsubm['permalink'] = compact_permalink(origsubm)
    POST_DATA = {
        'r': fixdsubm['subreddit'],
        'text': build_comment(fixdsubm, origsubm),
        'thing_id': fixdsubm['name'],
        'uh': mhash,
    }

    # we lack the confidence to associate these posts
    if POST_DATA['text'] is None: return -1
    time.sleep(REDDIT_DELAY)

    post_params = urllib.parse.urlencode(POST_DATA).encode('utf8')
    with opener.open('http://www.reddit.com/api/comment', post_params) as f:
        json_data = f.read().decode('ascii')
    match = ratelimit_rgx.search(json_data)
    if match is not None:
        unit = 60 if match.group(2).startswith('min') else 1
        return unit * int(match.group(1))
    return 0
 
def original_posted(fixedsub, origsubm):
    """Return the author of an existing comment that already links the original, or None."""
    time.sleep(REDDIT_DELAY)
    comments_url = 'http://www.reddit.com' + compact_permalink(fixedsub) + '.json'
    with urllib.request.urlopen(comments_url, None) as f:
        comment_data = json.loads(f.read().decode('ascii'))
    fposter = comment_data[0]['data']['children'][0]['data']['author']
    origid = origsubm['id']
    for comment in comment_data[1]['data']['children']:
        cauth, ctext = comment['data']['author'], comment['data']['body']
        if cauth in (username, fposter) and ('orig' in ctext.lower() or origid in ctext):
            return cauth
    return None
 
class SmartRedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result
 
try:
    cj = http.cookiejar.CookieJar()
    cj_opener = urllib.request.build_opener(SmartRedirectHandler(),
                                            urllib.request.HTTPCookieProcessor(cj))
    login_params = urllib.parse.urlencode(LOGIN_DATA).encode('utf8')
    with cj_opener.open('http://www.reddit.com/api/login/' + username, login_params) as f:
        login_response = json.loads(f.read().decode('ascii'))
    modhash = login_response['json']['data']['modhash']
    print('modhash: ' + modhash)
except Exception as e:
    print('Login failed...')
    raise
 
if __name__ == '__main__':
    last_check = None
    time.sleep(REDDIT_DELAY)
    while True:
        # no ratelimit set yet (also keeps the final sleep well-defined if this pass fails early)
        ratelimit = 0
        try:
            get_fixed = urllib.parse.urlencode({'q': 'fixed', 'sort': 'new', 'limit': POST_LIMIT})
            req = urllib.request.Request('{:s}?{:s}'.format(SEARCH_URL, get_fixed), None, HTTP_HEADERS)
            with urllib.request.urlopen(req) as f:
                fixed = json.loads(f.read().decode('ascii'))
            submissions = [c['data'] for c in fixed['data']['children']]
            # check all submissions, stop when we get to where we left off
            for fixdsubm in submissions:
                if fixdsubm['id'] == last_check: break
                time.sleep(REDDIT_DELAY)
                # data needed to search for the original: the title with its [FIXED] tag removed
                clean_title = cleanup_tags(fixdsubm['title'])
                fixed_ndx = clean_title.lower().rfind('fixed')
                if fixed_ndx == -1: continue
                bracketL = clean_title.rfind('[', 0, fixed_ndx)
                bracketR = clean_title.find(']', fixed_ndx)
                if -1 in (bracketL, bracketR): continue
                fixdsubm['title_clean'] = (clean_title[:bracketL] + clean_title[bracketR + 1:]).strip()
                original = search(fixdsubm)
                if original is None: continue
                oposter = original_posted(fixdsubm, original)
                if oposter == username: break
                if oposter is not None: continue
                # let's post this thing!
                ratelimit = post_comment(cj_opener, modhash, fixdsubm, original)
                if ratelimit > 0:
                    print('RATELIMITED: %d seconds...' % ratelimit)
                    break
                if ratelimit == 0: print('Posting info for [%s]' % compact_permalink(fixdsubm))
            # save the first entry of this set so that we can stop at this point next time
            if len(submissions): last_check = submissions[0]['id']
        except Exception as e:
            print('>>> ERROR: {:s} occurred...'.format(e.__class__.__name__))
        # sleep for at least the configured delay between passes, longer if reddit asked us to back off
        sleeptime = max(ratelimit, REDDIT_DELAY, DELAY_SECONDS)
        time.sleep(sleeptime)
