import sys, json, time, re
import urllib.request, urllib.parse
import http.cookiejar

# limit on the number of posts to attempt to inspect each pass
POST_LIMIT = 100
# wait 5 minutes between checking
DELAY_SECONDS = 5 * 60
# API docs request no more than 1 request per 2 seconds
REDDIT_DELAY = 2
# don't link to a post older than 3 days
AGE_RANGE = 3 * 24 * 60 * 60
# formatting for a time object
TIME_FORMAT = "%a, %d %b %Y %I:%M:%S %p %Z"

from credentials import username, password
# allow the credentials to be overridden on the command line
if len(sys.argv) > 2: prog, username, password = sys.argv[:3]

LOGIN_DATA = {'user': username, 'passwd': password, 'api_type': 'json'}
SEARCH_URL = 'http://www.reddit.com/search.json'
# note: the reddit API guidelines ask clients to send a descriptive User-Agent
HTTP_HEADERS = {}

def compact_permalink(data):
    fmt = '/r/{subreddit:s}/comments/{id:s}'
    return fmt.format(**data)
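# Example (hypothetical values):
#   compact_permalink({'subreddit': 'pics', 'id': '2a3b4c'}) -> '/r/pics/comments/2a3b4c'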

def edit_distance(a, b):
    """Restricted Damerau-Levenshtein (optimal string alignment) distance."""
    m, n = len(a), len(b)
    # each row keeps its j=0 boundary cell in the final (index -1) slot
    prev, curr = None, list(range(1, n + 1)) + [0]
    for x in range(m):
        z, prev, curr = prev, curr, [0] * n + [x + 1]
        for y in range(n):
            delcost, addcost = prev[y] + 1, curr[y - 1] + 1
            subcost = prev[y - 1] + (a[x] != b[y])
            curr[y] = min(delcost, addcost, subcost)
            # adjacent transposition (needs the row from two iterations back)
            if (x > 0 and y > 0 and a[x] == b[y - 1] and a[x - 1] == b[y] and a[x] != b[y]):
                curr[y] = min(curr[y], z[y - 2] + 1)
    return curr[n - 1]
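# Sanity checks for the recurrence above:
#   edit_distance('kitten', 'sitting') == 3
#   edit_distance('abcd', 'abdc') == 1   # one adjacent transposition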

# this is just to help clean up the logic: normalize all bracket styles to []
replace_dict = {
    '(': '[', '{': '[', '[[': '[',
    ')': ']', '}': ']', ']]': ']',
}

def cleanup_tags(orig_title):
    title = orig_title
    while True:
        fnd = False
        for f, r in replace_dict.items():
            if f not in title: continue
            title, fnd = title.replace(f, r), True
        if not fnd: break
    return title.strip()
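# Example: cleanup_tags('Broken lamp {fixed}') -> 'Broken lamp [fixed]'
# (it loops until no variant remains, so '[[tag]]' also collapses to '[tag]')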

def clean(t): return cleanup_tags(t).strip()

keep_alphanum = re.compile(r"[^a-z0-9]", re.I)
r_whitespace = re.compile(r"\s")

def title_compare(a, b):
    cleaner = lambda x: r_whitespace.sub('', clean(x))
    return edit_distance(cleaner(a), cleaner(b))
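# Example: title_compare('Broken lamp [FIXED]', 'Broken  lamp') compares
# 'Brokenlamp[FIXED]' with 'Brokenlamp' -> 7 (the leftover tag characters)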

def search(data):
    query = data['title_clean']  # '%s reddit:%s' % (data['title_clean'], data['subreddit'])
    paramdict = {'q': query, 'sort': 'new', 'limit': 100}
    url = '{url:s}?{params:s}'.format(url=SEARCH_URL, params=urllib.parse.urlencode(paramdict))
    fixed_id, fixed_ctime = data['id'], data['created']
    req = urllib.request.Request(url, None, HTTP_HEADERS)
    with urllib.request.urlopen(req) as f:
        searchjson = json.loads(f.read().decode('utf8'))
    # keep only candidates older than the [fixed] post, but not too old
    related_links = [c['data'] for c in searchjson['data']['children']]
    related_links = [c for c in related_links if c['id'] != fixed_id and
                     fixed_ctime > c['created'] and fixed_ctime - c['created'] < AGE_RANGE]
    if not related_links: return None
    title = data['title_clean']
    original, min_edist = None, 10000
    for link in related_links:
        # m(x) returns True if property x differs between the [fixed] post and this link
        m = lambda x: link[x] != data[x]
        diffsubr, diffnsfw = 5 * m('subreddit_id'), 2 * m('over_18')
        edist = title_compare(link['title'], title) + diffsubr + diffnsfw
        if edist < min_edist: original, min_edist = link, edist
    return original

TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', 31)]
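# Each pair is (unit label, how many of it make the next unit); the cascade in
# build_comment walks this list, e.g. 90000 s -> 1500 m -> 25 h -> '1d'.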

def build_comment(fixdsubm, origsubm):
    notes, flags = [], []
    origlink = 'http://www.reddit.com' + compact_permalink(origsubm)
    ups, downs = origsubm['ups'], origsubm['downs']
    time.sleep(REDDIT_DELAY)
    url = 'http://www.reddit.com/r/{subreddit:s}/about.json'.format(**origsubm)
    with urllib.request.urlopen(url) as f:
        aboutjson = json.loads(f.read().decode('utf8'))['data']
    ftitle = fixdsubm['title_clean']
    otitle = origsubm['title']
    edist = title_compare(ftitle.lower(), otitle.lower())
    if origsubm['subreddit_id'] != fixdsubm['subreddit_id']:
        notes.append('*Link not posted to same subreddit: [{origsubr:s}](/r/{origsubr:s}) -> [{newsubr:s}](/r/{newsubr:s})*'.format(
            origsubr=origsubm['subreddit'], newsubr=fixdsubm['subreddit']))
        edist *= 1.5  # this lowers our confidence somewhat
    if origsubm['name'] != fixdsubm['name'] and origsubm['author'] == fixdsubm['author']:
        notes.append('*Fixed by original author!*')
        edist *= 0.2  # this actually raises our confidence greatly
    if 'true' in ftitle.lower():
        notes.append('*Based on a true story! `[citation needed]`*')
    if origsubm['over_18']: flags.append('NSFW')
    # if the original entry is liked and popular, tag it with a popular tag!
    vote_threshold = aboutjson['subscribers'] * 0.0005  # 0.05%
    # check the vote count first so an unvoted post can't divide by zero
    if ups + downs > vote_threshold and ups / (ups + downs) >= 0.75:
        flags.append('Popular')
    flen, olen = len(ftitle), len(otitle)
    conf = 100 - int(100 * edist / max(1, min(olen, flen)))
    if conf < 0: conf = 0
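    # e.g. an edit distance of 3 against a 30-character title gives
    # conf = 100 - int(100 * 3 / 30) = 90, which still rates as 'Strong' below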
    time_diff = int(fixdsubm['created'] - origsubm['created'])
    # cascade the raw seconds up to the largest sensible unit
    for unit, conv in TIME_UNITS:
        if time_diff < conv: break
        time_diff = int(time_diff / conv)
    confdesc = 'Strong'
    if conf < 90: confdesc = 'Moderate'
    if conf < 75: confdesc = 'Slight'
    if conf < 50: return None
    notifymsg = '**Incorrect match:** [{id:s}]({permalink:s}) *by {author:s}*'
    orig_is_self = bool(origsubm['selftext'] != '')
    message_data = {
        'botuser': username,
        'origtime': time.strftime(TIME_FORMAT, time.localtime(origsubm['created_utc'])),
        'direct': '' if orig_is_self else '[[Direct Link]({url:s})]'.format(**origsubm),
        'difftime': '{:d}{:s}'.format(time_diff, unit),
        'confdesc': confdesc,
        'confidence': conf,
        'flags': '**[{:s}]**'.format(','.join(flags)) if flags else '',
        'notes': '\n'.join(notes),
        'cparams': urllib.parse.urlencode({
            'subject': 'Error: ' + fixdsubm['id'],
            'message': notifymsg.format(**fixdsubm),
            'to': username,
        }),
        'notifyhelp': 'Please provide as much information as possible to help improve these results.',
    }
    # add in keywords from the original submission
    for k, v in origsubm.items():
        message_data[k] = v
    message = """
**Original Submission:** [{title:s}]({permalink:s} "Posted by {author:s} to /r/{subreddit:s} on {origtime:s}") {flags:s}
**Posted:** {difftime:s} before this post *by [{author:s}](/user/{author:s})*
**{confdesc:s} Confidence:** {confidence:d}% certain that these submissions are related.
{notes:s}
---
*Providing source for [FIXED] posts, linking to the original in comments*
*Confidence based on factors including title similarity and subreddit id.*
*This comment generated by an automated bot.* **[Is this match wrong?](/message/compose/?{cparams:s} "{notifyhelp:s}")**"""
    return '\n\n'.join(x for x in message.strip().splitlines() if x).format(**message_data)

ratelimit_rgx = re.compile(r'you are doing that too much. try again in (\d+) (\w+).', re.I)
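# Example match: 'you are doing that too much. try again in 7 minutes.'
# -> group(1) = '7', group(2) = 'minutes', converted to 420 seconds below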

def post_comment(opener, mhash, fixdsubm, origsubm):
    fixdsubm['permalink'] = compact_permalink(fixdsubm)
    origsubm['permalink'] = compact_permalink(origsubm)
    POST_DATA = {
        'r': fixdsubm['subreddit'],
        'text': build_comment(fixdsubm, origsubm),
        'thing_id': fixdsubm['name'],
        'uh': mhash,
    }
    # lack the confidence to associate these posts
    if POST_DATA['text'] is None: return -1
    time.sleep(REDDIT_DELAY)
    post_params = urllib.parse.urlencode(POST_DATA).encode('utf8')
    # use the opener that was passed in (it carries the login cookies)
    with opener.open('http://www.reddit.com/api/comment', post_params) as f:
        json_data = f.read().decode('utf8')
    match = ratelimit_rgx.search(json_data)
    if match is not None:
        unit = 60 if match.group(2).startswith('min') else 1
        return unit * int(match.group(1))
    return 0

def original_posted(fixedsub, origsubm):
    time.sleep(REDDIT_DELAY)
    comments_url = 'http://www.reddit.com' + compact_permalink(fixedsub) + '.json'
    with urllib.request.urlopen(comments_url, None) as f:
        comment_data = json.loads(f.read().decode('utf8'))
    fposter = comment_data[0]['data']['children'][0]['data']['author']
    origid = origsubm['id']
    for comment in comment_data[1]['data']['children']:
        # 'more' stubs in the listing carry no author/body, so use .get()
        cauth = comment['data'].get('author', '')
        ctext = comment['data'].get('body', '')
        if cauth in (username, fposter) and ('orig' in ctext.lower() or origid in ctext):
            return cauth
    return None

class SmartRedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result
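# (Follows the redirect as usual but stashes the original 301/302 code on the
#  response, presumably so a caller could tell a login bounce from a plain 200.)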

try:
    cj = http.cookiejar.CookieJar()
    cj_opener = urllib.request.build_opener(SmartRedirectHandler(),
                                            urllib.request.HTTPCookieProcessor(cj))
    login_params = urllib.parse.urlencode(LOGIN_DATA).encode('utf8')
    with cj_opener.open('http://www.reddit.com/api/login/' + username, login_params) as f:
        login_response = json.loads(f.read().decode('utf8'))
    modhash = login_response['json']['data']['modhash']
    print('modhash: ' + modhash)
except Exception as e:
    print('Login failed...')
    raise

if __name__ == '__main__':
    last_check = None
    ratelimit = 0  # so the sleep below works even if the first pass raises early
    time.sleep(REDDIT_DELAY)
    while True:
        try:
            get_fixed = urllib.parse.urlencode({'q': 'fixed', 'sort': 'new', 'limit': POST_LIMIT})
            req = urllib.request.Request('{:s}?{:s}'.format(SEARCH_URL, get_fixed), None, HTTP_HEADERS)
            with urllib.request.urlopen(req) as f:
                fixed = json.loads(f.read().decode('utf8'))
            submissions = [c['data'] for c in fixed['data']['children']]
            # no ratelimit set
            ratelimit = 0
            # check all submissions, stop when we get to where we left off
            for fixdsubm in submissions:
                if fixdsubm['id'] == last_check: break
                time.sleep(REDDIT_DELAY)
                # strip the [...fixed...] tag from the title to recover the original title
                clean_title = cleanup_tags(fixdsubm['title'])
                fixed_ndx = clean_title.lower().rfind('fixed')
                if fixed_ndx == -1: continue
                bracketL = clean_title.rfind('[', 0, fixed_ndx)
                bracketR = clean_title.find(']', fixed_ndx)
                if -1 in (bracketL, bracketR): continue
                fixdsubm['title_clean'] = (clean_title[:bracketL] + clean_title[bracketR + 1:]).strip()
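                # e.g. 'Broken lamp [fixed]' -> '[' at 12, ']' at 18 -> 'Broken lamp'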
                original = search(fixdsubm)
                if original is None: continue
                # skip if someone (or this bot) already pointed out the original
                oposter = original_posted(fixdsubm, original)
                if oposter == username: break
                if oposter is not None: continue
                # let's post this thing!
                ratelimit = post_comment(cj_opener, modhash, fixdsubm, original)
                if ratelimit > 0:
                    print('RATELIMITED: %d seconds...' % ratelimit)
                    break
                if ratelimit == 0: print('Posting info for [%s]' % compact_permalink(fixdsubm))
            # save the first entry of this set so that we can stop at this point next time
            if submissions: last_check = submissions[0]['id']
        except Exception as e:
            print('>>> ERROR: {:s} occurred...'.format(e.__class__.__name__))
        # sleep for no less than the requested delay between passes, longer if ratelimited
        sleeptime = max(ratelimit, REDDIT_DELAY, DELAY_SECONDS)
        time.sleep(sleeptime)