adammenges/ml-bot.py

## ml-bot.py
##############################################################################
##                                                                          ##
##                                                                          ##
##                      ( \/ )(  )   ___(  _ \ /  \(_  _)                   ##
##                      / \/ \/ (_/\(___)) _ ((  O ) )(                     ##
##                      \_)(_/\____/    (____/ \__/ (__)                    ##
##                                                                          ##
##                                                                          ##
##                                                                          ##
##   The beginnings of an ml-bot. To start, he'll let us know when          ##
##   there is a trending post on r/machinelearning                          ##
##                                                                          ##
##                                                                          ##
##############################################################################

from boltons.cacheutils import LRU
import pickle
import os
import json
import requests

# File in which the set of already-announced URLs is persisted between runs.
pushed_path = 'pushed.pickle'
# User-Agent header sent with every GET issued by requests_get().
USER_AGENT = "linux:com.adammenges.herpaderp-mlbot:v0.00000001 (by adammenges)"

# Listing entries from the most recent fetches, keyed by each post's 'url'.
cache = LRU(max_size=10000)

# URLs that have already been announced.  Restored from disk when possible.
pushed = None
if os.path.exists(pushed_path):
	try:
		# NOTE(review): pickle can execute arbitrary code while loading; only
		# ever point pushed_path at a file this bot wrote itself.
		with open(pushed_path, 'rb') as f:
			pushed = pickle.load(f)
	except (EOFError, pickle.UnpicklingError) as e:
		# A dump that was interrupted mid-write leaves a truncated file.
		# Start with an empty history instead of crashing at import time.
		print('ERROR: could not read ' + pushed_path + ': ' + str(e))
if pushed is None:
	pushed = LRU(max_size=2000)

# Should be moved over to helper lib
def requests_get(url):
	"""Fetch ``url`` and return the response body as text.

	The request carries the module's USER_AGENT header and a 60 second
	timeout.  A rate-limited response ("Too Many Requests") is retried after
	a 30 second pause, for as long as the server keeps answering that way.

	Args:
		url: Address to GET.

	Returns:
		The response text when the content type mentions html or json.
		"" when the request raised (the error is printed), when the
		content type is something else, or when no content type was sent.
	"""
	# Imported here because the module never imports time at file scope.
	import time

	# Loop instead of recursing so a long rate-limit spell cannot exhaust
	# the interpreter's recursion limit.
	while True:
		try:
			response = requests.get(
				url,
				headers={'User-Agent': USER_AGENT},
				timeout=60,
				# NOTE(review): TLS certificate verification is disabled here;
				# confirm this is still intended.
				verify=False,
			)
		except Exception as e:
			print("ERROR: " + str(e))
			return ""

		# reason may be None, so guard before the substring test.
		rate_limited = (
			response.status_code == 429
			or 'Too Many Requests' in (response.reason or '')
		)
		if not rate_limited:
			break
		print("Pausing...")
		time.sleep(30)

	if 'content-type' in response.headers:
		content_type = response.headers['content-type']
		if 'html' in content_type or 'json' in content_type:
			return response.text
		print('Not HTML/JSON') # Proper logging at some point
	return "" # Support PDFs, etc., later...

# Do some work here once we have enough data to choose a good threshold
def running_mean():
	"""Placeholder with no implementation yet; takes nothing, returns None."""
	return None

def refresh_data():
	"""Download the r/machinelearning listing and store its posts in ``cache``.

	Each child of the listing is stored under its 'url' value.  Any failure
	(empty download, malformed JSON, missing keys) is printed and swallowed
	so the caller's polling loop keeps running.
	"""
	print('Starting to get r/ml data')
	try:
		body = requests_get('http://reddit.com/r/machinelearning.json')
		if not body:
			# requests_get returns "" on failure; skip parsing instead of
			# reporting a misleading JSON decode error.
			print('ERROR: no data returned for r/machinelearning')
			return
		listing = json.loads(body)
		for child in listing['data']['children']:
			post = child['data']
			cache[post['url']] = post
	except Exception as e:
		print('ERROR: ' + str(e))

def push(url):
	"""Announce ``url`` through the Slack webhook and remember it as pushed.

	Args:
		url: Link of the post to announce.

	Returns:
		The response object returned by ``requests.post``.
	"""
	push_url = 'https://hooks.slack.com/services/****'
	headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
	# Recorded before the POST, so a URL is announced at most once.
	pushed[url] = True
	payload = {
		'text': "New top post on r/machinelearning! \n\n <{}>".format(url),
		"username": "ml-bot",
		"icon_emoji": ":smiling_imp:"
	}
	# Actually send the JSON headers (they were built but never used), and
	# bound the wait so an unresponsive webhook cannot stall the bot forever.
	r = requests.post(
		push_url,
		data=json.dumps(payload),
		headers=headers,
		timeout=60,
	)
	# Persist the history so a restart does not re-announce old posts.
	with open(pushed_path, 'wb') as f:
		pickle.dump(pushed, f)
	return r

def run():
	"""Poll forever: refresh the cache, announce new popular posts, sleep."""
	import time

	while True:
		refresh_data()
		for link, post in cache.items():
			# A post qualifies once it has more than 50 upvotes.
			popular = post['ups'] > 50
			if popular and link not in pushed:
				push(link)
		print('sleeping...')
		time.sleep(600)
		# Warn before the next cycle so the process is not killed by accident
		# right as it starts, which could ruin the dumped cache.
		print('about to begin...')
		time.sleep(20)

# Start the bot; run() loops forever, polling and announcing posts.
run()
	##############################################################################
	## ##
	## ##
	## ( \/ )( ) ___( _ \ / \(_ _) ##
	## / \/ \/ (_/\(___)) _ (( O ) )( ##
	## \_)(_/\____/ (____/ \__/ (__) ##
	## ##
	## ##
	## ##
	## The beginnings of an ml-bot. To start, he'll let us know when ##
	## there is a trending post on r/machinelearning ##
	## ##
	## ##
	##############################################################################

	from boltons.cacheutils import LRU
	import pickle
	import os
	import json
	import requests

	# NOTE(review): this section repeats the definitions found earlier in the
	# file; its indentation had been flattened and is restored here.
	# File in which the set of already-announced URLs is persisted between runs.
	pushed_path = 'pushed.pickle'
	# User-Agent header sent with every GET issued by requests_get().
	USER_AGENT = "linux:com.adammenges.herpaderp-mlbot:v0.00000001 (by adammenges)"

	# Listing entries from the most recent fetches, keyed by each post's 'url'.
	cache = LRU(max_size=10000)

	# URLs that have already been announced.  Restored from disk when possible.
	pushed = None
	if os.path.exists(pushed_path):
		try:
			# NOTE(review): pickle can execute arbitrary code while loading;
			# only ever point pushed_path at a file this bot wrote itself.
			with open(pushed_path, 'rb') as f:
				pushed = pickle.load(f)
		except (EOFError, pickle.UnpicklingError) as e:
			# A dump that was interrupted mid-write leaves a truncated file.
			# Start with an empty history instead of crashing.
			print('ERROR: could not read ' + pushed_path + ': ' + str(e))
	if pushed is None:
		pushed = LRU(max_size=2000)

	# Should be moved over to helper lib
	def requests_get(url):
		"""Fetch ``url`` and return the response body as text.

		The request carries the module's USER_AGENT header and a 60 second
		timeout.  A rate-limited response ("Too Many Requests") is retried
		after a 30 second pause, for as long as the server keeps answering
		that way.

		Args:
			url: Address to GET.

		Returns:
			The response text when the content type mentions html or json.
			"" when the request raised (the error is printed), when the
			content type is something else, or when no content type was sent.
		"""
		# Imported here because the module never imports time at file scope.
		import time

		# Loop instead of recursing so a long rate-limit spell cannot exhaust
		# the interpreter's recursion limit.
		while True:
			try:
				response = requests.get(
					url,
					headers={'User-Agent': USER_AGENT},
					timeout=60,
					# NOTE(review): TLS certificate verification is disabled
					# here; confirm this is still intended.
					verify=False,
				)
			except Exception as e:
				print("ERROR: " + str(e))
				return ""

			# reason may be None, so guard before the substring test.
			rate_limited = (
				response.status_code == 429
				or 'Too Many Requests' in (response.reason or '')
			)
			if not rate_limited:
				break
			print("Pausing...")
			time.sleep(30)

		if 'content-type' in response.headers:
			content_type = response.headers['content-type']
			if 'html' in content_type or 'json' in content_type:
				return response.text
			print('Not HTML/JSON') # Proper logging at some point
		return "" # Support PDFs, etc., later...

	# Do some work here once we have enough data to choose a good threshold
	def running_mean():
		"""Placeholder with no implementation yet; takes nothing, returns None."""
		return None

	def refresh_data():
		"""Download the r/machinelearning listing and store its posts in ``cache``.

		Each child of the listing is stored under its 'url' value.  Any
		failure (empty download, malformed JSON, missing keys) is printed and
		swallowed so the caller's polling loop keeps running.
		"""
		print('Starting to get r/ml data')
		try:
			body = requests_get('http://reddit.com/r/machinelearning.json')
			if not body:
				# requests_get returns "" on failure; skip parsing instead of
				# reporting a misleading JSON decode error.
				print('ERROR: no data returned for r/machinelearning')
				return
			listing = json.loads(body)
			for child in listing['data']['children']:
				post = child['data']
				cache[post['url']] = post
		except Exception as e:
			print('ERROR: ' + str(e))

	def push(url):
		"""Announce ``url`` through the Slack webhook and remember it as pushed.

		Args:
			url: Link of the post to announce.

		Returns:
			The response object returned by ``requests.post``.
		"""
		push_url = 'https://hooks.slack.com/services/****'
		headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
		# Recorded before the POST, so a URL is announced at most once.
		pushed[url] = True
		payload = {
			'text': "New top post on r/machinelearning! \n\n <{}>".format(url),
			"username": "ml-bot",
			"icon_emoji": ":smiling_imp:"
		}
		# Actually send the JSON headers (they were built but never used), and
		# bound the wait so an unresponsive webhook cannot stall the bot.
		r = requests.post(
			push_url,
			data=json.dumps(payload),
			headers=headers,
			timeout=60,
		)
		# Persist the history so a restart does not re-announce old posts.
		with open(pushed_path, 'wb') as f:
			pickle.dump(pushed, f)
		return r

	def run():
		"""Poll forever: refresh the cache, announce new popular posts, sleep."""
		import time

		while True:
			refresh_data()
			for link, post in cache.items():
				# A post qualifies once it has more than 50 upvotes.
				popular = post['ups'] > 50
				if popular and link not in pushed:
					push(link)
			print('sleeping...')
			time.sleep(600)
			# Warn before the next cycle so the process is not killed by
			# accident right as it starts, which could ruin the dumped cache.
			print('about to begin...')
			time.sleep(20)

	# NOTE(review): the run() call earlier in the file loops forever, so this
	# second call is presumably never reached - confirm and consider removing.
	run()