Skip to content

Instantly share code, notes, and snippets.

@adammenges
Last active August 29, 2015 14:20
Show Gist options
  • Save adammenges/c76a5df91ef0e15d67f9 to your computer and use it in GitHub Desktop.
Save adammenges/c76a5df91ef0e15d67f9 to your computer and use it in GitHub Desktop.
ml-bot
##############################################################################
## ##
## ##
## ( \/ )( ) ___( _ \ / \(_ _) ##
## / \/ \/ (_/\(___)) _ (( O ) )( ##
## \_)(_/\____/ (____/ \__/ (__) ##
## ##
## ##
## ##
## The beginnings of an ml-bot. To start, he'll let us know when ##
## there is a trending post on r/machinelearning ##
## ##
## ##
##############################################################################
import json
import os
import pickle
import time

import requests
from boltons.cacheutils import LRU
# File that remembers, across restarts, which URLs were already announced.
pushed_path = 'pushed.pickle'
USER_AGENT = "linux:com.adammenges.herpaderp-mlbot:v0.00000001 (by adammenges)"

# Posts from the most recent listing fetches, keyed by post URL.
cache = LRU(max_size=10000)

# URLs already announced. Restored from disk when a previous run left a
# pickle behind; otherwise a fresh, empty LRU.
# NOTE(security): pickle.load can execute arbitrary code from the file. This
# is only acceptable as long as the file is written by this script alone.
pushed = None
if os.path.exists(pushed_path):
    try:
        with open(pushed_path, 'rb') as f:
            pushed = pickle.load(f)
    except (EOFError, pickle.UnpicklingError) as e:
        # An interrupted dump leaves an empty or truncated file behind;
        # report it and start over instead of refusing to start at all.
        print('ERROR: could not read ' + pushed_path + ': ' + str(e))
if pushed is None:
    pushed = LRU(max_size=2000)
# Should be moved over to helper lib
def requests_get(url):
    """Fetch *url* and return the response body as text.

    The request is sent with the bot's ``USER_AGENT`` header and a 60 second
    timeout. While the server answers "Too Many Requests" (HTTP 429) the
    call sleeps 30 seconds and tries again.

    Returns the body text when the response's content type mentions
    ``html`` or ``json``. Returns ``""`` when the request raises, when the
    content type is something else, or when the header is missing.
    """
    # Retry in a loop rather than by recursion, so a long rate-limit spell
    # cannot run into the interpreter's recursion limit.
    while True:
        try:
            response = requests.get(
                url,
                headers={'User-Agent': USER_AGENT},
                timeout=60,
                # NOTE(review): certificate verification is switched off
                # here; confirm that is still wanted before using HTTPS URLs.
                verify=False,
            )
        except Exception as e:
            print("ERROR: " + str(e))
            return ""
        # `reason` may be None, so guard it before the substring test.
        rate_limited = (
            response.status_code == 429
            or 'Too Many Requests' in (response.reason or '')
        )
        if not rate_limited:
            break
        print("Pausing...")
        time.sleep(30)

    content_type = response.headers.get('content-type', '')
    if 'html' in content_type or 'json' in content_type:
        return response.text
    print('Not HTML/JSON')  # Proper logging at some point
    return ""  # Support PDFs, etc., later...
# Do some work here once we have enough data to choose a good threshold
def running_mean():
    """Placeholder with no implementation yet; always returns None."""
    return None
def refresh_data():
    """Download the r/machinelearning listing and store its posts in ``cache``.

    Every entry of ``data.children`` in the returned JSON is stored in the
    module-level ``cache`` under the post's ``url``. Failures are printed
    and swallowed, so the caller's polling loop keeps running.
    """
    try:
        print('Starting to get r/ml data')
        body = requests_get('http://reddit.com/r/machinelearning.json')
        if not body:
            # Nothing usable came back; say so plainly instead of letting
            # the JSON parser fail on an empty value.
            print('ERROR: empty response from reddit')
            return
        listing = json.loads(body)
        for child in listing['data']['children']:
            post = child['data']
            cache[post['url']] = post
    except Exception as e:
        print('ERROR: ' + str(e))
def push(url):
    """Announce *url* through the Slack incoming webhook.

    The URL is recorded in ``pushed`` and the record is written to
    ``pushed_path`` only when Slack answers with a success status, so a
    failed delivery is attempted again on a later polling cycle.

    Returns the ``requests`` response object of the webhook call.
    Exceptions raised by ``requests.post`` propagate to the caller.
    """
    push_url = 'https://hooks.slack.com/services/****'
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    payload = {
        'text': "New top post on r/machinelearning! \n\n <{}>".format(url),
        "username": "ml-bot",
        "icon_emoji": ":smiling_imp:"
    }
    # Send the JSON headers along with the body, and bound the wait so a
    # stalled connection cannot hang the bot forever.
    r = requests.post(
        push_url,
        data=json.dumps(payload),
        headers=headers,
        timeout=60,
    )
    if r.ok:
        pushed[url] = True
        with open(pushed_path, 'wb') as f:
            pickle.dump(pushed, f)
    else:
        print('ERROR: Slack webhook returned ' + str(r.status_code))
    return r
def run(threshold=50, interval=600):
    """Poll reddit forever and announce posts that have become popular.

    Each cycle refreshes ``cache``, pushes every cached post with more than
    *threshold* upvotes that is not yet in ``pushed``, and then sleeps for
    *interval* seconds. The defaults match the previously hard-coded values.
    This function never returns.
    """
    while True:
        refresh_data()
        for url, item in cache.items():
            if item.get('ups', 0) > threshold and url not in pushed:
                try:
                    push(url)
                except Exception as e:
                    # One failed notification must not stop the bot; the
                    # post is looked at again on the next cycle.
                    print('ERROR: ' + str(e))
        print('sleeping...')
        time.sleep(interval)
if __name__ == '__main__':
    # Pause before starting, so an accidental launch can be killed right
    # away without interrupting a write to the pickle file and ruining it.
    print('about to begin...')
    time.sleep(20)
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment