@zakwilson
Created November 18, 2012 15:11
reddit spam filter
description "train the spam filter"
instance $x
stop on reddit-stop or runlevel [016]
respawn
respawn limit 10 5
nice 10
script
. /etc/default/reddit
wrap-job paster run --proctitle learn_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/learn.py -c 'run()'
end script
description "check spam for newly submitted links"
instance $x
stop on reddit-stop or runlevel [016]
respawn
respawn limit 10 5
nice 10
script
. /etc/default/reddit
wrap-job paster run --proctitle spam_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/spam.py -c 'run()'
end script

# This is my current declare_queues(). You must also add spam_q to the
# consumer-counts file (see the note after this function).
def declare_queues():
    queues = Queues({
        "scraper_q": MessageQueue(),
        "newcomments_q": MessageQueue(),
        "commentstree_q": MessageQueue(),
        "commentstree_fastlane_q": MessageQueue(),
        "vote_link_q": MessageQueue(bind_to_self=True),
        "vote_comment_q": MessageQueue(bind_to_self=True),
        "vote_fastlane_q": MessageQueue(bind_to_self=True),
        "log_q": MessageQueue(bind_to_self=True),
        "usage_q": MessageQueue(bind_to_self=True, durable=False),
        "cloudsearch_changes": MessageQueue(bind_to_self=True),
        "update_promos_q": MessageQueue(bind_to_self=True),
        "spam_q": MessageQueue(),
        "learn_q": MessageQueue(),
    })

    queues.cloudsearch_changes << "search_changes"
    queues.scraper_q << "new_link"
    queues.newcomments_q << "new_comment"
    queues.commentstree_q << "new_comment"
    queues.commentstree_fastlane_q << "new_fastlane_comment"

    # route new submissions/comments to the classifier, ban/unban events to the trainer
    queues.spam_q << "new_link"
    queues.spam_q << "new_comment"
    queues.learn_q << "ban"
    queues.learn_q << "unban"

    return queues
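
The gist doesn't show the consumer-counts change itself. Assuming that file simply maps each queue name to the number of consumer processes to run (which is what the $x instance variable in the upstart jobs above selects), the new entries would look something like this, with one process per queue being enough to start with:

spam_q 1
learn_q 1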

# Changed: now uses amqp. Add this to the imports:
from r2.lib import amqp
# and this to spam(), after the t._commit() call:
amqp.add_item('ban', t._fullname)
# and this to unspam():
amqp.add_item('unban', t._fullname)
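
For orientation, here is a rough sketch of where those two calls sit. It assumes spam() and unspam() loop over the affected things as t and commit each one; their real bodies live in reddit's admin code and are not part of this gist, so they are sketched here as plain functions with the existing logic elided.

from r2.lib import amqp

def spam(things, **kw):
    for t in things:
        # ... existing logic that marks t as spam (not shown) ...
        t._commit()
        # new: hand the fullname to learn_q so the filter trains on it as spam
        amqp.add_item('ban', t._fullname)

def unspam(things, **kw):
    for t in things:
        # ... existing logic that clears the spam flag (not shown) ...
        t._commit()
        # new: hand the fullname to learn_q so the filter trains on it as ham
        amqp.add_item('unban', t._fullname)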

# r2/lib/learn.py -- the learn_q consumer: trains classifyr.com on things
# that have been banned or unbanned
from pylons import g, config
from r2.models.link import Link, Comment
from r2.lib import amqp
from r2.lib.spam import classifyr_request, classify_link
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
import json
import traceback  # used by run() below; see the fix noted in the comments

def learn(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1},
                       'category': 'spam' if link._spam else 'ham'})
    classifyr_request('/api/simple-spam/learn', body)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            learn(link)

        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('learn_q', process_link)

# r2/lib/spam.py -- initial classifyr.com support: the spam_q consumer that
# scores new links/comments and marks or bans them
from pylons import g, config
from r2.models.link import Link, Comment
from r2.models.account import Account  # needed for Account._byID below
from r2.lib import amqp
from hashlib import sha1
from r2.lib.db.queries import ban
import httplib, urllib, json
import traceback  # used by run() below; see the fix noted in the comments
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException

def classifyr_request(endpoint, body):
    # sign the request body with the shared secret so classifyr.com can verify it
    checksum = sha1(body + g.classifyr_key).hexdigest()
    headers = {"classifyr-api-checksum": checksum,
               "classifyr-api-user": g.classifyr_username}
    conn = httplib.HTTPConnection("classifyr.com")
    conn.request('POST', endpoint, body, headers)
    resp = conn.getresponse()
    return resp.read()

def classify_link(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1}})
    return int(classifyr_request('/api/simple-spam/score', body))

def spam_if_classified(link):
    account = Account._byID(link.author_id)
    if account._spam:
        # author is already banned: spam the thing without asking the classifier
        link._spam = True
        link._commit()
        ban(link)
        return True
    score = classify_link(link)
    multiplier = 1
    karma = account.link_karma + account.comment_karma
    if karma < 20:
        # low-karma accounts get less benefit of the doubt
        multiplier *= 1.5
    score *= multiplier
    if score >= int(g.spam_threshold):  # spam
        link._spam = True
        link._commit()
    if score >= int(g.blackhole_threshold):  # kill it with fire!
        ban(link)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            spam_if_classified(link)

        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('spam_q', process_link)
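
spam.py leans on four values read off pylons g that are not defined anywhere in this gist: g.classifyr_username and g.classifyr_key authenticate against classifyr.com, while g.spam_threshold and g.blackhole_threshold are the scores at which a thing is merely marked spam versus banned outright. Assuming they are set as plain keys in the reddit ini the consumers run against, the additions would look something like this (values are only illustrative; tune the thresholds to your traffic):

classifyr_username = your-classifyr-user
classifyr_key = your-classifyr-secret
spam_threshold = 50
blackhole_threshold = 90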

# This is a script for learning everything in your reddit as spam or ham,
# depending on how you've already marked it.
from r2.lib.learn import train_link, learn
from r2.lib.spam import classify_link
from r2.models.link import Link, Comment

def learn_spam_range(start, finish):
    for i in xrange(start, finish):
        # data=True so selftext/body/url are actually loaded for learn()
        l = Link._byID(i, data=True)
        try:
            learn(l)
        except Exception:
            # ids in the range may be missing or broken; skip them
            pass
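
This one is meant to be run by hand from a shell against the live config (for example a pylons paster shell on your reddit ini) rather than as a consumer; the id range below is only an example.

learn_spam_range(1, 100000)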

# This is a script to run the filter on every link in a sub, or to ban/train
# everything in a sub that contains only spam.
from r2.models.link import Link
from r2.models.subreddit import Subreddit
from r2.lib.spam import spam_if_classified
from r2.lib.learn import learn

def check_links(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        spam_if_classified(link)

def spam_all(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        link._spam = True
        link._commit()
        learn(link)
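
Like the previous script, these are called by hand; the subreddit names here are placeholders.

check_links('somesubreddit')   # score everything in /r/somesubreddit
spam_all('knownspamsub')       # ban and train on everything in /r/knownspamsub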
@xsleonard

Thanks

Some fixes:

r2/lib/spam.py, line 42:
link._commit()

r2/lib/spam.py and r2/lib/learn.py should
import traceback
