@zakwilson
Created November 18, 2012 15:11
reddit spam filter
description "train the spam filter"
instance $x
stop on reddit-stop or runlevel [016]
respawn
respawn limit 10 5
nice 10
script
. /etc/default/reddit
wrap-job paster run --proctitle learn_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/learn.py -c 'run()'
end script
description "check spam for newly submitted links"
instance $x
stop on reddit-stop or runlevel [016]
respawn
respawn limit 10 5
nice 10
script
. /etc/default/reddit
wrap-job paster run --proctitle spam_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/spam.py -c 'run()'
end script

# This is my current declare_queues(). You must also add spam_q to the
# consumer-counts file (see the note after this function).
def declare_queues():
    queues = Queues({
        "scraper_q": MessageQueue(),
        "newcomments_q": MessageQueue(),
        "commentstree_q": MessageQueue(),
        "commentstree_fastlane_q": MessageQueue(),
        "vote_link_q": MessageQueue(bind_to_self=True),
        "vote_comment_q": MessageQueue(bind_to_self=True),
        "vote_fastlane_q": MessageQueue(bind_to_self=True),
        "log_q": MessageQueue(bind_to_self=True),
        "usage_q": MessageQueue(bind_to_self=True, durable=False),
        "cloudsearch_changes": MessageQueue(bind_to_self=True),
        "update_promos_q": MessageQueue(bind_to_self=True),
        "spam_q": MessageQueue(),
        "learn_q": MessageQueue(),
    })

    queues.cloudsearch_changes << "search_changes"
    queues.scraper_q << "new_link"
    queues.newcomments_q << "new_comment"
    queues.commentstree_q << "new_comment"
    queues.commentstree_fastlane_q << "new_fastlane_comment"

    # route new submissions/comments to the classifier, ban/unban events to the trainer
    queues.spam_q << "new_link"
    queues.spam_q << "new_comment"
    queues.learn_q << "ban"
    queues.learn_q << "unban"

    return queues
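
The gist doesn't show the consumer-counts change itself. Assuming that file simply maps each queue name to the number of consumer processes to run (which is what the $x instance variable in the upstart jobs above selects), the new entries would look something like this, with one process per queue being enough to start with:

spam_q 1
learn_q 1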

# Changed: now uses amqp. Add this to the imports:
from r2.lib import amqp
# and this to spam(), after the t._commit() call:
amqp.add_item('ban', t._fullname)
# and this to unspam():
amqp.add_item('unban', t._fullname)
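
For orientation, here is a rough sketch of where those two calls sit. It assumes spam() and unspam() loop over the affected things as t and commit each one; their real bodies live in reddit's admin code and are not part of this gist, so they are sketched here as plain functions with the existing logic elided.

from r2.lib import amqp

def spam(things, **kw):
    for t in things:
        # ... existing logic that marks t as spam (not shown) ...
        t._commit()
        # new: hand the fullname to learn_q so the filter trains on it as spam
        amqp.add_item('ban', t._fullname)

def unspam(things, **kw):
    for t in things:
        # ... existing logic that clears the spam flag (not shown) ...
        t._commit()
        # new: hand the fullname to learn_q so the filter trains on it as ham
        amqp.add_item('unban', t._fullname)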

# r2/lib/learn.py -- the learn_q consumer: trains classifyr.com on things
# that have been banned or unbanned
from pylons import g, config
from r2.models.link import Link, Comment
from r2.lib import amqp
from r2.lib.spam import classifyr_request, classify_link
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
import json
import traceback  # used by run() below; see the fix noted in the comments

def learn(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1},
                       'category': 'spam' if link._spam else 'ham'})
    classifyr_request('/api/simple-spam/learn', body)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            learn(link)

        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('learn_q', process_link)

# r2/lib/spam.py -- initial classifyr.com support: the spam_q consumer that
# scores new links/comments and marks or bans them
from pylons import g, config
from r2.models.link import Link, Comment
from r2.models.account import Account  # needed for Account._byID below
from r2.lib import amqp
from hashlib import sha1
from r2.lib.db.queries import ban
import httplib, urllib, json
import traceback  # used by run() below; see the fix noted in the comments
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException

def classifyr_request(endpoint, body):
    # sign the request body with the shared secret so classifyr.com can verify it
    checksum = sha1(body + g.classifyr_key).hexdigest()
    headers = {"classifyr-api-checksum": checksum,
               "classifyr-api-user": g.classifyr_username}
    conn = httplib.HTTPConnection("classifyr.com")
    conn.request('POST', endpoint, body, headers)
    resp = conn.getresponse()
    return resp.read()

def classify_link(link):
    msg = ""
    url = ""
    if hasattr(link, 'selftext'):
        # self post
        msg = link.selftext
    elif hasattr(link, 'body'):
        # comment
        msg = link.body
    if hasattr(link, 'url') and link.url[0:3] != "/r/":
        # not self post
        url = link.url
    body = json.dumps({'message': msg,
                       'url': url,
                       'features': {'author:' + str(link.author_id): 1}})
    return int(classifyr_request('/api/simple-spam/score', body))

def spam_if_classified(link):
    account = Account._byID(link.author_id)
    if account._spam:
        # author is already banned: spam the thing without asking the classifier
        link._spam = True
        link._commit()
        ban(link)
        return True
    score = classify_link(link)
    multiplier = 1
    karma = account.link_karma + account.comment_karma
    if karma < 20:
        # low-karma accounts get less benefit of the doubt
        multiplier *= 1.5
    score *= multiplier
    if score >= int(g.spam_threshold):  # spam
        link._spam = True
        link._commit()
    if score >= int(g.blackhole_threshold):  # kill it with fire!
        ban(link)

def run():
    def process_link(msg):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True)
            spam_if_classified(link)

        fname = msg.body
        try:
            TimeoutFunction(_process_link, 30)(fname)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:
            raise
        except:
            print "Error fetching %s" % fname
            print traceback.format_exc()

    amqp.consume_items('spam_q', process_link)
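
spam.py leans on four values read off pylons g that are not defined anywhere in this gist: g.classifyr_username and g.classifyr_key authenticate against classifyr.com, while g.spam_threshold and g.blackhole_threshold are the scores at which a thing is merely marked spam versus banned outright. Assuming they are set as plain keys in the reddit ini the consumers run against, the additions would look something like this (values are only illustrative; tune the thresholds to your traffic):

classifyr_username = your-classifyr-user
classifyr_key = your-classifyr-secret
spam_threshold = 50
blackhole_threshold = 90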

# This is a script for learning everything in your reddit as spam or ham,
# depending on how you've already marked it.
from r2.lib.learn import train_link, learn
from r2.lib.spam import classify_link
from r2.models.link import Link, Comment

def learn_spam_range(start, finish):
    for i in xrange(start, finish):
        # data=True so selftext/body/url are actually loaded for learn()
        l = Link._byID(i, data=True)
        try:
            learn(l)
        except Exception:
            # ids in the range may be missing or broken; skip them
            pass
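
This one is meant to be run by hand from a shell against the live config (for example a pylons paster shell on your reddit ini) rather than as a consumer; the id range below is only an example.

learn_spam_range(1, 100000)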

# This is a script to run the filter on every link in a sub, or to ban/train
# everything in a sub that contains only spam.
from r2.models.link import Link
from r2.models.subreddit import Subreddit
from r2.lib.spam import spam_if_classified
from r2.lib.learn import learn

def check_links(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        spam_if_classified(link)

def spam_all(s):
    s = Subreddit._by_name(s)
    ls = s.get_links('new', 'all')
    ls.fetch()
    ll = [Link._by_fullname(x[0], data=True) for x in ls.data]
    for link in ll:
        link._spam = True
        link._commit()
        learn(link)
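
Like the previous script, these are called by hand; the subreddit names here are placeholders.

check_links('somesubreddit')   # score everything in /r/somesubreddit
spam_all('knownspamsub')       # ban and train on everything in /r/knownspamsub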
@xsleonard

Thanks

Some fixes:

r2/lib/spam.py, line 42:
link._commit()

r2/lib/spam.py and r2/lib/learn.py should
import traceback
