zopieux/ircbot_url_title.py

## ircbot_url_title.py
#!/usr/bin/env python3
#-*- encoding: utf-8 -*-

# ircbot_url_title.py
# Simple bot which displays the titles of URLs posted on a channel
#
# Copyright (c) 2010 Mick@el and Zopieux
# Copyright (c) 2014 gustavi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import time
from urllib import parse
import re
import unicodedata
import difflib

import requests
from pypeul import *
from bs4 import BeautifulSoup
from itertools import groupby

# Identity of the bot and the IRC endpoint it connects to
BOT_NAME, BOT_CHAN = 'SimpleB00t', '##gustavi'
BOT_SERVER, BOT_PORT = 'irc.freenode.net', 6667

# Tunables read by SimpleUrlBot
settings = dict(
    # Similarity threshold between the url and the title, compared in
    # SimpleUrlBot.show_url()
    ratio=0.6,
    # Maximum number of urls remembered in 'buffer_url'
    buffer_url_len=25,
    # Titles shorter than this are not displayed
    title_len_min=18,
    # Titles longer than this are truncated
    title_len_max=136,
    # Largest advertised content-length (in bytes) that is still downloaded
    content_length_max=2097152,
)

# Most recently announced urls, oldest first
buffer_url = list()

# Domains whose titles are never displayed
exceptions_domain = list((
    'docs.google.com',
    'translate.google.com',
    'paste.awesom.eu',
))

class SimpleUrlBot(IRC):
    """
    IRC bot that answers with the page title of each url posted on a channel.

    Configuration and state live in the module-level 'settings',
    'buffer_url' and 'exceptions_domain' objects.
    """

    # Seconds to wait for an HTTP response; without a timeout a silent
    # server would block the bot forever.
    _http_timeout = 10

    # Pattern used to pick urls out of a channel message (raw string, so the
    # backslashes reach the regex engine untouched).
    _url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def get_final_url(self, url):
        """
        Return the final url (after redirections).

        Performs a GET request; any requests.RequestException raised by it
        propagates to the caller.
        """

        req = requests.get(url, timeout=self._http_timeout)
        return req.url

    def get_title(self, url):
        """
        Return '[redirection path] title' for the page at 'url'.

        Returns an empty string when the request fails, when the advertised
        content-length exceeds settings['content_length_max'], or when the
        page has no usable <title>.
        """

        try:
            # Fetch only the headers (following redirects) so that an
            # oversized body is never downloaded just to be thrown away.
            head = requests.head(url, allow_redirects=True,
                                 timeout=self._http_timeout)
            try:
                if int(head.headers['content-length']) > settings['content_length_max']:
                    return ''
            except (ValueError, KeyError):
                # Missing or malformed content-length: download anyway
                pass
            # Download all
            req = requests.get(url, timeout=self._http_timeout)
        except requests.RequestException:
            # Connection errors, timeouts, invalid urls, too many redirects...
            return ''

        soup = BeautifulSoup(req.content)
        title_tag = soup.title
        if title_tag is None or title_tag.string is None:
            return ''
        title = title_tag.string.replace('\n', '')
        if len(title) > settings['title_len_max']:
            title = title[:settings['title_len_max']] + '...'

        # Compute a compact redirection path: one entry per host, consecutive
        # hops on the same host being collapsed into 'host ×N'.
        hops = [resp.url for resp in req.history] + [req.url]
        hosts = (parse.urlsplit(hop).netloc for hop in hops)
        # groupby() yields iterators, which have no len(): count by iterating
        nodup_hist = ((host, sum(1 for _ in grp)) for host, grp in groupby(hosts))
        path = ' → '.join('%s%s' % (host, ' ×%d' % c if c > 1 else '')
                          for host, c in nodup_hist)
        return '[{}] {}'.format(path, title)

    def slugify(self, value):
        """
        Converts to lowercase, removes non-word characters (anything but
        alphanumerics, underscores, whitespace and hyphens) and converts
        runs of spaces/hyphens to a single hyphen. Also strips leading and
        trailing whitespace. Non-ASCII characters are dropped after NFKD
        normalization.
        """

        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
        return re.sub(r'[-\s]+', '-', value)

    def show_url(self, url, title):
        """
        Tell whether 'title' should be displayed for 'url'.

        Returns True only if:
            - length of title is at least 'title_len_min' (in settings)
            - the domain is not in the exceptions list
            - url does not already contain the title
            - url is not recent (is not in 'buffer_url')
        When True is returned, the final url is appended to 'buffer_url'.
        """

        # Check the min length
        if len(title) < settings['title_len_min']:
            return False
        # Check if not in exceptions list
        parts = parse.urlsplit(url)
        if parts.netloc in exceptions_domain:
            return False
        url_dif = parts.path.split('/')[-1]
        title_dif = self.slugify(title)
        # Check if url does not contain title
        similarity = difflib.SequenceMatcher(None, url_dif, title_dif).ratio()
        if similarity >= settings['ratio']:
            return False
        # Check if is recent
        try:
            url = self.get_final_url(url)
        except requests.RequestException:
            # The url cannot be resolved any more: stay silent rather than
            # let the error escape the message handler.
            return False
        if url in buffer_url:
            return False
        # '>=' so the buffer also shrinks back if it ever grew past the limit
        if len(buffer_url) >= settings['buffer_url_len']:
            del buffer_url[0]
        buffer_url.append(url)
        return True

    def on_ready(self):
        """
        If you want your bot to join a channel when it connects, you should do
        that in the on_ready event handler.
        """

        self.join(BOT_CHAN)

    def on_channel_message(self, umask, target, msg):
        """
        Main event handler, called when someone speaks on a channel where the
        bot is.
        Just displays the title for each url found in 'msg'.
        """

        for url in self._url_re.findall(msg):
            title = self.get_title(url)
            # An empty title means "nothing to show"; test it first so that
            # show_url() is not consulted for nothing.
            if title != '' and self.show_url(url, title):
                self.message(target, title)

    def on_ctcp_version_request(self, umask, value):
        """
        There are event handlers for CTCP too.
        Here the bot replies to a CTCP "version" with its name followed by
        a short credit line.
        """

        self.ctcp_reply(umask.nick,
                        'VERSION',
                        "{}, powered by pypeul and <3".format(BOT_NAME))

    def on_disconnected(self):
        """
        Try to reconnect until an attempt succeeds, waiting 30s longer after
        each failure (30s, 60s, 90s, ...).
        """

        # NOTE(review): 'logger' is not defined in this file; presumably it is
        # provided by 'from pypeul import *' -- confirm.
        logger.info('Disconnected. Trying to reconnect...')
        time_sleep = 30
        while True:
            try:
                self.connect(BOT_SERVER, BOT_PORT)
                self.ident(BOT_NAME)
                self.run()
                break
            except Exception:
                # 'Exception' (not a bare except) so that KeyboardInterrupt
                # and SystemExit can still stop the bot.
                logger.error('Attempt failed. Retrying in {}s...'.format(time_sleep))
            # Sleep for the delay that was just announced, then lengthen it
            time.sleep(time_sleep)
            time_sleep += 30

if __name__ == '__main__':
    import logging

    # Debug-level logging for the whole process
    log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    # Create the bot, connect and identify it, then let it run
    endpoint = (BOT_SERVER, BOT_PORT)
    bot = SimpleUrlBot()
    bot.connect(*endpoint)
    bot.ident(BOT_NAME)
    bot.run()
	#!/usr/bin/env python3
	#-*- encoding: utf-8 -*-

	# ircbot_url_title.py
	# Simple bot which displays urls titles
	#
	# Copyright (c) 2010 Mick@el and Zopieux
	# Copyright (c) 2014 gustavi
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU Lesser General Public License as
	# published by the Free Software Foundation, either version 3 of
	# the License, or (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.

	import time
	from urllib import parse
	import re
	import unicodedata
	import difflib

	import requests
	from pypeul import *
	from bs4 import BeautifulSoup
	from itertools import groupby

	# Identity of the bot and the IRC endpoint it connects to
	BOT_NAME, BOT_CHAN = 'SimpleB00t', '##gustavi'
	BOT_SERVER, BOT_PORT = 'irc.freenode.net', 6667

	# Tunables read by SimpleUrlBot
	settings = dict(
	    # Similarity threshold between the url and the title, compared in
	    # SimpleUrlBot.show_url()
	    ratio=0.6,
	    # Maximum number of urls remembered in 'buffer_url'
	    buffer_url_len=25,
	    # Titles shorter than this are not displayed
	    title_len_min=18,
	    # Titles longer than this are truncated
	    title_len_max=136,
	    # Largest advertised content-length (in bytes) that is still downloaded
	    content_length_max=2097152,
	)

	# Most recently announced urls, oldest first
	buffer_url = list()

	# Domains whose titles are never displayed
	exceptions_domain = list((
	    'docs.google.com',
	    'translate.google.com',
	    'paste.awesom.eu',
	))

	class SimpleUrlBot(IRC):
	    """
	    IRC bot that answers with the page title of each url posted on a channel.

	    Configuration and state live in the module-level 'settings',
	    'buffer_url' and 'exceptions_domain' objects.
	    """

	    # Seconds to wait for an HTTP response; without a timeout a silent
	    # server would block the bot forever.
	    _http_timeout = 10

	    # Pattern used to pick urls out of a channel message (raw string, so the
	    # backslashes reach the regex engine untouched; the alternations are
	    # plain '|', not the escaped '\|' which would match a literal pipe).
	    _url_re = re.compile(
	        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

	    def get_final_url(self, url):
	        """
	        Return the final url (after redirections).

	        Performs a GET request; any requests.RequestException raised by it
	        propagates to the caller.
	        """

	        req = requests.get(url, timeout=self._http_timeout)
	        return req.url

	    def get_title(self, url):
	        """
	        Return '[redirection path] title' for the page at 'url'.

	        Returns an empty string when the request fails, when the advertised
	        content-length exceeds settings['content_length_max'], or when the
	        page has no usable <title>.
	        """

	        try:
	            # Fetch only the headers (following redirects) so that an
	            # oversized body is never downloaded just to be thrown away.
	            head = requests.head(url, allow_redirects=True,
	                                 timeout=self._http_timeout)
	            try:
	                if int(head.headers['content-length']) > settings['content_length_max']:
	                    return ''
	            except (ValueError, KeyError):
	                # Missing or malformed content-length: download anyway
	                pass
	            # Download all
	            req = requests.get(url, timeout=self._http_timeout)
	        except requests.RequestException:
	            # Connection errors, timeouts, invalid urls, too many redirects...
	            return ''

	        soup = BeautifulSoup(req.content)
	        title_tag = soup.title
	        if title_tag is None or title_tag.string is None:
	            return ''
	        title = title_tag.string.replace('\n', '')
	        if len(title) > settings['title_len_max']:
	            title = title[:settings['title_len_max']] + '...'

	        # Compute a compact redirection path: one entry per host, consecutive
	        # hops on the same host being collapsed into 'host ×N'.
	        hops = [resp.url for resp in req.history] + [req.url]
	        hosts = (parse.urlsplit(hop).netloc for hop in hops)
	        # groupby() yields iterators, which have no len(): count by iterating
	        nodup_hist = ((host, sum(1 for _ in grp)) for host, grp in groupby(hosts))
	        path = ' → '.join('%s%s' % (host, ' ×%d' % c if c > 1 else '')
	                          for host, c in nodup_hist)
	        return '[{}] {}'.format(path, title)

	    def slugify(self, value):
	        """
	        Converts to lowercase, removes non-word characters (anything but
	        alphanumerics, underscores, whitespace and hyphens) and converts
	        runs of spaces/hyphens to a single hyphen. Also strips leading and
	        trailing whitespace. Non-ASCII characters are dropped after NFKD
	        normalization.
	        """

	        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
	        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
	        return re.sub(r'[-\s]+', '-', value)

	    def show_url(self, url, title):
	        """
	        Tell whether 'title' should be displayed for 'url'.

	        Returns True only if:
	            - length of title is at least 'title_len_min' (in settings)
	            - the domain is not in the exceptions list
	            - url does not already contain the title
	            - url is not recent (is not in 'buffer_url')
	        When True is returned, the final url is appended to 'buffer_url'.
	        """

	        # Check the min length
	        if len(title) < settings['title_len_min']:
	            return False
	        # Check if not in exceptions list
	        parts = parse.urlsplit(url)
	        if parts.netloc in exceptions_domain:
	            return False
	        url_dif = parts.path.split('/')[-1]
	        title_dif = self.slugify(title)
	        # Check if url does not contain title
	        similarity = difflib.SequenceMatcher(None, url_dif, title_dif).ratio()
	        if similarity >= settings['ratio']:
	            return False
	        # Check if is recent
	        try:
	            url = self.get_final_url(url)
	        except requests.RequestException:
	            # The url cannot be resolved any more: stay silent rather than
	            # let the error escape the message handler.
	            return False
	        if url in buffer_url:
	            return False
	        # '>=' so the buffer also shrinks back if it ever grew past the limit
	        if len(buffer_url) >= settings['buffer_url_len']:
	            del buffer_url[0]
	        buffer_url.append(url)
	        return True

	    def on_ready(self):
	        """
	        If you want your bot to join a channel when it connects, you should do
	        that in the on_ready event handler.
	        """

	        self.join(BOT_CHAN)

	    def on_channel_message(self, umask, target, msg):
	        """
	        Main event handler, called when someone speaks on a channel where the
	        bot is.
	        Just displays the title for each url found in 'msg'.
	        """

	        for url in self._url_re.findall(msg):
	            title = self.get_title(url)
	            # An empty title means "nothing to show"; test it first so that
	            # show_url() is not consulted for nothing.
	            if title != '' and self.show_url(url, title):
	                self.message(target, title)

	    def on_ctcp_version_request(self, umask, value):
	        """
	        There are event handlers for CTCP too.
	        Here the bot replies to a CTCP "version" with its name followed by
	        a short credit line.
	        """

	        self.ctcp_reply(umask.nick,
	                        'VERSION',
	                        "{}, powered by pypeul and <3".format(BOT_NAME))

	    def on_disconnected(self):
	        """
	        Try to reconnect until an attempt succeeds, waiting 30s longer after
	        each failure (30s, 60s, 90s, ...).
	        """

	        # NOTE(review): 'logger' is not defined in this file; presumably it is
	        # provided by 'from pypeul import *' -- confirm.
	        logger.info('Disconnected. Trying to reconnect...')
	        time_sleep = 30
	        while True:
	            try:
	                self.connect(BOT_SERVER, BOT_PORT)
	                self.ident(BOT_NAME)
	                self.run()
	                break
	            except Exception:
	                # 'Exception' (not a bare except) so that KeyboardInterrupt
	                # and SystemExit can still stop the bot.
	                logger.error('Attempt failed. Retrying in {}s...'.format(time_sleep))
	            # Sleep for the delay that was just announced, then lengthen it
	            time.sleep(time_sleep)
	            time_sleep += 30

	if __name__ == '__main__':
	    import logging

	    # Debug-level logging for the whole process
	    log_level = logging.DEBUG
	    logging.basicConfig(level=log_level)

	    # Create the bot, connect and identify it, then let it run
	    endpoint = (BOT_SERVER, BOT_PORT)
	    bot = SimpleUrlBot()
	    bot.connect(*endpoint)
	    bot.ident(BOT_NAME)
	    bot.run()