Skip to content

Instantly share code, notes, and snippets.

@zopieux
Forked from gustavi/ircbot_url_title.py
Last active August 29, 2015 13:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zopieux/8769810 to your computer and use it in GitHub Desktop.
Save zopieux/8769810 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#-*- encoding: utf-8 -*-
# ircbot_url_title.py
# Simple IRC bot that displays the titles of URLs posted on a channel
#
# Copyright (c) 2010 Mick@el and Zopieux
# Copyright (c) 2014 gustavi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import time
from urllib import parse
import re
import unicodedata
import difflib
import requests
from pypeul import *
from bs4 import BeautifulSoup
from itertools import groupby
# Identity and connection parameters of the bot
BOT_NAME = 'SimpleB00t'
BOT_CHAN = '##gustavi'
BOT_SERVER = 'irc.freenode.net'
BOT_PORT = 6667

# Tunable thresholds read by SimpleUrlBot
settings = dict(
    # Similarity threshold between the url and the title; compared against
    # in SimpleUrlBot.show_url() to decide whether the title is displayed
    ratio=0.6,
    # Maximum number of urls remembered in 'buffer_url'
    buffer_url_len=25,
    # Bounds (in characters) applied to titles
    title_len_min=18,
    title_len_max=136,
    # Largest advertised Content-Length (in bytes, 2 MiB) that is downloaded
    content_length_max=2 * 1024 * 1024,
)

# Holds the most recent urls
buffer_url = list()

# Domains for which no title is displayed
exceptions_domain = [
    'docs.google.com',
    'translate.google.com',
    'paste.awesom.eu',
]
class SimpleUrlBot(IRC):
    """
    IRC bot that announces the page title of urls posted on a channel.

    Behaviour is driven by the module-level 'settings', 'buffer_url' and
    'exceptions_domain' objects and by the BOT_* connection constants.
    """

    # Seconds to wait for a single HTTP request before giving up, so that a
    # stalled web server cannot block the bot forever.
    http_timeout = 10

    # Pattern used to extract http(s) urls from a channel message.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def get_final_url(self, url):
        """
        Return the final url (after redirections).

        Raises requests.RequestException if the request fails.
        """
        # stream=True: only the url is needed, so the body is not downloaded.
        req = requests.get(url, stream=True, timeout=self.http_timeout)
        try:
            return req.url
        finally:
            req.close()

    def get_title(self, url):
        """
        Get the url title, prefixed with a compact redirection path, e.g.
        '[short.example → example.org] Some title'.

        Return an empty string if the request fails, if the advertised
        content-length exceeds settings['content_length_max'], or if the
        page has no usable <title>.
        """
        try:
            # Download only the header to check content-length
            head = requests.head(self.get_final_url(url),
                                 timeout=self.http_timeout)
            try:
                if int(head.headers['content-length']) > settings['content_length_max']:
                    return ''
            except (ValueError, KeyError):
                # Missing or malformed content-length: download anyway
                pass
            # Download all
            req = requests.get(url, timeout=self.http_timeout)
        except requests.RequestException:
            # Covers connection errors, timeouts, invalid urls, ...
            return ''

        try:
            soup = BeautifulSoup(req.content, 'html.parser')
            # soup.title or soup.title.string may be None -> AttributeError
            title = soup.title.string.replace('\n', '')
        except AttributeError:
            return ''
        if len(title) > settings['title_len_max']:
            title = title[:settings['title_len_max']] + '...'

        # Compute a compact redirection path: req.history holds the
        # intermediate responses, req.url is the final url.
        hops = [resp.url for resp in req.history] + [req.url]
        hosts = (parse.urlsplit(hop).netloc for hop in hops)
        # groupby yields iterators, which have no len(): count them instead
        nodup_hist = ((host, sum(1 for _ in group))
                      for host, group in groupby(hosts))
        path = ' → '.join(
            '%s%s' % (host, ' ×%d' % count if count > 1 else '')
            for host, count in nodup_hist)
        return '[{}] {}'.format(path, title)

    def slugify(self, value):
        """
        Converts to lowercase, removes characters that are not word
        characters (alphanumerics and underscores), whitespace or hyphens,
        and converts runs of spaces/hyphens to a single hyphen. Also strips
        leading and trailing whitespace. Non-ASCII characters are dropped
        after NFKD normalization.
        """
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
        return re.sub(r'[-\s]+', '-', value)

    def show_url(self, url, title):
        """
        Return True (and remember the url in 'buffer_url') only if:
        - length of title is at least 'title_len_min' (in settings)
        - domain is not in the exceptions list
        - url does not already contain the title
        - url is not recent (is not in 'buffer_url')
        """
        # Check the min length
        if len(title) < settings['title_len_min']:
            return False
        split_url = parse.urlsplit(url)
        # Check if not in exceptions list
        if split_url.netloc in exceptions_domain:
            return False
        url_dif = split_url.path.split('/')[-1]
        title_dif = self.slugify(title)
        # Check if url does not contain title
        if difflib.SequenceMatcher(None, url_dif, title_dif).ratio() >= settings['ratio']:
            return False
        # Check if is recent, comparing urls after redirections
        try:
            url = self.get_final_url(url)
        except requests.RequestException:
            # Cannot resolve redirections: fall back to the url as posted
            pass
        if url in buffer_url:
            return False
        # Drop the oldest entries so the buffer never exceeds its max size
        while len(buffer_url) >= settings['buffer_url_len']:
            del buffer_url[0]
        buffer_url.append(url)
        return True

    def on_ready(self):
        """
        If you want your bot to join a channel when it connects, you should do
        that in the on_ready event handler.
        """
        self.join(BOT_CHAN)

    def on_channel_message(self, umask, target, msg):
        """
        Main event handler, called when someone speaks on a channel where the
        bot is.
        Just displays the title for each url.
        """
        for url in self.url_pattern.findall(msg):
            title = self.get_title(url)
            if title and self.show_url(url, title):
                self.message(target, title)

    def on_ctcp_version_request(self, umask, value):
        """
        There are event handlers for CTCP too.
        Here the bot replies with a string built from BOT_NAME on a CTCP
        "version".
        """
        self.ctcp_reply(umask.nick,
                        'VERSION',
                        "{}, powered by pypeul and <3".format(BOT_NAME))

    def on_disconnected(self):
        """
        Try to reconnect until it succeeds, waiting 30s longer after each
        failed attempt.
        """
        # 'logger' is not defined anywhere in this file (only the star import
        # could supply it), so fetch a logger explicitly.
        import logging
        log = logging.getLogger(__name__)

        log.info('Disconnected. Trying to reconnect...')
        time_sleep = 30
        while True:
            try:
                self.connect(BOT_SERVER, BOT_PORT)
                self.ident(BOT_NAME)
                self.run()
                break
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must still
                # be able to stop the bot.
                log.exception('Attempt failed. Retrying in %ss...', time_sleep)
                # Sleep for the announced delay, then lengthen the next one
                time.sleep(time_sleep)
                time_sleep += 30
if __name__ == '__main__':
    import logging

    # Turn on debug-level logging for the whole process
    logging.basicConfig(level=logging.DEBUG)

    # Create the bot, connect to the configured server, identify under the
    # configured name, then hand control over to the bot's run() loop
    bot = SimpleUrlBot()
    bot.connect(BOT_SERVER, BOT_PORT)
    bot.ident(BOT_NAME)
    bot.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment