# Last active February 4, 2016 18:23
# A phenny/jenni IRC bot module that searches the Kaltura Knowledge Center and posts the results from the first search results page to the IRC chat. Join us in the #kaltura IRC channel.
#!/usr/bin/env python
"""
jenni's Kaltura Knowledge Center Search Module
Author: Zohar Babin
Copyright Kaltura Inc.
Licensed under AGPL v3.

More info:
Returns the list of search results from the first page of the Kaltura Knowledge Center (http://knowledge.kaltura.com).
 * jenni: https://github.com/myano/jenni/
"""
import re
import mechanize
import sys
import string
from bs4 import BeautifulSoup
import bitly_api
import urllib
# Base address of the Kaltura Knowledge Center. Result links on the search
# page are treated as site-relative and are prefixed with it.
# NOTE(review): the URL literals were missing from the code under review and
# have been reconstructed -- confirm the base URL and the search path against
# the live site before deploying.
_KC_BASE_URL = "http://knowledge.kaltura.com"
_KC_SEARCH_URL = _KC_BASE_URL + "/search/site/"

# Browser-like User-Agent sent with the search request (this is cheating, ok?).
_KC_USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')

# Collapses every run of non-word characters in a result title to one space.
_KC_NON_WORD = re.compile(r'\W+')


def kc(jenni, input):
    """Search the Kaltura Knowledge Center and reply with the first page of results.

    The search results page is fetched with mechanize, parsed with
    BeautifulSoup, and every result link found under an <h2> inside a
    "search-results" div is sent back as "<n>) <title> - <short url>".

    Written for Python 2 (relies on ``urllib.quote``).

    Args:
        jenni: the bot object; only its ``reply(text)`` method is used.
        input: the triggering message; ``input.group(2)`` is read as the
            search text typed after the command.

    Returns:
        The value of ``jenni.reply(...)`` on the early-exit paths (no search
        term, non-HTML response, no results); otherwise None.
    """
    # phenny keeps the command arguments in input.group(2); input.group(1)
    # would be the command itself (kc in our case). It is empty when the
    # command is sent without a search term.
    query = input.group(2)
    if not query or not query.strip():
        return jenni.reply("Please tell me what to search for, e.g.: .kc [any kaltura search term]")
    input_txt = urllib.quote(query.strip().encode('utf-8'))

    # We use bitly to shorten the URLs. This is not a must, but makes the IRC
    # messages cleaner. Put your own bitly API access token here.
    bitly = bitly_api.Connection(access_token="REPLACE_THIS_WITH_YOUR_BITLY_TOKEN")

    # Give the user some feedback while we go searching...
    jenni.reply("Searching the Kaltura Knowledge Center (" + _KC_BASE_URL + ") for: " + input_txt)
    jenni.reply("If I find any results for your search, I will return all the results from the first page.")

    # We use mechanize for fetching the search results page.
    br = mechanize.Browser()
    # Follows refresh 0 but does not hang on refresh > 0.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', _KC_USER_AGENT)]
    br.open(_KC_SEARCH_URL + input_txt)

    # An explicit check instead of `assert`: asserts vanish under `python -O`
    # and a non-HTML response should be reported, not crash the handler.
    if not br.viewing_html():
        return jenni.reply("I couldn't read the search results page for " + input_txt)

    html = br.response().read()
    soup = BeautifulSoup(html, "lxml")
    search_div = soup.findAll("div", class_="search-results")

    # If we couldn't find any results container for the desired terms:
    if not search_div:
        return jenni.reply("I couldn't find any results for your search of " + input_txt)

    # Results were found: every search result in the KC is under
    # <article><h2>, so walk all h2 headers in the search-results div(s) and
    # report the links they contain.
    found = 0
    for div in search_div:
        for header in div.findAll("h2"):
            for anchor in header.findAll("a"):
                # Every h2 has two links: one toggling the description box
                # (href "#") and one pointing at the result page. Skip the
                # toggle, and any anchor that carries no href at all.
                href = anchor.get("href")
                if not href or href == "#":
                    continue

                # Clean up the title; fall back to the link text when the
                # anchor has no title attribute.
                raw_title = anchor.get("title") or anchor.get_text()
                ascii_title = raw_title.encode('ascii', 'ignore')
                title_txt = _KC_NON_WORD.sub(' ', ascii_title.strip())

                # Build the full result URL (hrefs that are already absolute
                # are used as-is) and shorten it.
                href = str(href)
                if href.startswith(("http://", "https://")):
                    result_url = href
                else:
                    result_url = _KC_BASE_URL + href
                short = bitly.shorten(result_url)

                # Print a numbered result to the user.
                found += 1
                jenni.reply(str(found) + ") " + title_txt + " - " + short["url"])

    # The results container existed but held no usable links.
    if found == 0:
        return jenni.reply("I couldn't find any results for your search of " + input_txt)
# Module metadata attached to the kc handler function.
# Example text used for the help description:
kc.example = '.kc [any kaltura search term]'
# How important this module should be treated among other modules
# (low/medium/high):
kc.priority = 'high'
# The command word(s) used to call this module:
kc.commands = ['kc']
if __name__ == '__main__':
    # Running the module directly just prints its docstring. The
    # parenthesised single-argument print works on Python 2 and 3 alike,
    # and the `or ''` guards against a missing module docstring.
    print((__doc__ or '').strip())
