# lanmaster53/whey-cewler.py

## whey-cewler.py
'''
Based on the initial work of Digininja at https://github.com/digininja/CeWL. While CeWL is a script written
in Ruby that requires an independent crawl of a website in order to build a custom wordlist, Whey CeWLer
runs within Portswigger's Burp Suite and parses an already crawled sitemap to build a custom wordlist. It
does not have the meta data parsing capabilities that CeWL does, but it more than makes up for it in
convenience.

The name gets its origins from the CeWLer portion of the CO2 Burp extension by Jason Gillam, which is written
in Java and does something similar, but Whey CeWLer is a completely reimagined extension written in Python,
making it "way cooler".

Usage:
1. Point Burp Suite to Jython in the Extender > Options tab.
2. Install this extension manually in the Extender > Extensions tab.
3. Select an option for extension output (File, Console or UI).
4. Right-click on any element in the Target tab's hierarchical sitemap.
5. Select the Extensions > Create Wordlist context menu item.

The wordlist is written to whichever output (File, Console or UI) the extension is configured to use.
'''

from burp import IBurpExtender
from burp import IContextMenuFactory
from javax.swing import JMenuItem
from java.util import ArrayList, List
from HTMLParser import HTMLParser
from datetime import datetime
import re

# seed entries the extension's word list set is initialized with
COMMON_PASSWORDS = ['password']
# content types (compared lower-cased, without parameters) whose response bodies are scraped for words
TEXT_CONTENT_TYPES = ['text/html', 'application/xml', 'application/json', 'text/plain']

# helpful resource
# https://github.com/laconicwolf/burp-extensions/blob/master/GenerateForcedBrowseWordlist.py

class TagStripper(HTMLParser):
    '''
    Attempts to strip all tags from an HTML page received in the HTTP response. The remaining text
    (character data and comment bodies) is appended to an array in document order and then joined
    with " " for regex parsing.
    '''

    def __init__(self):
        HTMLParser.__init__(self)
        # text fragments collected by the handlers for the page currently being stripped
        self.page_text = []

    def handle_data(self, data):
        '''
        Collects a run of character data reported by the parser.
        '''
        self.page_text.append(data)

    def handle_comment(self, data):
        '''
        Collects the body of a comment alongside the regular text.
        '''
        self.page_text.append(data)

    def strip(self, html_page):
        '''
        Feeds html_page to the parser and returns the collected text fragments joined with " ".

        The parser state and the fragment list are cleared first, so calling strip() again on the
        same instance returns only the text of the new page instead of the text of every page
        stripped so far.
        '''
        self.reset()
        self.page_text = []
        self.feed(html_page)
        return " ".join(self.page_text)


class BurpExtender(IBurpExtender, IContextMenuFactory):
    '''
    BurpExtender Class as per Reference API. Offers a "Create Wordlist" item in the context menu
    of the Target tab's site map tree and, when it is selected, scrapes the stored responses for
    words and prints the resulting word list.
    '''

    def registerExtenderCallbacks(self, callbacks):
        '''
        Registers the extension and initializes the root URLs and word list sets.

        Both sets live on the instance and are never cleared, so roots and words accumulate
        across uses of the menu item for as long as the extension stays loaded.
        '''
        self._callbacks = callbacks
        self._helpers = callbacks.getHelpers()
        self.context = None
        self.roots = set()
        self.word_list = set(COMMON_PASSWORDS)
        callbacks.setExtensionName("Whey CeWLer")
        callbacks.registerContextMenuFactory(self)
        return

    def createMenuItems(self, context):
        '''
        Builds the "Create Wordlist" menu item.

        The invocation is remembered on the instance so menu_action can read the selection later.
        Returns a list holding the menu item when the menu was opened on the site map tree, and
        None for any other invocation context.
        '''
        self.context = context
        if context.getInvocationContext() != context.CONTEXT_TARGET_SITE_MAP_TREE:
            return None
        menu_list = ArrayList()
        menu_list.add(JMenuItem("Create Wordlist", actionPerformed=self.menu_action))
        return menu_list

    def menu_action(self, event):
        '''
        Obtains the selected messages from the interface. Filters the sitemap for all messages
        containing URLs within the selected messages' hierarchy. Every matching message that has
        a response is analyzed to extend the word list, which is then displayed.

        event is the Swing action event of the menu item; it is not used.
        '''
        # get all first-level selected messages and store the URLs as roots to filter the sitemap
        # (the selection can be null when no messages apply, hence the fallback to an empty list)
        for http_message in self.context.getSelectedMessages() or []:
            self.roots.add(str(http_message.getUrl()))
        # startswith accepts a tuple of prefixes, so an entry is scraped once even when it falls
        # under several roots; an empty tuple never matches
        roots = tuple(self.roots)
        # get all sitemap entries associated with the selected messages and scrape them for words
        for http_message in self._callbacks.getSiteMap(None):
            url = http_message.getUrl().toString()
            # will scrape the same URL multiple times if the site map has stored multiple instances
            # the site map stores multiple instances if it detects differences, so this is desirable
            if not url.startswith(roots):
                continue
            # only scrape if there is a response to scrape
            http_response = http_message.getResponse()
            if http_response:
                self.get_words(url, http_response)
        self.display_words()
        return

    def get_words(self, url, http_response):
        '''
        Checks the headers for a text-based content type. If no Content-Type header names a type
        outside TEXT_CONTENT_TYPES, uses the TagStripper class to parse out the text and runs a
        regex to collect word candidates. Candidates that still have at least 3 characters once
        their apostrophes are removed are added to the word_list set.

        url is accepted for the caller's convenience and is not used. http_response is the raw
        response as handed out by the site map entry.
        '''
        response = self._helpers.analyzeResponse(http_response)
        # the first entry is skipped (presumably the status line, which is not "name: value")
        for header in response.getHeaders()[1:]:
            name, separator, value = header.partition(':')
            if not separator:
                # malformed header line without a colon; nothing to inspect
                continue
            if name.strip().lower() == 'content-type':
                content_type = value.split(';')[0].strip()
                if content_type.lower() not in TEXT_CONTENT_TYPES:
                    return
        # decode the body only after the content type check so binary responses cost nothing
        body = self._helpers.bytesToString(http_response[response.getBodyOffset():])
        page_text = TagStripper().strip(body)
        # alpha numerics and apostrophes
        # at least 3 characters in length
        for candidate in re.findall(r"[\w']{3,}", page_text):
            # strip apostrophes
            word = candidate.replace("'", "")
            # stripping can shorten a candidate below the minimum length (or empty it), e.g. "'''"
            if len(word) >= 3:
                self.word_list.add(word)
        return

    def display_words(self):
        '''
        Displays the sorted word list, one word per line, to whatever Burp is configured for stdout.
        '''
        for word in sorted(self.word_list):
            # parenthesized single argument prints identically under Python 2 / Jython
            print(word)
        return
	'''
	Based on the initial work of Digininja at https://github.com/digininja/CeWL. While CeWL is a script written
	in Ruby that requires an independent crawl of a website in order to build a custom wordlist, Whey CeWLer
	runs within Portswigger's Burp Suite and parses an already crawled sitemap to build a custom wordlist. It
	does not have the meta data parsing capabilities that CeWL does, but it more than makes up for it in
	convenience.

	The name gets its origins from the CeWLer portion of the CO2 Burp extension by Jason Gillam, which is written
	in Java and does something similar, but Whey CeWLer is a completely reimagined extension written in Python,
	making it "way cooler".

	Usage:
	1. Point Burp Suite to Jython in the Extender > Options tab.
	2. Install this extension manually in the Extender > Extensions tab.
	3. Select an option for extension output (File, Console or UI).
	4. Right-click on any element in the Target tab's hierarchical sitemap.
	5. Select the Extensions > Create Wordlist context menu item.

	The wordlist is written to whichever output (File, Console or UI) the extension is configured to use.
	'''

	from burp import IBurpExtender
	from burp import IContextMenuFactory
	from javax.swing import JMenuItem
	from java.util import ArrayList, List
	from HTMLParser import HTMLParser
	from datetime import datetime
	import re

	# seed entries the extension's word list set is initialized with
	COMMON_PASSWORDS = ['password']
	# content types (compared lower-cased, without parameters) whose response bodies are scraped for words
	TEXT_CONTENT_TYPES = ['text/html', 'application/xml', 'application/json', 'text/plain']

	# helpful resource
	# https://github.com/laconicwolf/burp-extensions/blob/master/GenerateForcedBrowseWordlist.py

	class TagStripper(HTMLParser):
		'''
		Attempts to strip all tags from an HTML page received in the HTTP response. The remaining text
		(character data and comment bodies) is appended to an array in document order and then joined
		with " " for regex parsing.
		'''

		def __init__(self):
			HTMLParser.__init__(self)
			# text fragments collected by the handlers for the page currently being stripped
			self.page_text = []

		def handle_data(self, data):
			'''
			Collects a run of character data reported by the parser.
			'''
			self.page_text.append(data)

		def handle_comment(self, data):
			'''
			Collects the body of a comment alongside the regular text.
			'''
			self.page_text.append(data)

		def strip(self, html_page):
			'''
			Feeds html_page to the parser and returns the collected text fragments joined with " ".

			The parser state and the fragment list are cleared first, so calling strip() again on the
			same instance returns only the text of the new page instead of the text of every page
			stripped so far.
			'''
			self.reset()
			self.page_text = []
			self.feed(html_page)
			return " ".join(self.page_text)


	class BurpExtender(IBurpExtender, IContextMenuFactory):
		'''
		BurpExtender Class as per Reference API. Offers a "Create Wordlist" item in the context menu
		of the Target tab's site map tree and, when it is selected, scrapes the stored responses for
		words and prints the resulting word list.
		'''

		def registerExtenderCallbacks(self, callbacks):
			'''
			Registers the extension and initializes the root URLs and word list sets.

			Both sets live on the instance and are never cleared, so roots and words accumulate
			across uses of the menu item for as long as the extension stays loaded.
			'''
			self._callbacks = callbacks
			self._helpers = callbacks.getHelpers()
			self.context = None
			self.roots = set()
			self.word_list = set(COMMON_PASSWORDS)
			callbacks.setExtensionName("Whey CeWLer")
			callbacks.registerContextMenuFactory(self)
			return

		def createMenuItems(self, context):
			'''
			Builds the "Create Wordlist" menu item.

			The invocation is remembered on the instance so menu_action can read the selection later.
			Returns a list holding the menu item when the menu was opened on the site map tree, and
			None for any other invocation context.
			'''
			self.context = context
			if context.getInvocationContext() != context.CONTEXT_TARGET_SITE_MAP_TREE:
				return None
			menu_list = ArrayList()
			menu_list.add(JMenuItem("Create Wordlist", actionPerformed=self.menu_action))
			return menu_list

		def menu_action(self, event):
			'''
			Obtains the selected messages from the interface. Filters the sitemap for all messages
			containing URLs within the selected messages' hierarchy. Every matching message that has
			a response is analyzed to extend the word list, which is then displayed.

			event is the Swing action event of the menu item; it is not used.
			'''
			# get all first-level selected messages and store the URLs as roots to filter the sitemap
			# (the selection can be null when no messages apply, hence the fallback to an empty list)
			for http_message in self.context.getSelectedMessages() or []:
				self.roots.add(str(http_message.getUrl()))
			# startswith accepts a tuple of prefixes, so an entry is scraped once even when it falls
			# under several roots; an empty tuple never matches
			roots = tuple(self.roots)
			# get all sitemap entries associated with the selected messages and scrape them for words
			for http_message in self._callbacks.getSiteMap(None):
				url = http_message.getUrl().toString()
				# will scrape the same URL multiple times if the site map has stored multiple instances
				# the site map stores multiple instances if it detects differences, so this is desirable
				if not url.startswith(roots):
					continue
				# only scrape if there is a response to scrape
				http_response = http_message.getResponse()
				if http_response:
					self.get_words(url, http_response)
			self.display_words()
			return

		def get_words(self, url, http_response):
			'''
			Checks the headers for a text-based content type. If no Content-Type header names a type
			outside TEXT_CONTENT_TYPES, uses the TagStripper class to parse out the text and runs a
			regex to collect word candidates. Candidates that still have at least 3 characters once
			their apostrophes are removed are added to the word_list set.

			url is accepted for the caller's convenience and is not used. http_response is the raw
			response as handed out by the site map entry.
			'''
			response = self._helpers.analyzeResponse(http_response)
			# the first entry is skipped (presumably the status line, which is not "name: value")
			for header in response.getHeaders()[1:]:
				name, separator, value = header.partition(':')
				if not separator:
					# malformed header line without a colon; nothing to inspect
					continue
				if name.strip().lower() == 'content-type':
					content_type = value.split(';')[0].strip()
					if content_type.lower() not in TEXT_CONTENT_TYPES:
						return
			# decode the body only after the content type check so binary responses cost nothing
			body = self._helpers.bytesToString(http_response[response.getBodyOffset():])
			page_text = TagStripper().strip(body)
			# alpha numerics and apostrophes
			# at least 3 characters in length
			for candidate in re.findall(r"[\w']{3,}", page_text):
				# strip apostrophes
				word = candidate.replace("'", "")
				# stripping can shorten a candidate below the minimum length (or empty it), e.g. "'''"
				if len(word) >= 3:
					self.word_list.add(word)
			return

		def display_words(self):
			'''
			Displays the sorted word list, one word per line, to whatever Burp is configured for stdout.
			'''
			for word in sorted(self.word_list):
				# parenthesized single argument prints identically under Python 2 / Jython
				print(word)
			return