Skip to content

Instantly share code, notes, and snippets.

@decidedlygray
Created January 19, 2018 17:41
Show Gist options
  • Save decidedlygray/f0727a63b7f68aae41155b0c90232d59 to your computer and use it in GitHub Desktop.
Save decidedlygray/f0727a63b7f68aae41155b0c90232d59 to your computer and use it in GitHub Desktop.
Debugging xsscrapy.py - Issue #36
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
import HTMLParser
from xsscrapy.items import vuln#, inj_resp
import re
import lxml.etree
import lxml.html
from lxml.html import soupparser, fromstring
import itertools
#from IPython import embed
from socket import gaierror, gethostbyname
from urlparse import urlparse
from logging import CRITICAL, ERROR, WARNING, INFO, DEBUG
class XSSCharFinder(object):
def __init__(self):
self.url_param_xss_items = []
def get_filename(self, url):
filename = 'xsscrapy-vulns.txt'
up = urlparse(url).netloc.replace('www.', '').split(':')[0]
if up:
filename = up + '.txt'
return filename
def open_spider(self, spider):
self.filename = self.get_filename(spider.url)
def process_item(self, item, spider):
response = item['resp']
meta = response.meta
payload = meta['payload']
delim = meta['delim']
param = meta['xss_param']
resp_url = response.url
body = response.body
mismatch = False
error = None
fuzz_payload = payload.replace(delim, '').replace(';9', '') # xss char payload
# Regex: ( ) mean group 1 is within the parens, . means any char,
# {1,80} means match any char 0 to 80 times, 80 chosen because double URL encoding
# ? makes the search nongreedy so it stops after hitting its limits
full_match = '%s.{0,80}?%s' % (delim, delim)
# matches with semicolon which sometimes cuts results off
sc_full_match = '%s.{0,80}?%s;9' % (delim, delim)
# Quick sqli check based on DSSS
dbms, regex = self.sqli_check(body, meta['orig_body'])
if dbms:
msg = 'Possible SQL injection error! Suspected DBMS: %s, regex used: %s' % (dbms, regex)
item = self.make_item(meta, resp_url, msg, 'N/A', None)
self.write_to_file(item, spider)
item = None
# Check for script tags pointing to a no longer registered domain
unclaimedURL = self.unclaimedURL_check(body)
if unclaimedURL:
msg = 'Found non-registered domain in script tag! Non-registered URL: %s' % unclaimedURL
with open(self.filename, 'a+') as f:
f.write('\n')
f.write('URL: '+resp_url+'\n')
spider.log(' URL: '+resp_url+'\n', level=INFO)
f.write(msg+'\n')
spider.log(' '+msg+'\n', level=INFO)
# Now that we've checked for SQLi, we can lowercase the body
body = body.lower()
# XSS detection starts here
re_matches = sorted([(m.start(), m.group(), m.end()) for m in re.finditer(full_match, body)])
if len(re_matches) > 0:
#scolon_matches = sorted([(m.start(), m.group()) for m in re.finditer(sc_full_match, body)])
# If regex finds anything, get lxml matches
lxml_injs = self.get_lxml_matches(full_match, body, resp_url, delim)
if lxml_injs:
err = None
if len(re_matches) != len(lxml_injs):
spider.log('Error: mismatch in injections found by lxml and regex; higher chance of false positive for %s' % resp_url)
error = 'Error: mismatch in injections found by lxml and regex; higher chance of false positive'
mismatch = True
inj_data = self.combine_regex_lxml(lxml_injs, re_matches, body, mismatch, payload, delim)
# If mismatch is True, then "for offset in sorted(inj_data)" will fail with TypeError
try:
for offset in sorted(inj_data):
########## XSS LOGIC #############
item = self.xss_logic(inj_data[offset], meta, resp_url, error)
if item:
item = self.url_item_filtering(item, spider)
if mismatch:
item['error'] = 'Mismatch: html parsed vs regex parsed injections, %d vs %d. Higher chance of false positive' % (len(injections), len(full_matches))
self.write_to_file(item, spider)
return item
except TypeError:
if mismatch:
return
else:
raise
# No lxml_match_data
else:
spider.log('Error: regex found injection, but lxml did not')
# Catch all test payload chars no delims
# Catches DB errors
pl_lines_found = self.payloaded_lines(body, fuzz_payload)
if pl_lines_found:
item = self.make_item(meta, resp_url, pl_lines_found, fuzz_payload, None)
if item:
item = self.url_item_filtering(item, spider)
item['error'] = 'Payload delims do not surround this injection point. Found via search for entire payload.'
self.write_to_file(item, spider)
return item
raise DropItem('No XSS vulns in %s. type = %s, %s' % (resp_url, meta['xss_place'], meta['xss_param']))
def sqli_check(self, body, orig_body):
''' Do a quick lookup in the response body for SQL errors. Both w3af's and DSSS.py's methods are in here
but sectoolsmarket.com shows DSSS as having better detection rates so using theirs '''
# Taken from Damn Small SQLi Scanner
DBMS_ERRORS = {"MySQL": (r"SQL syntax.*MySQL", r"Warning.*mysql_.*", r"valid MySQL result", r"MySqlClient\."),
"PostgreSQL": (r"PostgreSQL.*ERROR", r"Warning.*\Wpg_.*", r"valid PostgreSQL result", r"Npgsql\."),
"Microsoft SQL Server": (r"Driver.* SQL[\-\_\ ]*Server", r"OLE DB.* SQL Server", r"(\W|\A)SQL Server.*Driver", r"Warning.*mssql_.*",
r"(\W|\A)SQL Server.*[0-9a-fA-F]{8}", r"(?s)Exception.*\WSystem\.Data\.SqlClient\.", r"(?s)Exception.*\WRoadhouse\.Cms\."),
"Microsoft Access": (r"Microsoft Access Driver", r"JET Database Engine", r"Access Database Engine"),
"Oracle": (r"ORA-[0-9][0-9][0-9][0-9]", r"Oracle error", r"Oracle.*Driver", r"Warning.*\Woci_.*", r"Warning.*\Wora_.*")}
for (dbms, regex) in ((dbms, regex) for dbms in DBMS_ERRORS for regex in DBMS_ERRORS[dbms]):
if re.search(regex, body, re.I) and not re.search(regex, orig_body, re.I):
return (dbms, regex)
return None, None
# Taken from w3af
#SQL_errors = ("System.Data.OleDb.OleDbException",
# "[SQL Server]",
# "[Microsoft][ODBC SQL Server Driver]",
# "[SQLServer JDBC Driver]",
# "[SqlException",
# "System.Data.SqlClient.SqlException",
# "Unclosed quotation mark after the character string",
# "'80040e14'",
# "mssql_query()",
# "odbc_exec()",
# "Microsoft OLE DB Provider for ODBC Drivers",
# "Microsoft OLE DB Provider for SQL Server",
# "Incorrect syntax near",
# "Sintaxis incorrecta cerca de",
# "Syntax error in string in query expression",
# "ADODB.Field (0x800A0BCD)<br>",
# "ADODB.Recordset'",
# "Unclosed quotation mark before the character string",
# "'80040e07'",
# "Microsoft SQL Native Client error",
# "SQLCODE",
# "DB2 SQL error:",
# "SQLSTATE",
# "[CLI Driver]",
# "[DB2/6000]",
# "Sybase message:",
# "Sybase Driver",
# "[SYBASE]",
# "Syntax error in query expression",
# "Data type mismatch in criteria expression.",
# "Microsoft JET Database Engine",
# "[Microsoft][ODBC Microsoft Access Driver]",
# "Microsoft OLE DB Provider for Oracle",
# "wrong number or types",
# "PostgreSQL query failed:",
# "supplied argument is not a valid PostgreSQL result",
# "unterminated quoted string at or near",
# "pg_query() [:",
# "pg_exec() [:",
# "supplied argument is not a valid MySQL",
# "Column count doesn\'t match value count at row",
# "mysql_fetch_array()",
# "mysql_",
# "on MySQL result index",
# "You have an error in your SQL syntax;",
# "You have an error in your SQL syntax near",
# "MySQL server version for the right syntax to use",
# "Division by zero in",
# "not a valid MySQL result",
# "[MySQL][ODBC",
# "Column count doesn't match",
# "the used select statements have different number of columns",
# "DBD::mysql::st execute failed",
# "DBD::mysql::db do failed:",
# "com.informix.jdbc",
# "Dynamic Page Generation Error:",
# "An illegal character has been found in the statement",
# "[Informix]",
# "<b>Warning</b>: ibase_",
# "Dynamic SQL Error",
# "[DM_QUERY_E_SYNTAX]",
# "has occurred in the vicinity of:",
# "A Parser Error (syntax error)",
# "java.sql.SQLException",
# "Unexpected end of command in statement",
# "[Macromedia][SQLServer JDBC Driver]",
# "could not prepare statement",
# "Unknown column",
# "where clause",
# "SqlServer",
# "syntax error")
#for e in SQL_errors:
# if e in body and e not in orig_body:
# return e
def unclaimedURL_check(self, body):
print('DEBUG unclaimedURL_check:: PROBLEM body ----- START')
print(str(body))
print('DEBUG unclaimedURL_check:: PROBLEM body ------ END')
tree = fromstring(body)
scriptURLs = tree.xpath('//script/@src')
for url in scriptURLs:
parser = urlparse(url)
domain = parser.netloc
try:
gethostbyname(domain)
resolved = True
except gaierror:
resolved = False
if resolved == False:
return url
def xss_logic(self, injection, meta, resp_url, error):
''' XSS logic. Returns None if vulnerability not found
The breakout_chars var is a list(set()). This ensure we can
test for breakout given OR statements, like " or ; to breakout'''
# Gets rid of some false positives
# ex: https://www.facebook.com/directory/pages/9zqjxpo'%22()%7B%7D%3Cx%3E:9zqjxpo;9
#if len(unfiltered) > len(self.test_str):
# return
# Unpack the injection
#tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection
# get_unfiltered_chars() can only return a string 0+ characters, but never None
unfiltered_chars = injection[5]
payload = injection[4]
# injection[6] sometimes == '<p' maybe?
# It happened POSTing chatRecord to http://service.taobao.com/support/minerva/robot_save_chat_record.htm
# that page returned a Connection: Close header and no body
############### THIS NEEDS TO BE THE REFLECTED PAYLOAD
line = injection[6]
item_found = None
# get_reflected_chars() always returns a string
if len(unfiltered_chars) > 0:
chars_payloads = self.get_breakout_chars(injection, resp_url)
# breakout_chars always returns a , never None
if len(chars_payloads) > 0:
sugg_payloads = []
for chars in chars_payloads:
if set(chars).issubset(set(unfiltered_chars)):
# Get rid of possible payloads with > in them if > not in unfiltered_chars
item_found = True
for possible_payload in chars_payloads[chars]:
if '>' not in unfiltered_chars:
if '>' in possible_payload:
continue
sugg_payloads.append(possible_payload)
if item_found:
return self.make_item(meta, resp_url, line, unfiltered_chars, sugg_payloads)
def get_breakout_chars(self, injection, resp_url):
''' Returns either None if no breakout chars were found
or a list of sets of potential breakout characters '''
tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection
pl_delim = payload[:6]
full_match = '%s.{0,85}?%s' % (pl_delim, pl_delim)
line = re.sub(full_match, 'INJECTION', line)
all_chars_payloads = {}
# Comment injection
if tag == '!--':
chars = ('>')
payload = '--><svG onLoad=prompt(9)>'
try:
all_chars_payloads[chars] += [payload]
except KeyError:
all_chars_payloads[chars] = [payload]
# Attribute injection
elif attr:
chars_payloads = self.attr_breakout(tag, attr, attr_val, pl_delim, line)
for k in chars_payloads:
try:
all_chars_payloads[k] += chars_payloads[k]
except KeyError:
all_chars_payloads[k] = chars_payloads[k]
# Between tag injection
else:
chars_payloads = self.tag_breakout(tag, line)
for k in chars_payloads:
try:
all_chars_payloads[k] += chars_payloads[k]
except KeyError:
all_chars_payloads[k] = chars_payloads[k]
# Dedupe the list of potential payloads
for chars in all_chars_payloads:
all_chars_payloads[chars] = list(set(all_chars_payloads[chars]))
return all_chars_payloads
def decomment_js(self, line):
''' Remove commented JS lines which screw with quote detection '''
lines = line.splitlines()
decommented_lines = [l for l in lines if not l.strip().startswith('//')]
line = '\n'.join(decommented_lines)
return line
def tag_breakout(self, tag, line):
chars_payloads = {}
# Look for javascript breakouts
if tag == 'script':
# Get rid of comments which screw with the quote detector
decommented_js = self.decomment_js(line)
# Get rid of javascript escaped quotes
dquote_open, squote_open = self.get_quote_context(decommented_js)
if dquote_open:
chars = ('"')
payload = 'x";prompt(9);'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
if squote_open:
chars = ("'")
payload = "x';prompt(9);"
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
chars = ('<', '>')
payload = '<svG onLoad=prompt(9)>'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# If neither double nor single quotes are open at injection point, we just need a ;
if not dquote_open and not squote_open:
chars = (";")
payload = ';prompt(9);'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
chars = ("<", ">", "/")
payload = '</SCript><svG/onLoad=prompt(9)>'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Everything that's not a script tag
else:
chars = ("<", ">")
payload = '<svG onLoad=prompt(9)>'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
return chars_payloads
def get_attr_quote(self, attr, line):
''' Return the first quote in the string which
should always be the html quote if this is called
on a string of html with an attr '''
attr_quote = None
# Split the line at the attribute once so target string is everything after the attr
split_line = line.split(attr, 1)
attr_split_lines = line.split(attr, 1) #[0] is ''
if len(attr_split_lines) > 1:
attr_split_line = attr + attr_split_lines[1]
# match starts with =, then find any amount of space (nongreedy with ?) until either ' or "
attr_quote = re.search('=\s*?(\'|")', attr_split_line)
if attr_quote:
attr_quote = attr_quote.group(1)
else:
attr_quote = None
return attr_quote
def attr_breakout(self, tag, attr, attr_val, delim, line):
breakout_chars = []
sugg_payload = []
chars_payloads = {}
dquote_open, squote_open = self.get_quote_context(line)
attr_quote = self.get_attr_quote(attr, line)
# javascript:alert(1) vulns
# We do this slicing operation because ;9 might be at the end
# although it's unnecessary for the payload
# CHECK HERE, PASS DOWN THE ORIG ATTR VAL
#if delim+'subbed' in attr_val:
if attr_val[:len(delim+'subbed')] == delim+'subbed':
if tag == 'a' and attr == 'href':
# Only need : ( and ) to use javascript:prompt(4) redir payload
chars = (':', '(', ')')
payload = 'JavaSCript:prompt(9)'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Inject the frame or script source file
elif tag in ['script', 'frame', 'iframe']:
if attr == 'src':
chars = (':', '(', ')')
payload = 'JavaSCript:prompt(9)'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Catching src=vbscript:...
# Exploit: msgbox (document.domain)
if re.match('vbscript:', attr_val):
if dquote_open and squote_open:
vbs_quote = self.opposite_quote(attr_quote)
chars = ("x", vbs_quote)
payload = vbs_quote + '%0a%0dMsgBox (xss)'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
else:
chars = ("x")
payload = "%0a%0dMsgBox (xss)"
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Catching src=javascript:...
if re.match('javascript:', attr_val):
if dquote_open and squote_open:
js_quote = self.opposite_quote(attr_quote)
chars = (js_quote, ";")
payload = js_quote+";prompt(9);"
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
else:
chars = (";")
payload = ";prompt(9);"
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Check for JS attributes
if attr in self.event_attributes():
if dquote_open and squote_open:
js_quote = self.opposite_quote(attr_quote)
chars = (";", js_quote)
payload = 'x'+js_quote+';prompt(9);'+js_quote
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
elif not dquote_open and not squote_open:
chars = (";")
payload = ';prompt(9);'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
#elif squote_open and not dquote_open:
else:
chars = (";")
payload = ";prompt(9);"
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
# Check if quotes even exist in the line
if '"' in line or "'" in line:
if not attr_quote:
chars = ('<', '>')
payload = 'x><svG onLoad=prompt(9)>'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
else:
chars = (attr_quote)
payload1 = 'x'+attr_quote+'><svG onLoad=prompt(9)>'
payload2 = 'x'+attr_quote+' onmouseover=prompt(9) '+attr_quote
payload3 = 'x'+attr_quote+'/onmouseover=prompt(9)/'+attr_quote
try:
chars_payloads[chars].append(payload1)
chars_payloads[chars].append(payload2)
chars_payloads[chars].append(payload3)
except KeyError:
chars_payloads[chars] = [payload1, payload2, payload3]
# If no quotes are open or they're just not found:
else:
# means you can inject your own attribute value like:
# x src=http://exploit.com
if tag in ['script', 'frame', 'iframe']:
chars = (':', ')', '(')
payload = 'x src=JaVaSCript:prompt(9)'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
else:
chars = ("<", ">")
payload = 'x><svG onLoad=prompt(9)>'
try:
chars_payloads[chars].append(payload)
except KeyError:
chars_payloads[chars] = [payload]
return chars_payloads
def get_quote_context(self, line):
''' Goes through char by char to determine if double
or single quotes are open or not: True/None '''
dquote_open = None
squote_open = None
# For something like <script>"hey":"hello's"</script> needed a way to remove the single quote
# from being considered open since it's not actually important. This prevents a lot of false+
# like in ebay.com and twitter.com
first_open = None
for c in line:
if c == '"':
# I have noticed that booking.com throws false+ with this first_open stuff
if not first_open:
first_open = c
dquote_open = self.opposite(dquote_open)
if first_open == '"' and dquote_open == None:
first_open = None
if squote_open == True:
squote_open = None
elif c == "'":
if not first_open:
first_open = c
squote_open = self.opposite(squote_open)
if first_open == "'" and squote_open == None:
first_open = None
if dquote_open == True:
dquote_open = None
return dquote_open, squote_open
def opposite(self, obj):
''' Returns the obj as either True or None '''
if obj:
obj = None
else:
obj = True
return obj
def opposite_quote(self, quote):
''' Return the opposite quote of the one give, single for double or
double for single '''
if quote == '"':
oppo_quote = "'"
else:
oppo_quote = '"'
return oppo_quote
def get_lxml_matches(self, full_match, body, resp_url, delim):
# Replace the payloaded string with just the delim string (minus the ; )
sub = delim+'subbed'
subbed_body = re.sub(full_match, sub, body)
doc = self.html_parser(subbed_body, resp_url)
lxml_injs = self.xpath_inj_points(sub, doc)
return lxml_injs
def html_parser(self, body, resp_url):
try:
# You must use lxml.html.soupparser or else candyass webdevs who use identical
# multiple html attributes with injections in them don't get caught
# That being said, soupparser is crazy slow and introduces a ton of
# new bugs so that is not an option at this point in time
doc = lxml.html.fromstring(body, base_url=resp_url)
except lxml.etree.ParserError:
self.log('ParserError from lxml on %s' % resp_url)
return
except lxml.etree.XMLSyntaxError:
self.log('XMLSyntaxError from lxml on %s' % resp_url)
return
return doc
#try:
# # You must use soupparser or else candyass webdevs who use identical
# # multiple html attributes with injections in them don't get caught
# # EXCEPT I found out that soupparser is a CPU MONSTER and intros more bugs
# doc = lxml.html.soupparser.fromstring(subbed_body)
#except ValueError:
# try:
# doc = lxml.html.fromstring(subbed_body)
# except lxml.etree.ParserError:
# self.log('ParserError from lxml on %s' % resp_url)
# return
# except lxml.etree.XMLSyntaxError:
# self.log('XMLSyntaxError from lxml on %s' % resp_url)
# return
#except lxml.etree.ParserError:
# self.log('ParserError from lxml on %s' % resp_url)
# return
#except lxml.etree.XMLSyntaxError:
# self.log('XMLSyntaxError from lxml on %s' % resp_url)
# return
def combine_regex_lxml(self, lxml_injs, full_matches, body, mismatch, payload, delim):
''' Combine lxml injection data with the 2 regex injection search data '''
all_inj_data = {}
for idx, match in enumerate(full_matches):
try:
tag_index, tag = lxml_injs[idx][0]
except IndexError:
if mismatch:
return
else:
raise
# Always returns a populated dict, even if its {None:None} like
# with inbetween tag injections
attrs_attrvals = lxml_injs[idx][1]
if attrs_attrvals:
for a in attrs_attrvals:
attr = a
attr_val = attrs_attrvals[a]
# I don't think this is necessary now that we use lxml.html.soupparse.fromstring rather than lxml.html.fromstring
else:
spider.log("Error: Possible broken HTML. Lxml failed to find the attribute. twitter.com/?q=[payload] is an example.")
continue
# If it's an attribute inj then the tag will always be "<tag "
# If it's an injection in a comment or between tags
# it will always start with "<tag" (comment inj tag = "!--")
if attr:
tag_delim = '<'+tag+' '
else:
tag_delim = '<'+tag
match_start_offset = match[0]
ret_between_delim = match[1]
match_end_offset = match[2]
sub = delim+'subbed'
start_of_match = body[match_start_offset:]
ref_payload_plus_2 = start_of_match[:(match_end_offset - match_start_offset)+2]
ref_payload = start_of_match[:(match_end_offset - match_start_offset)]
# ref_payload would only equal ref_payload_plus_2 if ;9 doesn't exist
if ref_payload != ref_payload_plus_2:
if ref_payload_plus_2[-2:] == ';9':
ref_payload = ref_payload_plus_2
# split the body at the tag, then take the last fragment
# which is closest to the injection point as regexed
split_body = body[:match_start_offset]
line_no_tag = split_body.split(tag_delim)[-1]#.replace('\\"', '').replace("\\'", "")
line = tag_delim + line_no_tag + ref_payload
# Sometimes it may split wrong, in which case we drop that lxml match
if line_no_tag.startswith('<doctype') or line_no_tag.startswith('<html'):
line = ''
# Get the accurate attr, as the original attr and attr_val in this function
# would just be the first one found even if there were >1 attributes in lxml_injs
if attr:
attr_dict = self.accurate_attr(tag, attrs_attrvals, match, line)
for a in attr_dict:
attr = a
attr_val = attr_dict[a]
break
unfiltered_chars = self.get_unfiltered_chars(payload, ref_payload, delim, tag, attr)
# Common false+ shows only "> as unfiltered if script parses the chars between 2 unrelated delim strs
#if unfiltered_chars == '">':
# unfiltered_chars = ''
all_inj_data[match_start_offset] = [tag_index, tag, attr, attr_val, payload, unfiltered_chars, line]
return all_inj_data
def get_unfiltered_chars(self, payload, ref_payload, delim, tag, attr):
"""
Pull out just the unfiltered chars from the reflected chars
payload = delim+fuzz+delim+;9
"""
ref_chars = ref_payload.replace(delim, '').replace('9', '')
fuzz_chars = payload.replace(delim, '').replace('9', '')
remove_chars = set([])
unfiltered_chars_list = []
html_entities = ['&#39', '&quot;', '&lt;', '&gt;']
# Remove html encoded entities
for entity in html_entities:
if entity in ref_chars:
ref_chars.replace(entity, '')
#If injection is inside script tag, remove the escaped chars
if tag == 'script' or attr in self.event_attributes():
ref_chars = ref_chars.replace("\\'", "").replace('\\"', '').replace('\;', '').replace('\\>', '').replace('\\<', '').replace('\\/', '')
for c in fuzz_chars:
if c in ref_chars:
continue
else:
remove_chars.add(c)
for char in fuzz_chars:
if char not in remove_chars:
unfiltered_chars_list.append(char)
unfiltered_chars = ''.join(unfiltered_chars_list)
return unfiltered_chars
def accurate_attr(self, tag, attrs_attrvals, match, line):
''' lxml cannot determine the order of attrs which is important
if we're using regex to find the location of the inj so this func
finds the accurate attr based on the regex match amongst a list of
attrs of the same tag. This is for multi injections in single attr_val
as well as multi injections in multiple attrs.'''
# Going for process of elimination first
copy_attrs_attrvals = attrs_attrvals.copy()
for attr in attrs_attrvals:
if attr+'=' not in line and attr+' =' not in line:
copy_attrs_attrvals.pop(attr, None)
# Keep doing research to find the accurate attr
if len(copy_attrs_attrvals) > 1:
attr_places = {}
for attr in copy_attrs_attrvals:
attrs_found = [x.end() for x in re.finditer(attr+'\s?=', line)]
attr_places[max(attrs_found)] = attr
# Get the attribute from the higher numbered key within attr_places
closest_attr = attr_places[max(attr_places)]
return {closest_attr:copy_attrs_attrvals[closest_attr]}
# Found the accurate attr
else:
for attr in copy_attrs_attrvals:
return {attr:copy_attrs_attrvals[attr]}
return {None:None}
def payloaded_lines(self, body, payload):
pl_lines_found = sorted([])
for line in body.splitlines():
if payload in line:
pl_lines_found.append(line)
return pl_lines_found
def make_item(self, meta, resp_url, line, unfiltered, sugg_payloads):
''' Create the vuln item '''
item = vuln()
if isinstance(line, str):
item['line'] = line
else:
item['line'] = '\n'.join(line)
item['xss_payload'] = meta['payload']
item['unfiltered'] = unfiltered
item['xss_param'] = meta['xss_param']
item['xss_place'] = meta['xss_place']
item['orig_url'] = meta['orig_url']
item['resp_url'] = resp_url
if sugg_payloads:
item['sugg_payloads'] = ', '.join(sugg_payloads)
if 'POST_to' in meta:
item['POST_to'] = meta['POST_to']
# Just make sure one of the options has been set
if item['unfiltered']:
return item
def xpath_inj_points(self, search_str, doc):
''' Searches lxml doc for any text, attributes, or comments
that reflect the subbed text '''
injections = []
same_tag_attrs = {}
# Why this xpath is wrong: perm.ly/dom4j-xpath-contains
#text_xss = doc.xpath("//*[contains(text(), '%s')]" % search_str)
# Will this find DB error message stuff without a tag? No. It will not.
#text_xss = doc.xpath("//*[text()]")#[contains(.,'%s')]]" % search_str)
# Return all text nodes that contain the search_str
text_xss = doc.xpath("//text()[contains(., '%s')]" % search_str)
# Returns parent elements //* with any attribute [@* that
# contains a value with search_str in it [contains(., '%s')]
attr_xss = doc.xpath("//*[@*[contains(., '%s')]]" % search_str)
# Return any comment that contains certain text
comment_xss = doc.xpath("//comment()[contains(., '%s')]" % search_str)
attr_inj = self.parse_attr_xpath(attr_xss, search_str, doc)
comm_inj = self.parse_comm_xpath(comment_xss, search_str, doc)
text_inj = self.parse_text_xpath(text_xss, search_str, doc)
injects = [attr_inj, comm_inj, text_inj]
for i in injects:
# i = attr_inj or comm_inj or text_inj list
for x in i:
# x = (tag_index, tag, attr, attr_val)
tag_index = x[0]
tag = x[1]
attr = x[2]
attr_val = x[3]
# Sometimes <script> tags don't appear within the doc?? Dunno why, maybe some weird ajax shit
# But if we remove them then we lose the matchups between lxml and regex so we must keep them
# Just make them useless by entering empty tag and putting them at the end of the lxml matches
# so a split at tag won't find anything
if not tag_index:
print ' '*36+'ERROR: Error: could not find tag index location. Element does not exist in root doc.'
tag_index = 999999999
tag = ''
loc_tag = (tag_index, tag)
attr_attrval = {attr:attr_val}
# combine the multi attr injections into a dict otherwise
# just add the {attr, attr_val} injection as the second item
if attr:
if loc_tag in same_tag_attrs:
same_tag_attrs[loc_tag][attr] = attr_val
else:
same_tag_attrs[loc_tag] = attr_attrval
injections.append((loc_tag, attr_attrval))
# Final injections list should be like:
# injections = [(tag_index, tag), {attribute1:attribute_value, attribute2:attribute_value}]
# with variables amount of attributes with injected values
for idx, i in enumerate(injections):
loc_tag = i[0]
if loc_tag in same_tag_attrs:
injections[idx] = (loc_tag, same_tag_attrs[loc_tag])
injections = sorted(injections)
if len(injections) > 0:
return injections
def get_elem_position(self, elem, doc):
''' Iterate through all elements in doc
and match them up against the element found
during xpathing '''
order = 0
for i in doc.iter():
order += 1
if i == elem:
return order
def parse_attr_xpath(self, xpath, search_str, doc):
''' Find all tags with attributes that contain the subbed str '''
attr_inj = []
for x in xpath:
# x = http://lxml.de/api/lxml.etree._Element-class.html
tag_index = self.get_elem_position(x, doc)
tag = x.tag
items = x.items()
for i in items:
#i = (attr, attr_val)
attr = i[0]
attr_val = i[1]
found = re.findall(search_str, attr_val)
for f in found:
attr_inj.append((tag_index, tag, attr, attr_val))
return attr_inj
def parse_comm_xpath(self, xpath, search_str, doc):
''' Parse the xpath comment search findings '''
comm_inj = []
for x in xpath:
parent = x.getparent()
text = x.text
found = re.findall(search_str, text)
for f in found:
tag_index = self.get_elem_position(x, doc)
# Set this tag so when we split the final line
# We split it at '<' + tag so that gives us
# a split at <!--
tag = '!--'
comm_inj.append((tag_index, tag, None, None))
return comm_inj
def parse_text_xpath(self, xpath, search_str, doc):
''' Creates injection points for the xpath that finds the payload in any html enclosed text '''
text_inj = []
for x in xpath:
parent = x.getparent()
# In case parent is a Comment()
while callable(parent.tag):
parent = parent.getparent()
tag = parent.tag
tag_index = self.get_elem_position(parent, doc)
found = re.findall(search_str, x.strip())
for f in found:
text_inj.append((tag_index, tag, None, None))
return text_inj
def unescape_payload(self, payload):
''' Unescape the various payload encodings (html and url encodings)'''
if '%' in payload:
payload = urllib.unquote_plus(payload)
#if '%' in payload: # in case we ever add double url encoding like %2522 for dub quote
# payload = urllib.unquote_plus(payload)
# only html-encoded payloads will have & in them
payload = HTMLParser.HTMLParser().unescape(payload)
return payload
def get_reflected_chars(self, tag, attr, ref_payload, delim, body, match_end_offset):
''' Check for the special chars and append them to a master list of tuples, one tuple per injection point
Always returns a string '''
returned_chars = ref_payload.replace(delim, '').replace('9','')
return returned_chars
def url_item_filtering(self, item, spider):
''' Make sure we're not just repeating the same URL XSS over and over '''
if item['xss_place'] == 'url':
if len(self.url_param_xss_items) > 0:
for i in self.url_param_xss_items:
# If the injection param, the url up until the injected param and the payload
# are all the same as a previous item, then don't bother creating the item
# Match tags where injection point was found
if item['xss_param'] == i['xss_param']:
# Match the URL up until the params
if item['orig_url'].split('?', 1)[0] == i['orig_url'].split('?', 1)[0]:
# Match the payload
if item['xss_payload'] == i['xss_payload']:
# Match the unfiltered characters
if item['unfiltered'] == i['unfiltered']:
raise DropItem('Duplicate item found: %s' % item['orig_url'])
self.url_param_xss_items.append(item)
return item
def event_attributes(self):
''' HTML tag attributes that allow javascript TAKEN OUT AT THE MOMENT'''
event_attributes = ['onafterprint', 'onbeforeprint', 'onbeforeunload', 'onerror',
'onhaschange', 'onload', 'onmessage', 'onoffline', 'ononline',
'onpagehide', 'onpageshow', 'onpopstate', 'onredo', 'onresize',
'onstorage', 'onundo', 'onunload', 'onblur', 'onchange',
'oncontextmenu', 'onfocus', 'onformchange', 'onforminput',
'oninput', 'oninvalid', 'onreset', 'onselect', 'onsubmit',
'onkeydown', 'onkeypress', 'onkeyup', 'onclick', 'ondblclick',
'ondrag', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover',
'ondragstart', 'ondrop', 'onmousedown', 'onmousemove',
'onmouseout', 'onmouseover', 'onmouseup', 'onmousewheel',
'onscroll', 'onabort', 'oncanplay', 'oncanplaythrough',
'ondurationchange', 'onemptied', 'onended', 'onerror',
'onloadeddata', 'onloadedmetadata', 'onloadstart', 'onpause',
'onplay', 'onplaying', 'onprogress', 'onratechange',
'onreadystatechange', 'onseeked', 'onseeking', 'onstalled',
'onsuspend', 'ontimeupdate', 'onvolumechange', 'onwaiting']
return event_attributes
def write_to_file(self, item, spider):
with open(self.filename, 'a+') as f:
f.write('\n')
f.write('URL: '+item['orig_url']+'\n')
spider.log(' URL: '+item['orig_url'], level=INFO)
f.write('response URL: '+item['resp_url']+'\n')
spider.log(' response URL: '+item['resp_url'], level=INFO)
if 'POST_to' in item:
f.write('POST url: '+item['POST_to']+'\n')
spider.log(' POST url: '+item['POST_to'], level=INFO)
f.write('Unfiltered: '+item['unfiltered']+'\n')
spider.log(' Unfiltered: '+item['unfiltered'], level=INFO)
f.write('Payload: '+item['xss_payload']+'\n')
spider.log(' Payload: '+item['xss_payload'], level=INFO)
f.write('Type: '+item['xss_place']+'\n')
spider.log(' Type: '+item['xss_place'], level=INFO)
f.write('Injection point: '+item['xss_param']+'\n')
spider.log(' Injection point: '+item['xss_param'], level=INFO)
if 'sugg_payloads' in item:
f.write('Possible payloads: '+item['sugg_payloads']+'\n')
spider.log(' Possible payloads: '+item['sugg_payloads'], level=INFO)
# Cut off the line at 500
#f.write('Line: '+item['line'][-500:]+'\n')
f.write('Line: '+item['line']+'\n')
spider.log(' Line: '+item['line'], level=INFO)
if 'error' in item:
f.write('Error: '+item['error']+'\n')
spider.log(' Error: '+item['error'], level=INFO)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment