-
-
Save decidedlygray/f0727a63b7f68aae41155b0c90232d59 to your computer and use it in GitHub Desktop.
Debugging xsscrapy.py - Issue #36
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define your item pipelines here | |
# | |
# Don't forget to add your pipeline to the ITEM_PIPELINES setting | |
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html | |
from scrapy.exceptions import DropItem | |
import HTMLParser | |
from xsscrapy.items import vuln#, inj_resp | |
import re | |
import lxml.etree | |
import lxml.html | |
from lxml.html import soupparser, fromstring | |
import itertools | |
#from IPython import embed | |
from socket import gaierror, gethostbyname | |
from urlparse import urlparse | |
from logging import CRITICAL, ERROR, WARNING, INFO, DEBUG | |
class XSSCharFinder(object): | |
def __init__(self): | |
self.url_param_xss_items = [] | |
def get_filename(self, url): | |
filename = 'xsscrapy-vulns.txt' | |
up = urlparse(url).netloc.replace('www.', '').split(':')[0] | |
if up: | |
filename = up + '.txt' | |
return filename | |
def open_spider(self, spider): | |
self.filename = self.get_filename(spider.url) | |
def process_item(self, item, spider): | |
response = item['resp'] | |
meta = response.meta | |
payload = meta['payload'] | |
delim = meta['delim'] | |
param = meta['xss_param'] | |
resp_url = response.url | |
body = response.body | |
mismatch = False | |
error = None | |
fuzz_payload = payload.replace(delim, '').replace(';9', '') # xss char payload | |
# Regex: ( ) mean group 1 is within the parens, . means any char, | |
# {1,80} means match any char 0 to 80 times, 80 chosen because double URL encoding | |
# ? makes the search nongreedy so it stops after hitting its limits | |
full_match = '%s.{0,80}?%s' % (delim, delim) | |
# matches with semicolon which sometimes cuts results off | |
sc_full_match = '%s.{0,80}?%s;9' % (delim, delim) | |
# Quick sqli check based on DSSS | |
dbms, regex = self.sqli_check(body, meta['orig_body']) | |
if dbms: | |
msg = 'Possible SQL injection error! Suspected DBMS: %s, regex used: %s' % (dbms, regex) | |
item = self.make_item(meta, resp_url, msg, 'N/A', None) | |
self.write_to_file(item, spider) | |
item = None | |
# Check for script tags pointing to a no longer registered domain | |
unclaimedURL = self.unclaimedURL_check(body) | |
if unclaimedURL: | |
msg = 'Found non-registered domain in script tag! Non-registered URL: %s' % unclaimedURL | |
with open(self.filename, 'a+') as f: | |
f.write('\n') | |
f.write('URL: '+resp_url+'\n') | |
spider.log(' URL: '+resp_url+'\n', level=INFO) | |
f.write(msg+'\n') | |
spider.log(' '+msg+'\n', level=INFO) | |
# Now that we've checked for SQLi, we can lowercase the body | |
body = body.lower() | |
# XSS detection starts here | |
re_matches = sorted([(m.start(), m.group(), m.end()) for m in re.finditer(full_match, body)]) | |
if len(re_matches) > 0: | |
#scolon_matches = sorted([(m.start(), m.group()) for m in re.finditer(sc_full_match, body)]) | |
# If regex finds anything, get lxml matches | |
lxml_injs = self.get_lxml_matches(full_match, body, resp_url, delim) | |
if lxml_injs: | |
err = None | |
if len(re_matches) != len(lxml_injs): | |
spider.log('Error: mismatch in injections found by lxml and regex; higher chance of false positive for %s' % resp_url) | |
error = 'Error: mismatch in injections found by lxml and regex; higher chance of false positive' | |
mismatch = True | |
inj_data = self.combine_regex_lxml(lxml_injs, re_matches, body, mismatch, payload, delim) | |
# If mismatch is True, then "for offset in sorted(inj_data)" will fail with TypeError | |
try: | |
for offset in sorted(inj_data): | |
########## XSS LOGIC ############# | |
item = self.xss_logic(inj_data[offset], meta, resp_url, error) | |
if item: | |
item = self.url_item_filtering(item, spider) | |
if mismatch: | |
item['error'] = 'Mismatch: html parsed vs regex parsed injections, %d vs %d. Higher chance of false positive' % (len(injections), len(full_matches)) | |
self.write_to_file(item, spider) | |
return item | |
except TypeError: | |
if mismatch: | |
return | |
else: | |
raise | |
# No lxml_match_data | |
else: | |
spider.log('Error: regex found injection, but lxml did not') | |
# Catch all test payload chars no delims | |
# Catches DB errors | |
pl_lines_found = self.payloaded_lines(body, fuzz_payload) | |
if pl_lines_found: | |
item = self.make_item(meta, resp_url, pl_lines_found, fuzz_payload, None) | |
if item: | |
item = self.url_item_filtering(item, spider) | |
item['error'] = 'Payload delims do not surround this injection point. Found via search for entire payload.' | |
self.write_to_file(item, spider) | |
return item | |
raise DropItem('No XSS vulns in %s. type = %s, %s' % (resp_url, meta['xss_place'], meta['xss_param'])) | |
def sqli_check(self, body, orig_body): | |
''' Do a quick lookup in the response body for SQL errors. Both w3af's and DSSS.py's methods are in here | |
but sectoolsmarket.com shows DSSS as having better detection rates so using theirs ''' | |
# Taken from Damn Small SQLi Scanner | |
DBMS_ERRORS = {"MySQL": (r"SQL syntax.*MySQL", r"Warning.*mysql_.*", r"valid MySQL result", r"MySqlClient\."), | |
"PostgreSQL": (r"PostgreSQL.*ERROR", r"Warning.*\Wpg_.*", r"valid PostgreSQL result", r"Npgsql\."), | |
"Microsoft SQL Server": (r"Driver.* SQL[\-\_\ ]*Server", r"OLE DB.* SQL Server", r"(\W|\A)SQL Server.*Driver", r"Warning.*mssql_.*", | |
r"(\W|\A)SQL Server.*[0-9a-fA-F]{8}", r"(?s)Exception.*\WSystem\.Data\.SqlClient\.", r"(?s)Exception.*\WRoadhouse\.Cms\."), | |
"Microsoft Access": (r"Microsoft Access Driver", r"JET Database Engine", r"Access Database Engine"), | |
"Oracle": (r"ORA-[0-9][0-9][0-9][0-9]", r"Oracle error", r"Oracle.*Driver", r"Warning.*\Woci_.*", r"Warning.*\Wora_.*")} | |
for (dbms, regex) in ((dbms, regex) for dbms in DBMS_ERRORS for regex in DBMS_ERRORS[dbms]): | |
if re.search(regex, body, re.I) and not re.search(regex, orig_body, re.I): | |
return (dbms, regex) | |
return None, None | |
# Taken from w3af | |
#SQL_errors = ("System.Data.OleDb.OleDbException", | |
# "[SQL Server]", | |
# "[Microsoft][ODBC SQL Server Driver]", | |
# "[SQLServer JDBC Driver]", | |
# "[SqlException", | |
# "System.Data.SqlClient.SqlException", | |
# "Unclosed quotation mark after the character string", | |
# "'80040e14'", | |
# "mssql_query()", | |
# "odbc_exec()", | |
# "Microsoft OLE DB Provider for ODBC Drivers", | |
# "Microsoft OLE DB Provider for SQL Server", | |
# "Incorrect syntax near", | |
# "Sintaxis incorrecta cerca de", | |
# "Syntax error in string in query expression", | |
# "ADODB.Field (0x800A0BCD)<br>", | |
# "ADODB.Recordset'", | |
# "Unclosed quotation mark before the character string", | |
# "'80040e07'", | |
# "Microsoft SQL Native Client error", | |
# "SQLCODE", | |
# "DB2 SQL error:", | |
# "SQLSTATE", | |
# "[CLI Driver]", | |
# "[DB2/6000]", | |
# "Sybase message:", | |
# "Sybase Driver", | |
# "[SYBASE]", | |
# "Syntax error in query expression", | |
# "Data type mismatch in criteria expression.", | |
# "Microsoft JET Database Engine", | |
# "[Microsoft][ODBC Microsoft Access Driver]", | |
# "Microsoft OLE DB Provider for Oracle", | |
# "wrong number or types", | |
# "PostgreSQL query failed:", | |
# "supplied argument is not a valid PostgreSQL result", | |
# "unterminated quoted string at or near", | |
# "pg_query() [:", | |
# "pg_exec() [:", | |
# "supplied argument is not a valid MySQL", | |
# "Column count doesn\'t match value count at row", | |
# "mysql_fetch_array()", | |
# "mysql_", | |
# "on MySQL result index", | |
# "You have an error in your SQL syntax;", | |
# "You have an error in your SQL syntax near", | |
# "MySQL server version for the right syntax to use", | |
# "Division by zero in", | |
# "not a valid MySQL result", | |
# "[MySQL][ODBC", | |
# "Column count doesn't match", | |
# "the used select statements have different number of columns", | |
# "DBD::mysql::st execute failed", | |
# "DBD::mysql::db do failed:", | |
# "com.informix.jdbc", | |
# "Dynamic Page Generation Error:", | |
# "An illegal character has been found in the statement", | |
# "[Informix]", | |
# "<b>Warning</b>: ibase_", | |
# "Dynamic SQL Error", | |
# "[DM_QUERY_E_SYNTAX]", | |
# "has occurred in the vicinity of:", | |
# "A Parser Error (syntax error)", | |
# "java.sql.SQLException", | |
# "Unexpected end of command in statement", | |
# "[Macromedia][SQLServer JDBC Driver]", | |
# "could not prepare statement", | |
# "Unknown column", | |
# "where clause", | |
# "SqlServer", | |
# "syntax error") | |
#for e in SQL_errors: | |
# if e in body and e not in orig_body: | |
# return e | |
def unclaimedURL_check(self, body): | |
print('DEBUG unclaimedURL_check:: PROBLEM body ----- START') | |
print(str(body)) | |
print('DEBUG unclaimedURL_check:: PROBLEM body ------ END') | |
tree = fromstring(body) | |
scriptURLs = tree.xpath('//script/@src') | |
for url in scriptURLs: | |
parser = urlparse(url) | |
domain = parser.netloc | |
try: | |
gethostbyname(domain) | |
resolved = True | |
except gaierror: | |
resolved = False | |
if resolved == False: | |
return url | |
def xss_logic(self, injection, meta, resp_url, error): | |
''' XSS logic. Returns None if vulnerability not found | |
The breakout_chars var is a list(set()). This ensure we can | |
test for breakout given OR statements, like " or ; to breakout''' | |
# Gets rid of some false positives | |
# ex: https://www.facebook.com/directory/pages/9zqjxpo'%22()%7B%7D%3Cx%3E:9zqjxpo;9 | |
#if len(unfiltered) > len(self.test_str): | |
# return | |
# Unpack the injection | |
#tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection | |
# get_unfiltered_chars() can only return a string 0+ characters, but never None | |
unfiltered_chars = injection[5] | |
payload = injection[4] | |
# injection[6] sometimes == '<p' maybe? | |
# It happened POSTing chatRecord to http://service.taobao.com/support/minerva/robot_save_chat_record.htm | |
# that page returned a Connection: Close header and no body | |
############### THIS NEEDS TO BE THE REFLECTED PAYLOAD | |
line = injection[6] | |
item_found = None | |
# get_reflected_chars() always returns a string | |
if len(unfiltered_chars) > 0: | |
chars_payloads = self.get_breakout_chars(injection, resp_url) | |
# breakout_chars always returns a , never None | |
if len(chars_payloads) > 0: | |
sugg_payloads = [] | |
for chars in chars_payloads: | |
if set(chars).issubset(set(unfiltered_chars)): | |
# Get rid of possible payloads with > in them if > not in unfiltered_chars | |
item_found = True | |
for possible_payload in chars_payloads[chars]: | |
if '>' not in unfiltered_chars: | |
if '>' in possible_payload: | |
continue | |
sugg_payloads.append(possible_payload) | |
if item_found: | |
return self.make_item(meta, resp_url, line, unfiltered_chars, sugg_payloads) | |
def get_breakout_chars(self, injection, resp_url): | |
''' Returns either None if no breakout chars were found | |
or a list of sets of potential breakout characters ''' | |
tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection | |
pl_delim = payload[:6] | |
full_match = '%s.{0,85}?%s' % (pl_delim, pl_delim) | |
line = re.sub(full_match, 'INJECTION', line) | |
all_chars_payloads = {} | |
# Comment injection | |
if tag == '!--': | |
chars = ('>') | |
payload = '--><svG onLoad=prompt(9)>' | |
try: | |
all_chars_payloads[chars] += [payload] | |
except KeyError: | |
all_chars_payloads[chars] = [payload] | |
# Attribute injection | |
elif attr: | |
chars_payloads = self.attr_breakout(tag, attr, attr_val, pl_delim, line) | |
for k in chars_payloads: | |
try: | |
all_chars_payloads[k] += chars_payloads[k] | |
except KeyError: | |
all_chars_payloads[k] = chars_payloads[k] | |
# Between tag injection | |
else: | |
chars_payloads = self.tag_breakout(tag, line) | |
for k in chars_payloads: | |
try: | |
all_chars_payloads[k] += chars_payloads[k] | |
except KeyError: | |
all_chars_payloads[k] = chars_payloads[k] | |
# Dedupe the list of potential payloads | |
for chars in all_chars_payloads: | |
all_chars_payloads[chars] = list(set(all_chars_payloads[chars])) | |
return all_chars_payloads | |
def decomment_js(self, line): | |
''' Remove commented JS lines which screw with quote detection ''' | |
lines = line.splitlines() | |
decommented_lines = [l for l in lines if not l.strip().startswith('//')] | |
line = '\n'.join(decommented_lines) | |
return line | |
def tag_breakout(self, tag, line): | |
chars_payloads = {} | |
# Look for javascript breakouts | |
if tag == 'script': | |
# Get rid of comments which screw with the quote detector | |
decommented_js = self.decomment_js(line) | |
# Get rid of javascript escaped quotes | |
dquote_open, squote_open = self.get_quote_context(decommented_js) | |
if dquote_open: | |
chars = ('"') | |
payload = 'x";prompt(9);' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
if squote_open: | |
chars = ("'") | |
payload = "x';prompt(9);" | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
chars = ('<', '>') | |
payload = '<svG onLoad=prompt(9)>' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# If neither double nor single quotes are open at injection point, we just need a ; | |
if not dquote_open and not squote_open: | |
chars = (";") | |
payload = ';prompt(9);' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
chars = ("<", ">", "/") | |
payload = '</SCript><svG/onLoad=prompt(9)>' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Everything that's not a script tag | |
else: | |
chars = ("<", ">") | |
payload = '<svG onLoad=prompt(9)>' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
return chars_payloads | |
def get_attr_quote(self, attr, line): | |
''' Return the first quote in the string which | |
should always be the html quote if this is called | |
on a string of html with an attr ''' | |
attr_quote = None | |
# Split the line at the attribute once so target string is everything after the attr | |
split_line = line.split(attr, 1) | |
attr_split_lines = line.split(attr, 1) #[0] is '' | |
if len(attr_split_lines) > 1: | |
attr_split_line = attr + attr_split_lines[1] | |
# match starts with =, then find any amount of space (nongreedy with ?) until either ' or " | |
attr_quote = re.search('=\s*?(\'|")', attr_split_line) | |
if attr_quote: | |
attr_quote = attr_quote.group(1) | |
else: | |
attr_quote = None | |
return attr_quote | |
def attr_breakout(self, tag, attr, attr_val, delim, line): | |
breakout_chars = [] | |
sugg_payload = [] | |
chars_payloads = {} | |
dquote_open, squote_open = self.get_quote_context(line) | |
attr_quote = self.get_attr_quote(attr, line) | |
# javascript:alert(1) vulns | |
# We do this slicing operation because ;9 might be at the end | |
# although it's unnecessary for the payload | |
# CHECK HERE, PASS DOWN THE ORIG ATTR VAL | |
#if delim+'subbed' in attr_val: | |
if attr_val[:len(delim+'subbed')] == delim+'subbed': | |
if tag == 'a' and attr == 'href': | |
# Only need : ( and ) to use javascript:prompt(4) redir payload | |
chars = (':', '(', ')') | |
payload = 'JavaSCript:prompt(9)' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Inject the frame or script source file | |
elif tag in ['script', 'frame', 'iframe']: | |
if attr == 'src': | |
chars = (':', '(', ')') | |
payload = 'JavaSCript:prompt(9)' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Catching src=vbscript:... | |
# Exploit: msgbox (document.domain) | |
if re.match('vbscript:', attr_val): | |
if dquote_open and squote_open: | |
vbs_quote = self.opposite_quote(attr_quote) | |
chars = ("x", vbs_quote) | |
payload = vbs_quote + '%0a%0dMsgBox (xss)' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
else: | |
chars = ("x") | |
payload = "%0a%0dMsgBox (xss)" | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Catching src=javascript:... | |
if re.match('javascript:', attr_val): | |
if dquote_open and squote_open: | |
js_quote = self.opposite_quote(attr_quote) | |
chars = (js_quote, ";") | |
payload = js_quote+";prompt(9);" | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
else: | |
chars = (";") | |
payload = ";prompt(9);" | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Check for JS attributes | |
if attr in self.event_attributes(): | |
if dquote_open and squote_open: | |
js_quote = self.opposite_quote(attr_quote) | |
chars = (";", js_quote) | |
payload = 'x'+js_quote+';prompt(9);'+js_quote | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
elif not dquote_open and not squote_open: | |
chars = (";") | |
payload = ';prompt(9);' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
#elif squote_open and not dquote_open: | |
else: | |
chars = (";") | |
payload = ";prompt(9);" | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
# Check if quotes even exist in the line | |
if '"' in line or "'" in line: | |
if not attr_quote: | |
chars = ('<', '>') | |
payload = 'x><svG onLoad=prompt(9)>' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
else: | |
chars = (attr_quote) | |
payload1 = 'x'+attr_quote+'><svG onLoad=prompt(9)>' | |
payload2 = 'x'+attr_quote+' onmouseover=prompt(9) '+attr_quote | |
payload3 = 'x'+attr_quote+'/onmouseover=prompt(9)/'+attr_quote | |
try: | |
chars_payloads[chars].append(payload1) | |
chars_payloads[chars].append(payload2) | |
chars_payloads[chars].append(payload3) | |
except KeyError: | |
chars_payloads[chars] = [payload1, payload2, payload3] | |
# If no quotes are open or they're just not found: | |
else: | |
# means you can inject your own attribute value like: | |
# x src=http://exploit.com | |
if tag in ['script', 'frame', 'iframe']: | |
chars = (':', ')', '(') | |
payload = 'x src=JaVaSCript:prompt(9)' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
else: | |
chars = ("<", ">") | |
payload = 'x><svG onLoad=prompt(9)>' | |
try: | |
chars_payloads[chars].append(payload) | |
except KeyError: | |
chars_payloads[chars] = [payload] | |
return chars_payloads | |
def get_quote_context(self, line): | |
''' Goes through char by char to determine if double | |
or single quotes are open or not: True/None ''' | |
dquote_open = None | |
squote_open = None | |
# For something like <script>"hey":"hello's"</script> needed a way to remove the single quote | |
# from being considered open since it's not actually important. This prevents a lot of false+ | |
# like in ebay.com and twitter.com | |
first_open = None | |
for c in line: | |
if c == '"': | |
# I have noticed that booking.com throws false+ with this first_open stuff | |
if not first_open: | |
first_open = c | |
dquote_open = self.opposite(dquote_open) | |
if first_open == '"' and dquote_open == None: | |
first_open = None | |
if squote_open == True: | |
squote_open = None | |
elif c == "'": | |
if not first_open: | |
first_open = c | |
squote_open = self.opposite(squote_open) | |
if first_open == "'" and squote_open == None: | |
first_open = None | |
if dquote_open == True: | |
dquote_open = None | |
return dquote_open, squote_open | |
def opposite(self, obj): | |
''' Returns the obj as either True or None ''' | |
if obj: | |
obj = None | |
else: | |
obj = True | |
return obj | |
def opposite_quote(self, quote): | |
''' Return the opposite quote of the one give, single for double or | |
double for single ''' | |
if quote == '"': | |
oppo_quote = "'" | |
else: | |
oppo_quote = '"' | |
return oppo_quote | |
def get_lxml_matches(self, full_match, body, resp_url, delim): | |
# Replace the payloaded string with just the delim string (minus the ; ) | |
sub = delim+'subbed' | |
subbed_body = re.sub(full_match, sub, body) | |
doc = self.html_parser(subbed_body, resp_url) | |
lxml_injs = self.xpath_inj_points(sub, doc) | |
return lxml_injs | |
def html_parser(self, body, resp_url): | |
try: | |
# You must use lxml.html.soupparser or else candyass webdevs who use identical | |
# multiple html attributes with injections in them don't get caught | |
# That being said, soupparser is crazy slow and introduces a ton of | |
# new bugs so that is not an option at this point in time | |
doc = lxml.html.fromstring(body, base_url=resp_url) | |
except lxml.etree.ParserError: | |
self.log('ParserError from lxml on %s' % resp_url) | |
return | |
except lxml.etree.XMLSyntaxError: | |
self.log('XMLSyntaxError from lxml on %s' % resp_url) | |
return | |
return doc | |
#try: | |
# # You must use soupparser or else candyass webdevs who use identical | |
# # multiple html attributes with injections in them don't get caught | |
# # EXCEPT I found out that soupparser is a CPU MONSTER and intros more bugs | |
# doc = lxml.html.soupparser.fromstring(subbed_body) | |
#except ValueError: | |
# try: | |
# doc = lxml.html.fromstring(subbed_body) | |
# except lxml.etree.ParserError: | |
# self.log('ParserError from lxml on %s' % resp_url) | |
# return | |
# except lxml.etree.XMLSyntaxError: | |
# self.log('XMLSyntaxError from lxml on %s' % resp_url) | |
# return | |
#except lxml.etree.ParserError: | |
# self.log('ParserError from lxml on %s' % resp_url) | |
# return | |
#except lxml.etree.XMLSyntaxError: | |
# self.log('XMLSyntaxError from lxml on %s' % resp_url) | |
# return | |
def combine_regex_lxml(self, lxml_injs, full_matches, body, mismatch, payload, delim): | |
''' Combine lxml injection data with the 2 regex injection search data ''' | |
all_inj_data = {} | |
for idx, match in enumerate(full_matches): | |
try: | |
tag_index, tag = lxml_injs[idx][0] | |
except IndexError: | |
if mismatch: | |
return | |
else: | |
raise | |
# Always returns a populated dict, even if its {None:None} like | |
# with inbetween tag injections | |
attrs_attrvals = lxml_injs[idx][1] | |
if attrs_attrvals: | |
for a in attrs_attrvals: | |
attr = a | |
attr_val = attrs_attrvals[a] | |
# I don't think this is necessary now that we use lxml.html.soupparse.fromstring rather than lxml.html.fromstring | |
else: | |
spider.log("Error: Possible broken HTML. Lxml failed to find the attribute. twitter.com/?q=[payload] is an example.") | |
continue | |
# If it's an attribute inj then the tag will always be "<tag " | |
# If it's an injection in a comment or between tags | |
# it will always start with "<tag" (comment inj tag = "!--") | |
if attr: | |
tag_delim = '<'+tag+' ' | |
else: | |
tag_delim = '<'+tag | |
match_start_offset = match[0] | |
ret_between_delim = match[1] | |
match_end_offset = match[2] | |
sub = delim+'subbed' | |
start_of_match = body[match_start_offset:] | |
ref_payload_plus_2 = start_of_match[:(match_end_offset - match_start_offset)+2] | |
ref_payload = start_of_match[:(match_end_offset - match_start_offset)] | |
# ref_payload would only equal ref_payload_plus_2 if ;9 doesn't exist | |
if ref_payload != ref_payload_plus_2: | |
if ref_payload_plus_2[-2:] == ';9': | |
ref_payload = ref_payload_plus_2 | |
# split the body at the tag, then take the last fragment | |
# which is closest to the injection point as regexed | |
split_body = body[:match_start_offset] | |
line_no_tag = split_body.split(tag_delim)[-1]#.replace('\\"', '').replace("\\'", "") | |
line = tag_delim + line_no_tag + ref_payload | |
# Sometimes it may split wrong, in which case we drop that lxml match | |
if line_no_tag.startswith('<doctype') or line_no_tag.startswith('<html'): | |
line = '' | |
# Get the accurate attr, as the original attr and attr_val in this function | |
# would just be the first one found even if there were >1 attributes in lxml_injs | |
if attr: | |
attr_dict = self.accurate_attr(tag, attrs_attrvals, match, line) | |
for a in attr_dict: | |
attr = a | |
attr_val = attr_dict[a] | |
break | |
unfiltered_chars = self.get_unfiltered_chars(payload, ref_payload, delim, tag, attr) | |
# Common false+ shows only "> as unfiltered if script parses the chars between 2 unrelated delim strs | |
#if unfiltered_chars == '">': | |
# unfiltered_chars = '' | |
all_inj_data[match_start_offset] = [tag_index, tag, attr, attr_val, payload, unfiltered_chars, line] | |
return all_inj_data | |
def get_unfiltered_chars(self, payload, ref_payload, delim, tag, attr): | |
""" | |
Pull out just the unfiltered chars from the reflected chars | |
payload = delim+fuzz+delim+;9 | |
""" | |
ref_chars = ref_payload.replace(delim, '').replace('9', '') | |
fuzz_chars = payload.replace(delim, '').replace('9', '') | |
remove_chars = set([]) | |
unfiltered_chars_list = [] | |
html_entities = [''', '"', '<', '>'] | |
# Remove html encoded entities | |
for entity in html_entities: | |
if entity in ref_chars: | |
ref_chars.replace(entity, '') | |
#If injection is inside script tag, remove the escaped chars | |
if tag == 'script' or attr in self.event_attributes(): | |
ref_chars = ref_chars.replace("\\'", "").replace('\\"', '').replace('\;', '').replace('\\>', '').replace('\\<', '').replace('\\/', '') | |
for c in fuzz_chars: | |
if c in ref_chars: | |
continue | |
else: | |
remove_chars.add(c) | |
for char in fuzz_chars: | |
if char not in remove_chars: | |
unfiltered_chars_list.append(char) | |
unfiltered_chars = ''.join(unfiltered_chars_list) | |
return unfiltered_chars | |
def accurate_attr(self, tag, attrs_attrvals, match, line): | |
''' lxml cannot determine the order of attrs which is important | |
if we're using regex to find the location of the inj so this func | |
finds the accurate attr based on the regex match amongst a list of | |
attrs of the same tag. This is for multi injections in single attr_val | |
as well as multi injections in multiple attrs.''' | |
# Going for process of elimination first | |
copy_attrs_attrvals = attrs_attrvals.copy() | |
for attr in attrs_attrvals: | |
if attr+'=' not in line and attr+' =' not in line: | |
copy_attrs_attrvals.pop(attr, None) | |
# Keep doing research to find the accurate attr | |
if len(copy_attrs_attrvals) > 1: | |
attr_places = {} | |
for attr in copy_attrs_attrvals: | |
attrs_found = [x.end() for x in re.finditer(attr+'\s?=', line)] | |
attr_places[max(attrs_found)] = attr | |
# Get the attribute from the higher numbered key within attr_places | |
closest_attr = attr_places[max(attr_places)] | |
return {closest_attr:copy_attrs_attrvals[closest_attr]} | |
# Found the accurate attr | |
else: | |
for attr in copy_attrs_attrvals: | |
return {attr:copy_attrs_attrvals[attr]} | |
return {None:None} | |
def payloaded_lines(self, body, payload): | |
pl_lines_found = sorted([]) | |
for line in body.splitlines(): | |
if payload in line: | |
pl_lines_found.append(line) | |
return pl_lines_found | |
def make_item(self, meta, resp_url, line, unfiltered, sugg_payloads): | |
''' Create the vuln item ''' | |
item = vuln() | |
if isinstance(line, str): | |
item['line'] = line | |
else: | |
item['line'] = '\n'.join(line) | |
item['xss_payload'] = meta['payload'] | |
item['unfiltered'] = unfiltered | |
item['xss_param'] = meta['xss_param'] | |
item['xss_place'] = meta['xss_place'] | |
item['orig_url'] = meta['orig_url'] | |
item['resp_url'] = resp_url | |
if sugg_payloads: | |
item['sugg_payloads'] = ', '.join(sugg_payloads) | |
if 'POST_to' in meta: | |
item['POST_to'] = meta['POST_to'] | |
# Just make sure one of the options has been set | |
if item['unfiltered']: | |
return item | |
def xpath_inj_points(self, search_str, doc): | |
''' Searches lxml doc for any text, attributes, or comments | |
that reflect the subbed text ''' | |
injections = [] | |
same_tag_attrs = {} | |
# Why this xpath is wrong: perm.ly/dom4j-xpath-contains | |
#text_xss = doc.xpath("//*[contains(text(), '%s')]" % search_str) | |
# Will this find DB error message stuff without a tag? No. It will not. | |
#text_xss = doc.xpath("//*[text()]")#[contains(.,'%s')]]" % search_str) | |
# Return all text nodes that contain the search_str | |
text_xss = doc.xpath("//text()[contains(., '%s')]" % search_str) | |
# Returns parent elements //* with any attribute [@* that | |
# contains a value with search_str in it [contains(., '%s')] | |
attr_xss = doc.xpath("//*[@*[contains(., '%s')]]" % search_str) | |
# Return any comment that contains certain text | |
comment_xss = doc.xpath("//comment()[contains(., '%s')]" % search_str) | |
attr_inj = self.parse_attr_xpath(attr_xss, search_str, doc) | |
comm_inj = self.parse_comm_xpath(comment_xss, search_str, doc) | |
text_inj = self.parse_text_xpath(text_xss, search_str, doc) | |
injects = [attr_inj, comm_inj, text_inj] | |
for i in injects: | |
# i = attr_inj or comm_inj or text_inj list | |
for x in i: | |
# x = (tag_index, tag, attr, attr_val) | |
tag_index = x[0] | |
tag = x[1] | |
attr = x[2] | |
attr_val = x[3] | |
# Sometimes <script> tags don't appear within the doc?? Dunno why, maybe some weird ajax shit | |
# But if we remove them then we lose the matchups between lxml and regex so we must keep them | |
# Just make them useless by entering empty tag and putting them at the end of the lxml matches | |
# so a split at tag won't find anything | |
if not tag_index: | |
print ' '*36+'ERROR: Error: could not find tag index location. Element does not exist in root doc.' | |
tag_index = 999999999 | |
tag = '' | |
loc_tag = (tag_index, tag) | |
attr_attrval = {attr:attr_val} | |
# combine the multi attr injections into a dict otherwise | |
# just add the {attr, attr_val} injection as the second item | |
if attr: | |
if loc_tag in same_tag_attrs: | |
same_tag_attrs[loc_tag][attr] = attr_val | |
else: | |
same_tag_attrs[loc_tag] = attr_attrval | |
injections.append((loc_tag, attr_attrval)) | |
# Final injections list should be like: | |
# injections = [(tag_index, tag), {attribute1:attribute_value, attribute2:attribute_value}] | |
# with variables amount of attributes with injected values | |
for idx, i in enumerate(injections): | |
loc_tag = i[0] | |
if loc_tag in same_tag_attrs: | |
injections[idx] = (loc_tag, same_tag_attrs[loc_tag]) | |
injections = sorted(injections) | |
if len(injections) > 0: | |
return injections | |
def get_elem_position(self, elem, doc): | |
''' Iterate through all elements in doc | |
and match them up against the element found | |
during xpathing ''' | |
order = 0 | |
for i in doc.iter(): | |
order += 1 | |
if i == elem: | |
return order | |
def parse_attr_xpath(self, xpath, search_str, doc): | |
''' Find all tags with attributes that contain the subbed str ''' | |
attr_inj = [] | |
for x in xpath: | |
# x = http://lxml.de/api/lxml.etree._Element-class.html | |
tag_index = self.get_elem_position(x, doc) | |
tag = x.tag | |
items = x.items() | |
for i in items: | |
#i = (attr, attr_val) | |
attr = i[0] | |
attr_val = i[1] | |
found = re.findall(search_str, attr_val) | |
for f in found: | |
attr_inj.append((tag_index, tag, attr, attr_val)) | |
return attr_inj | |
def parse_comm_xpath(self, xpath, search_str, doc): | |
''' Parse the xpath comment search findings ''' | |
comm_inj = [] | |
for x in xpath: | |
parent = x.getparent() | |
text = x.text | |
found = re.findall(search_str, text) | |
for f in found: | |
tag_index = self.get_elem_position(x, doc) | |
# Set this tag so when we split the final line | |
# We split it at '<' + tag so that gives us | |
# a split at <!-- | |
tag = '!--' | |
comm_inj.append((tag_index, tag, None, None)) | |
return comm_inj | |
def parse_text_xpath(self, xpath, search_str, doc): | |
''' Creates injection points for the xpath that finds the payload in any html enclosed text ''' | |
text_inj = [] | |
for x in xpath: | |
parent = x.getparent() | |
# In case parent is a Comment() | |
while callable(parent.tag): | |
parent = parent.getparent() | |
tag = parent.tag | |
tag_index = self.get_elem_position(parent, doc) | |
found = re.findall(search_str, x.strip()) | |
for f in found: | |
text_inj.append((tag_index, tag, None, None)) | |
return text_inj | |
def unescape_payload(self, payload): | |
''' Unescape the various payload encodings (html and url encodings)''' | |
if '%' in payload: | |
payload = urllib.unquote_plus(payload) | |
#if '%' in payload: # in case we ever add double url encoding like %2522 for dub quote | |
# payload = urllib.unquote_plus(payload) | |
# only html-encoded payloads will have & in them | |
payload = HTMLParser.HTMLParser().unescape(payload) | |
return payload | |
def get_reflected_chars(self, tag, attr, ref_payload, delim, body, match_end_offset): | |
''' Check for the special chars and append them to a master list of tuples, one tuple per injection point | |
Always returns a string ''' | |
returned_chars = ref_payload.replace(delim, '').replace('9','') | |
return returned_chars | |
def url_item_filtering(self, item, spider): | |
''' Make sure we're not just repeating the same URL XSS over and over ''' | |
if item['xss_place'] == 'url': | |
if len(self.url_param_xss_items) > 0: | |
for i in self.url_param_xss_items: | |
# If the injection param, the url up until the injected param and the payload | |
# are all the same as a previous item, then don't bother creating the item | |
# Match tags where injection point was found | |
if item['xss_param'] == i['xss_param']: | |
# Match the URL up until the params | |
if item['orig_url'].split('?', 1)[0] == i['orig_url'].split('?', 1)[0]: | |
# Match the payload | |
if item['xss_payload'] == i['xss_payload']: | |
# Match the unfiltered characters | |
if item['unfiltered'] == i['unfiltered']: | |
raise DropItem('Duplicate item found: %s' % item['orig_url']) | |
self.url_param_xss_items.append(item) | |
return item | |
def event_attributes(self): | |
''' HTML tag attributes that allow javascript TAKEN OUT AT THE MOMENT''' | |
event_attributes = ['onafterprint', 'onbeforeprint', 'onbeforeunload', 'onerror', | |
'onhaschange', 'onload', 'onmessage', 'onoffline', 'ononline', | |
'onpagehide', 'onpageshow', 'onpopstate', 'onredo', 'onresize', | |
'onstorage', 'onundo', 'onunload', 'onblur', 'onchange', | |
'oncontextmenu', 'onfocus', 'onformchange', 'onforminput', | |
'oninput', 'oninvalid', 'onreset', 'onselect', 'onsubmit', | |
'onkeydown', 'onkeypress', 'onkeyup', 'onclick', 'ondblclick', | |
'ondrag', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover', | |
'ondragstart', 'ondrop', 'onmousedown', 'onmousemove', | |
'onmouseout', 'onmouseover', 'onmouseup', 'onmousewheel', | |
'onscroll', 'onabort', 'oncanplay', 'oncanplaythrough', | |
'ondurationchange', 'onemptied', 'onended', 'onerror', | |
'onloadeddata', 'onloadedmetadata', 'onloadstart', 'onpause', | |
'onplay', 'onplaying', 'onprogress', 'onratechange', | |
'onreadystatechange', 'onseeked', 'onseeking', 'onstalled', | |
'onsuspend', 'ontimeupdate', 'onvolumechange', 'onwaiting'] | |
return event_attributes | |
def write_to_file(self, item, spider): | |
with open(self.filename, 'a+') as f: | |
f.write('\n') | |
f.write('URL: '+item['orig_url']+'\n') | |
spider.log(' URL: '+item['orig_url'], level=INFO) | |
f.write('response URL: '+item['resp_url']+'\n') | |
spider.log(' response URL: '+item['resp_url'], level=INFO) | |
if 'POST_to' in item: | |
f.write('POST url: '+item['POST_to']+'\n') | |
spider.log(' POST url: '+item['POST_to'], level=INFO) | |
f.write('Unfiltered: '+item['unfiltered']+'\n') | |
spider.log(' Unfiltered: '+item['unfiltered'], level=INFO) | |
f.write('Payload: '+item['xss_payload']+'\n') | |
spider.log(' Payload: '+item['xss_payload'], level=INFO) | |
f.write('Type: '+item['xss_place']+'\n') | |
spider.log(' Type: '+item['xss_place'], level=INFO) | |
f.write('Injection point: '+item['xss_param']+'\n') | |
spider.log(' Injection point: '+item['xss_param'], level=INFO) | |
if 'sugg_payloads' in item: | |
f.write('Possible payloads: '+item['sugg_payloads']+'\n') | |
spider.log(' Possible payloads: '+item['sugg_payloads'], level=INFO) | |
# Cut off the line at 500 | |
#f.write('Line: '+item['line'][-500:]+'\n') | |
f.write('Line: '+item['line']+'\n') | |
spider.log(' Line: '+item['line'], level=INFO) | |
if 'error' in item: | |
f.write('Error: '+item['error']+'\n') | |
spider.log(' Error: '+item['error'], level=INFO) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment