fahadysf/custom-appid.py

## custom-appid.py
#!/usr/local/bin/python2.7

"""
Copyright (C) 2018
Authors: Fahad Yousuf, Lorenzo Castelletti

This script parses a PCAP file and looks for common data patterns in TCP payloads.
These can then be used as signatures while building a custom App-ID.
It works for captures with multiple sessions of the same unknown-tcp traffic.

Known limitations:
-TCP Only.
-Only the first payload in each TCP flow is considered for matching.
-Intended to be used with context unknown-req-tcp-payload, but strings can always be converted to ASCII
-Manual reordering necessary if packets are out of order


"""


import dpkt
import binascii
import string

# Capture file analysed by this script.
filename = 'http.cap'

# Payload bookkeeping filled in while the capture is parsed.
strings = {}                                # hex payload -> payload length in bytes
lengths, tomatch, globalmatch = [], [], []  # distinct lengths / current group / per-group results

# Stays 1 as long as every length group yields a usable common substring.
complete = 1

# Packet counters and the sequence number of the most recent SYN (without ACK).
tcpcounter = ipcounter = seq = 0

# Functions that calculate the longest match given a set of strings. Thank you Stackoverflow :)
def long_substr(data):
    """Return the longest substring shared by every string in ``data``.

    Candidates are taken from ``data[0]``. When several common substrings
    have the same maximal length, the one starting earliest in ``data[0]``
    is returned.

    Args:
        data: Sequence of strings to compare.

    Returns:
        The longest common substring, or ``''`` when ``data`` holds fewer
        than two strings, when ``data[0]`` is empty, or when the strings
        share nothing.
    """
    substr = ''
    if len(data) > 1 and len(data[0]) > 0:
        first = data[0]
        others = data[1:]
        for start in range(len(first)):
            # Only candidates strictly longer than the current best matter.
            for end in range(start + len(substr) + 1, len(first) + 1):
                candidate = first[start:end]
                if not all(candidate in other for other in others):
                    # A candidate missing from some string cannot become
                    # common by growing, so stop extending from this start.
                    break
                substr = candidate
    return substr

def is_substr(find, data):
    """Tell whether ``find`` occurs in every string of ``data``.

    Args:
        find: Candidate substring.
        data: Sequence of strings to search.

    Returns:
        True when ``find`` and ``data`` are both non-empty and every element
        of ``data`` contains ``find``; False otherwise.
    """
    # Degenerate input: an empty needle is "in" every string and an empty
    # collection satisfies the check vacuously, so neither is a real match.
    # (The previous guard used `and`, which let both cases through.)
    if not find or not data:
        return False
    return all(find in item for item in data)

def hex_to_ascii(hexstr):
    """Render a string of hex digits as printable ASCII for display.

    Args:
        hexstr: String of hexadecimal digits. An odd-length string is
            left-padded with ``0`` so that it decodes to whole bytes.

    Returns:
        The decoded bytes as text, with every character outside
        ``string.printable`` replaced by ``.`` and surrounding whitespace
        stripped.
    """
    if len(hexstr) % 2 == 1:
        hexstr = "0" + hexstr
    # binascii.unhexlify exists on Python 2 and 3, unlike str.decode("hex").
    # bytearray yields integers on both, so chr() gives one char per byte.
    raw = bytearray(binascii.unhexlify(hexstr))
    output = "".join(
        chr(byte) if chr(byte) in string.printable else '.' for byte in raw
    )
    return output.strip()

# PCAP parsing starts here
# Walk every frame of the capture and record the TCP payloads to compare.
# The file is opened in a `with` block so the handle is always closed.
with open(filename, 'rb') as capture:
    for ts, pkt in dpkt.pcap.Reader(capture):
        try:
            eth = dpkt.ethernet.Ethernet(pkt)
        except Exception:
            print("Could not process frame at timestamp: %s" % str(ts))
            # Skip the frame: without this, `eth` would be undefined (first
            # frame) or still hold the previously parsed frame.
            continue
        ip = eth.data
        if isinstance(ip, dpkt.ip.IP) and isinstance(ip.data, dpkt.tcp.TCP):
            tcp = ip.data
            # Save sequence number of the first segment in TCP flow
            # (SYN set, ACK clear). `seq` is one module-level value, so it
            # always refers to the most recently seen SYN.
            if (tcp.flags & dpkt.tcp.TH_SYN) != 0 and (tcp.flags & dpkt.tcp.TH_ACK) == 0:
                seq = tcp.seq
            payload = binascii.hexlify(tcp.data).decode()
            # Two hex digits per byte; floor division keeps this an int on
            # both Python 2 and Python 3.
            length = len(payload) // 2
            # Keep payloads longer than 13 bytes whose sequence number is at
            # least 5 past the saved SYN; store the HEX text keyed to its length.
            if length > 13 and tcp.seq >= seq + 5:
                if length not in lengths:
                    lengths.append(length)
                strings[payload] = length
                tcpcounter += 1
                print("TS %s: HEX: %s | ASCII: %s" % (str(ts), payload[:64], hex_to_ascii(payload[:64])))
            # Incremented once per TCP-over-IP packet seen.
            ipcounter += 1


# HEX strings matching starts here
# Compare payloads group by group, one group per distinct payload length.
lengths.sort()
for n in lengths:
    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    for index, value in strings.items():
        if value == n:
            tomatch.append(index)
    match = long_substr(tomatch)

    if match != '':
        print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, match[:64], hex_to_ascii(match[:64])) )
        # `match` is hex text, so two characters make one byte.
        if len(match) // 2 < 7:
            complete = 0
            print('This substring is shorter than 7 bytes. See below for all payloads')
            for x in tomatch:
                print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )
        else:
            globalmatch.append(match)
    else:
        # long_substr() returns '' for a single string, so a lone payload is
        # treated as its own common substring.
        if len(tomatch) == 1:
            print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, tomatch[0][:64], hex_to_ascii(tomatch[0][:64])))
            globalmatch.append(tomatch[0])
        else:
            complete = 0
            print('No common substring for segments with payload of %d bytes. See below for all payloads' % n)
            for x in tomatch:
                print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )

    # Start the next length group with an empty candidate list.
    tomatch = []

# Final report: the substring common to all length groups, plus counters.
if complete == 1:
    # long_substr() returns '' for fewer than two inputs, so a single
    # group's result is reported directly instead of being lost.
    if len(globalmatch) == 1:
        match = globalmatch[0]
    else:
        match = long_substr(globalmatch)
    # print('') emits a blank line on Python 2 and 3; a bare `print` is a
    # no-op expression on Python 3.
    print('')
    print('*** Global common substring: %s ***' %  match[:64])
    # `match` is hex text, so two characters make one byte.
    if len(match) // 2 < 7:
        print('This string is shorter than 7 bytes. Use all previously found strings.')
else:
    print('*** No global common substring found ***')


print('Total number of TCP segments analysed for matches: %d' % tcpcounter)
print('Total number of analysed IP packets: %d' % ipcounter)
print('Displaying the first 32 bytes only.')
	#!/usr/local/bin/python2.7

	"""
	Copyright (C) 2018
	Authors: Fahad Yousuf, Lorenzo Castelletti

	This script parses a PCAP file and looks for common data patterns in TCP payloads.
	These can then be used as signatures while building a custom App-ID.
	It works for captures with multiple sessions of the same unknown-tcp traffic.

	Known limitations:
	-TCP Only.
	-Only the first payload in each TCP flow is considered for matching.
	-Intended to be used with context unknown-req-tcp-payload, but strings can always be converted to ASCII
	-Manual reordering necessary if packets are out of order


	"""


	import dpkt
	import binascii
	import string

	# Capture file analysed by this script.
	filename = 'http.cap'

	# Payload bookkeeping filled in while the capture is parsed.
	strings = {}                                # hex payload -> payload length in bytes
	lengths, tomatch, globalmatch = [], [], []  # distinct lengths / current group / per-group results

	# Stays 1 as long as every length group yields a usable common substring.
	complete = 1

	# Packet counters and the sequence number of the most recent SYN (without ACK).
	tcpcounter = ipcounter = seq = 0

	# Functions that calculate the longest match given a set of strings. Thank you Stackoverflow :)
	def long_substr(data):
	    """Return the longest substring shared by every string in ``data``.

	    Candidates are taken from ``data[0]``. When several common substrings
	    have the same maximal length, the one starting earliest in ``data[0]``
	    is returned.

	    Args:
	        data: Sequence of strings to compare.

	    Returns:
	        The longest common substring, or ``''`` when ``data`` holds fewer
	        than two strings, when ``data[0]`` is empty, or when the strings
	        share nothing.
	    """
	    substr = ''
	    if len(data) > 1 and len(data[0]) > 0:
	        first = data[0]
	        others = data[1:]
	        for start in range(len(first)):
	            # Only candidates strictly longer than the current best matter.
	            for end in range(start + len(substr) + 1, len(first) + 1):
	                candidate = first[start:end]
	                if not all(candidate in other for other in others):
	                    # A candidate missing from some string cannot become
	                    # common by growing, so stop extending from this start.
	                    break
	                substr = candidate
	    return substr

	def is_substr(find, data):
	    """Tell whether ``find`` occurs in every string of ``data``.

	    Args:
	        find: Candidate substring.
	        data: Sequence of strings to search.

	    Returns:
	        True when ``find`` and ``data`` are both non-empty and every element
	        of ``data`` contains ``find``; False otherwise.
	    """
	    # Degenerate input: an empty needle is "in" every string and an empty
	    # collection satisfies the check vacuously, so neither is a real match.
	    # (The previous guard used `and`, which let both cases through.)
	    if not find or not data:
	        return False
	    return all(find in item for item in data)

	def hex_to_ascii(hexstr):
	    """Render a string of hex digits as printable ASCII for display.

	    Args:
	        hexstr: String of hexadecimal digits. An odd-length string is
	            left-padded with ``0`` so that it decodes to whole bytes.

	    Returns:
	        The decoded bytes as text, with every character outside
	        ``string.printable`` replaced by ``.`` and surrounding whitespace
	        stripped.
	    """
	    if len(hexstr) % 2 == 1:
	        hexstr = "0" + hexstr
	    # binascii.unhexlify exists on Python 2 and 3, unlike str.decode("hex").
	    # bytearray yields integers on both, so chr() gives one char per byte.
	    raw = bytearray(binascii.unhexlify(hexstr))
	    output = "".join(
	        chr(byte) if chr(byte) in string.printable else '.' for byte in raw
	    )
	    return output.strip()

	# PCAP parsing starts here
	# Walk every frame of the capture and record the TCP payloads to compare.
	# The file is opened in a `with` block so the handle is always closed.
	with open(filename, 'rb') as capture:
	    for ts, pkt in dpkt.pcap.Reader(capture):
	        try:
	            eth = dpkt.ethernet.Ethernet(pkt)
	        except Exception:
	            print("Could not process frame at timestamp: %s" % str(ts))
	            # Skip the frame: without this, `eth` would be undefined (first
	            # frame) or still hold the previously parsed frame.
	            continue
	        ip = eth.data
	        if isinstance(ip, dpkt.ip.IP) and isinstance(ip.data, dpkt.tcp.TCP):
	            tcp = ip.data
	            # Save sequence number of the first segment in TCP flow
	            # (SYN set, ACK clear). `seq` is one module-level value, so it
	            # always refers to the most recently seen SYN.
	            if (tcp.flags & dpkt.tcp.TH_SYN) != 0 and (tcp.flags & dpkt.tcp.TH_ACK) == 0:
	                seq = tcp.seq
	            payload = binascii.hexlify(tcp.data).decode()
	            # Two hex digits per byte; floor division keeps this an int on
	            # both Python 2 and Python 3.
	            length = len(payload) // 2
	            # Keep payloads longer than 13 bytes whose sequence number is at
	            # least 5 past the saved SYN; store the HEX text keyed to its length.
	            if length > 13 and tcp.seq >= seq + 5:
	                if length not in lengths:
	                    lengths.append(length)
	                strings[payload] = length
	                tcpcounter += 1
	                # Plain "|" separator: "\|" is an invalid escape sequence.
	                print("TS %s: HEX: %s | ASCII: %s" % (str(ts), payload[:64], hex_to_ascii(payload[:64])))
	            # Incremented once per TCP-over-IP packet seen.
	            ipcounter += 1


	# HEX strings matching starts here
	# Compare payloads group by group, one group per distinct payload length.
	# Separators below use a plain "|": "\|" is an invalid escape sequence.
	lengths.sort()
	for n in lengths:
	    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
	    for index, value in strings.items():
	        if value == n:
	            tomatch.append(index)
	    match = long_substr(tomatch)

	    if match != '':
	        print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, match[:64], hex_to_ascii(match[:64])) )
	        # `match` is hex text, so two characters make one byte.
	        if len(match) // 2 < 7:
	            complete = 0
	            print('This substring is shorter than 7 bytes. See below for all payloads')
	            for x in tomatch:
	                print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )
	        else:
	            globalmatch.append(match)
	    else:
	        # long_substr() returns '' for a single string, so a lone payload is
	        # treated as its own common substring.
	        if len(tomatch) == 1:
	            print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, tomatch[0][:64], hex_to_ascii(tomatch[0][:64])))
	            globalmatch.append(tomatch[0])
	        else:
	            complete = 0
	            print('No common substring for segments with payload of %d bytes. See below for all payloads' % n)
	            for x in tomatch:
	                print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )

	    # Start the next length group with an empty candidate list.
	    tomatch = []

	# Final report: the substring common to all length groups, plus counters.
	if complete == 1:
	    # long_substr() returns '' for fewer than two inputs, so a single
	    # group's result is reported directly instead of being lost.
	    if len(globalmatch) == 1:
	        match = globalmatch[0]
	    else:
	        match = long_substr(globalmatch)
	    # print('') emits a blank line on Python 2 and 3; a bare `print` is a
	    # no-op expression on Python 3.
	    print('')
	    print('* Global common substring: %s *' % match[:64])
	    # `match` is hex text, so two characters make one byte.
	    if len(match) // 2 < 7:
	        print('This string is shorter than 7 bytes. Use all previously found strings.')
	else:
	    print('* No global common substring found *')


	print('Total number of TCP segments analysed for matches: %d' % tcpcounter)
	print('Total number of analysed IP packets: %d' % ipcounter)
	print('Displaying the first 32 bytes only.')