Skip to content

Instantly share code, notes, and snippets.

Last active September 18, 2018 19:58
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save fahadysf/466fcdb60a5ccc440b869b67e647d4ea to your computer and use it in GitHub Desktop.
Find strings for APP-ID from PCAP files
Copyright (C) 2018
Authors: Fahad Yousuf, Lorenzo Castelletti
This script parses a PCAP file and looks for common data patterns in TCP payloads.
These can then be used as signatures while building a custom App-ID.
It works for captures with multiple sessions of the same unknown-tcp traffic.
Known limitations:
- TCP only.
- Only the first payload in each TCP flow is considered for matching.
- Intended to be used with the unknown-req-tcp-payload context, but strings can always be converted to ASCII.
- Manual reordering is necessary if packets are out of order.
import dpkt
import binascii
import string
# Capture file that the parsing loop opens.
filename = 'http.cap'

# Work lists: payload lengths to iterate over, the payloads currently being
# compared, and the list handed to long_substr() for the global comparison.
lengths, tomatch, globalmatch = [], [], []

# Maps a hex-encoded payload to its length in bytes.
strings = {}

# Stays 1 unless the per-length matching clears it; gates the global search.
complete = 1

# Packet counters reported at the end, and the sequence number remembered
# from the most recent SYN (without ACK) segment.
tcpcounter = ipcounter = seq = 0
# Functions that calculate the longest match given a set of strings. Thank you Stackoverflow :)
def long_substr(data):
    """Return the longest substring shared by every string in ``data``.

    Candidates are taken from ``data[0]``. When several common substrings
    have the same maximal length, the one starting earliest in ``data[0]``
    is returned.

    Args:
        data: Sequence of strings to compare.

    Returns:
        The longest common substring, or ``''`` when ``data`` holds fewer
        than two strings, when ``data[0]`` is empty, or when nothing is
        shared.
    """
    if len(data) < 2 or not data[0]:
        return ''

    reference = data[0]
    others = data[1:]
    best = ''
    for start in range(len(reference)):
        # Only candidates strictly longer than the current best can improve it.
        for end in range(start + len(best) + 1, len(reference) + 1):
            candidate = reference[start:end]
            if not all(candidate in other for other in others):
                # A longer slice from the same start contains this one, so it
                # cannot be common either.
                break
            best = candidate
    return best
def is_substr(find, data):
    """Tell whether ``find`` occurs in every string of ``data``.

    Args:
        find: Substring to look for.
        data: Sequence of strings to search.

    Returns:
        ``False`` when both ``find`` and ``data`` are empty, or when at least
        one element of ``data`` does not contain ``find``; ``True`` otherwise
        (including an empty ``data`` with a non-empty ``find``).
    """
    if not data and not find:
        return False
    return all(find in item for item in data)
def hex_to_ascii(hexstr):
    """Render a hex string as printable ASCII text.

    Each decoded byte that is not in ``string.printable`` is shown as ``'.'``.
    An odd number of hex digits is left-padded with ``'0'`` before decoding.
    Leading and trailing whitespace is stripped from the result.

    Args:
        hexstr: Hex digits as text, or as bytes such as the value returned
            by ``binascii.hexlify``.

    Returns:
        The decoded text with non-printable bytes replaced by dots.

    Raises:
        binascii.Error or TypeError: if ``hexstr`` contains non-hex characters
            (the exact type depends on the Python version).
    """
    # binascii.hexlify() yields bytes on Python 3; normalise to text so the
    # padding below works on either major version.
    if isinstance(hexstr, (bytes, bytearray)) and not isinstance(hexstr, str):
        hexstr = hexstr.decode("ascii")
    if len(hexstr) % 2 == 1:
        hexstr = "0" + hexstr
    # bytearray iterates as integers on both Python 2 and Python 3.
    raw = bytearray(binascii.unhexlify(hexstr))
    output = "".join(
        chr(byte) if chr(byte) in string.printable else "." for byte in raw
    )
    return output.strip()
# PCAP parsing starts here
# Walk every (timestamp, raw frame) pair in the capture named by `filename`
# and record candidate TCP payloads in `strings`.
# NOTE(review): this copy of the script has lost its indentation and several
# fragments (flagged below); nesting and the missing expressions must be
# restored from the original before this section can run.
for ts, pkt in dpkt.pcap.Reader(open(filename, 'rb')):
# Decode the raw frame as an Ethernet frame.
eth = dpkt.ethernet.Ethernet(pkt)
# NOTE(review): this message reads like the handler for a failed decode, but
# no try/except is visible here -- presumably lost; confirm.
print("Could not process frame at timestamp: %s" % str(ts))
# NOTE(review): right-hand side is missing in this copy -- presumably the
# payload of `eth`; confirm.
ip =
# Only IP packets that carry a TCP segment are examined.
# NOTE(review): the first argument of isinstance() is missing in this copy.
if (type(ip)== dpkt.ip.IP) and isinstance(, dpkt.tcp.TCP):
# NOTE(review): right-hand side is missing in this copy -- presumably the
# payload of `ip`; confirm.
tcp =
# Save sequence number of the first segment in TCP flow
# (SYN flag set and ACK flag clear).
if ( tcp.flags & dpkt.tcp.TH_SYN ) != 0 and ( tcp.flags & dpkt.tcp.TH_ACK ) == 0:
seq = tcp.seq
# Hex-encode the payload.
# NOTE(review): the argument and closing parenthesis are missing in this copy.
payload = binascii.hexlify(
# hexlify emits two hex digits per byte, so this is the payload size in bytes.
# `/` is integer division on Python 2 only.
length = len(payload)/2
# If payload is first in the flow, grab length and actual payload in HEX for comparison
# Payloads of 13 bytes or fewer are skipped, as are segments whose sequence
# number is less than the remembered `seq` plus 5.
if length > 13 and tcp.seq >= seq + 5:
# NOTE(review): no statement that adds `length` to `lengths` is visible after
# this test -- presumably lost; confirm.
if length not in lengths:
# Key is the hex payload, value is its length in bytes.
strings[payload] = length
tcpcounter += 1
# Show the first 64 hex digits (32 bytes) of the payload and their ASCII form.
print("TS %s: HEX: %s | ASCII: %s" % (str(ts), payload[:64], hex_to_ascii(payload[:64])))
# NOTE(review): reported later as the number of analysed IP packets; whether
# it sits inside the IP/TCP test cannot be told without indentation.
ipcounter += 1
# HEX strings matching starts here
# For each recorded payload length, look for the longest hex substring shared
# by the payloads gathered in `tomatch`, then attempt a global match.
# NOTE(review): indentation is lost in this copy, and the branch structure
# (else/elif) and list appends appear to be missing; the notes below mark what
# cannot be confirmed from the visible code.
for n in lengths:
# NOTE(review): dict.iteritems() exists on Python 2 only.
for index, value in strings.iteritems():
# NOTE(review): the body of this test is not visible -- presumably it adds the
# payload (`index`) to `tomatch`; confirm.
if value == n:
# Longest substring common to every payload in `tomatch`.
match = long_substr(tomatch)
if match != '':
# Only the first 64 hex digits (32 bytes) of the match are shown.
print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, match[:64], hex_to_ascii(match[:64])) )
# Two hex digits per byte: a match under 7 bytes clears `complete` and all
# payloads are listed instead.
if len(match)/2 < 7:
complete = 0
print('This substring is shorter than 7 bytes. See below for all payloads')
for x in tomatch:
print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )
# A single payload of this length is reported as its own common substring.
# NOTE(review): presumably an alternative branch of `if match != ''` above
# (long_substr returns '' for a single string); confirm.
if len(tomatch) == 1:
print('Common substring for segments with payload of %d bytes: %s | ASCII: %s' % (n, tomatch[0][:64], hex_to_ascii(tomatch[0][:64])))
# NOTE(review): the statements below read like a final "no match" branch, but
# no else is visible; confirm.
complete = 0
print('No common substring for segments with payload of %d bytes. See below for all payloads' % n)
for x in tomatch:
print( "%s | ASCII: %s" % (x[:64], hex_to_ascii(x[:64])) )
# Reset the work list.
tomatch = []
# The global search runs only while `complete` is still 1.
# NOTE(review): nothing visible adds entries to `globalmatch`; confirm against
# the original.
if complete == 1:
match = long_substr(globalmatch)
print('*** Global common substring: %s ***' % match[:64])
if len(match) / 2 < 7:
print('This string is shorter than 7 bytes. Use all previously found strings.')
# NOTE(review): presumably the alternative when no global match exists; the
# guarding else is not visible.
print('*** No global common substring found ***')
# Summary counters.
print('Total number of TCP segments analysed for matches: %d' % tcpcounter)
print('Total number of analysed IP packets: %d' % ipcounter)
# The 64-hex-digit slices used in the prints above correspond to 32 bytes.
print('Displaying the first 32 bytes only.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment