Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python3
"""
Parse a scans.io archive to find specific HTTP responses.

Positive hits are saved to <timestamp>/<ip>.log.

Usage:
    $ parse-scans.io.py https://scans.io/data/rapid7/sonar.http/20151110-http.gz

Requires Python 3.
Latest Ubuntu requirements: apt-get install python3 python3-ujson python3-requests
"""
import base64
import gzip
import os
import sys

try:
    import ujson as json  # ujson parses these large record streams much faster
except ImportError:
    import json  # stdlib json is a drop-in fallback if ujson is not installed

# Byte signatures that identify the HTTP response bodies we are hunting for.
FINGERPRINTS = (
    b'RegExp("[0-9]{13,16}")',
    b'''jQuery('[id*="cc_ss_issue"]').val()''',
    b'querySelectorAll("input, select, textarea, checkbox"',
)


def timestamp_from_url(url):
    """Return the leading timestamp of the archive file name.

    E.g. '.../20151110-http.gz' -> '20151110'. Used as the output directory.
    """
    return url.split('/')[-1].split('-')[0]


def matching_fingerprint(data, fingerprints=FINGERPRINTS):
    """Return the first fingerprint contained in *data*, or None if no match."""
    for fp in fingerprints:
        if fp in data:
            return fp
    return None


def scan(url):
    """Stream the gzipped archive at *url*, saving matching responses to disk.

    Each line of the decompressed archive is a JSON record with an 'ip' key
    and a base64-encoded 'data' payload (the raw HTTP response).
    """
    # Imported lazily so the pure helpers above stay importable/testable
    # without the third-party requests dependency installed.
    import requests

    ts = timestamp_from_url(url)
    os.makedirs(ts, exist_ok=True)

    lines = 0
    resp = requests.get(url, stream=True)
    try:
        # Decompress on the fly from the raw socket stream; the archive is
        # far too large to hold in memory.
        decompressed = gzip.GzipFile(fileobj=resp.raw)
        for line in decompressed:
            entry = json.loads(line.decode())
            decoded = base64.b64decode(entry['data'])
            lines += 1
            if lines % 10000 == 0:
                print("%10d lines" % lines)
            if matching_fingerprint(decoded) is not None:
                filename = ts + '/' + entry['ip'] + '.log'
                print("\t" + filename)
                with open(filename, 'wb') as f:
                    f.write(decoded)
    finally:
        # Always release the streaming connection (was leaked previously).
        resp.close()


def main():
    """Entry point: validate the command-line argument and run the scan."""
    try:
        url = sys.argv[1]
    except IndexError:  # narrowed from a bare except: only a missing argv[1]
        print("Missing URL as argument")
        sys.exit(1)
    scan(url)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment