ryantuck/parse_trufflehog.py

## parse_trufflehog.py
"""
TruffleHog Results Parsing

Designed to operate on trufflehog's JSON output:

    $ trufflehog --json <my_repo> > trufflehog_output.json

Expects a `trufflehog_output.json` file and a `trufflehog_whitelist.yml` file
in the current working directory.

Whitelist config file should look like:

    string_prefixes:
      - not/bad/string
    string_suffixes:
      - endofnotbadstring
    strings:
      - averylongstringthatisnotbad
    paths:
      - path/to/file/containing/bunch/of/high/entropy/strings

By default, outputs offending paths and strings like so:

    /path/number/1
        LONGSTRING
        LONGSTRING2
    /path/number/2
        LONGSTRING2
        LONGSTRING3

The idea here is:

    1. Run with no whitelisted configs etc
    2. Start digging through example diffs to see what you can whitelist
    3. Add strings/paths/prefixes/suffixes to whitelist
    4. Rinse and repeat until you've whittled your output down to bad strings
"""
# pylint: disable=invalid-name
import json
import yaml

# Path of the trufflehog JSON output file opened by read_issues(). It is a
# relative path, so it is resolved against the current working directory.
ISSUES_FILEPATH = 'trufflehog_output.json'
# Path of the YAML whitelist file opened by _get_whitelist_section().
WHITELIST_FILEPATH = 'trufflehog_whitelist.yml'

def read_issues(path=None):
    """
    Load issues from a file holding one JSON document per line.

    Args:
        path: File to read. When omitted or None, the module-level
            ISSUES_FILEPATH is used.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if path is None:
        path = ISSUES_FILEPATH
    with open(path) as f:
        # Iterate the file lazily rather than materialising readlines().
        stripped_rows = (row.strip() for row in f)
        # Blank lines (e.g. a stray newline at end of file) are skipped;
        # json.loads('') would otherwise abort the whole run.
        return [json.loads(row) for row in stripped_rows if row]


def _get_whitelist_section(section):
    """
    Return one section of the whitelist config.

    Args:
        section: Top-level key to look up in the YAML file at
            WHITELIST_FILEPATH (e.g. 'string_prefixes' or 'paths').

    Returns:
        The value stored under `section`, or an empty list when the file is
        empty, the key is absent, or the key has no value.

    Raises:
        OSError: If the whitelist file cannot be opened.
    """
    with open(WHITELIST_FILEPATH) as f:
        # safe_load only builds plain Python objects. Bare yaml.load(f) can
        # construct arbitrary objects from the file and, on PyYAML >= 6,
        # raises TypeError because the Loader argument is mandatory.
        cfg = yaml.safe_load(f)
    # An empty file parses to None, and a key written without entries
    # (`strings:`) maps to None; treat both as "nothing whitelisted".
    return (cfg or {}).get(section) or []


def _parse_strings(strings):
    """
    Normalise a `stringsFound` value to a list.

    Args:
        strings: Either a list or a single str.

    Returns:
        list: `strings` itself (not a copy) when it is a list, otherwise
        `[strings]`.

    Raises:
        TypeError: If `strings` is neither a list nor a str.
    """
    if isinstance(strings, list):
        return strings
    if isinstance(strings, str):
        return [strings]
    # TypeError is still an Exception, so existing `except Exception`
    # handlers keep working, but the failure now explains itself.
    raise TypeError(
        'stringsFound must be a list or str, got '
        f'{type(strings).__name__}: {strings!r}'
    )


def is_offending_string(string, prefixes=None, suffixes=None, strings=None):
    """
    Check to see if string has not been whitelisted.

    Args:
        string: Candidate string to test.
        prefixes: Optional iterable of whitelisted prefixes.
        suffixes: Optional iterable of whitelisted suffixes.
        strings: Optional container of whitelisted exact strings.

    Returns:
        bool: False when `string` starts with a whitelisted prefix, ends with
        a whitelisted suffix, or is contained in `strings`; True otherwise.
    """
    # The whitelist arguments default to None; treat that as "no entries"
    # instead of failing while iterating over None. str.startswith/endswith
    # accept a tuple of candidates, and an empty tuple never matches.
    if string.startswith(tuple(prefixes or ())):
        return False
    if string.endswith(tuple(suffixes or ())):
        return False
    if strings is not None and string in strings:
        return False
    return True


def file_breakdowns():
    """
    Return a dict like:
        {path: {unique strings}}
    For all valid offending files.

    Issues come from read_issues() and the whitelist sections from
    _get_whitelist_section(). Issues whose path is whitelisted are ignored,
    strings rejected by is_offending_string() are dropped, and paths left
    with no strings are omitted.

    Returns:
        dict: Maps each offending path to the set of its non-whitelisted
        strings, with keys inserted in sorted path order.
    """
    data = read_issues()

    # `or []` keeps this function working when a whitelist section exists
    # in the file but has no value (YAML null).
    ok_prefixes = _get_whitelist_section('string_prefixes') or []
    ok_suffixes = _get_whitelist_section('string_suffixes') or []
    ok_strings = _get_whitelist_section('strings') or []
    ok_paths = _get_whitelist_section('paths') or []

    # Group in a single pass over the issues instead of rescanning the whole
    # issue list once per distinct path.
    results = {}
    for issue in data:
        path = issue['path']
        if path in ok_paths:
            continue
        bad_strings = {
            s
            for s in _parse_strings(issue['stringsFound'])
            if is_offending_string(
                string=s,
                prefixes=ok_prefixes,
                suffixes=ok_suffixes,
                strings=ok_strings,
            )
        }
        # Only paths with at least one offending string are reported.
        if bad_strings:
            results.setdefault(path, set()).update(bad_strings)
    return {path: results[path] for path in sorted(results)}


def main():
    """
    Main function to output files and their non-whitelisted strings.

    Prints each path returned by file_breakdowns() on its own line, followed
    by that path's strings, one per line, indented by four spaces.
    """
    for path, strings in file_breakdowns().items():
        print(path)
        # The strings arrive as a set, whose iteration order can change from
        # run to run; sort so the same input always prints the same report.
        for s in sorted(strings, key=str):
            print(f'    {s}')


# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
	"""
	TruffleHog Results Parsing

	Designed to operate on trufflehog's JSON output:

	$ trufflehog --json <my_repo> > trufflehog_output.json

	Expects a `trufflehog_output.json` file and a `trufflehog_whitelist.yml` file
	in the current working directory.

	Whitelist config file should look like:

	string_prefixes:
	- not/bad/string
	string_suffixes:
	- endofnotbadstring
	strings:
	- averylongstringthatisnotbad
	paths:
	- path/to/file/containing/bunch/of/high/entropy/strings

	By default, outputs offending paths and strings like so:

	/path/number/1
	LONGSTRING
	LONGSTRING2
	/path/number/2
	LONGSTRING2
	LONGSTRING3

	The idea here is:

	1. Run with no whitelisted configs etc
	2. Start digging through example diffs to see what you can whitelist
	3. Add strings/paths/prefixes/suffixes to whitelist
	4. Rinse and repeat until you've whittled your output down to bad strings
	"""
	# pylint: disable=invalid-name
	import json
	import yaml

	# Path of the trufflehog JSON output file opened by read_issues(). It is a
	# relative path, so it is resolved against the current working directory.
	ISSUES_FILEPATH = 'trufflehog_output.json'
	# Path of the YAML whitelist file opened by _get_whitelist_section().
	WHITELIST_FILEPATH = 'trufflehog_whitelist.yml'

	def read_issues(path=None):
		"""
		Load issues from a file holding one JSON document per line.

		Args:
			path: File to read. When omitted or None, the module-level
				ISSUES_FILEPATH is used.

		Returns:
			list: One parsed JSON value per non-blank line, in file order.

		Raises:
			OSError: If the file cannot be opened.
			json.JSONDecodeError: If a non-blank line is not valid JSON.
		"""
		if path is None:
			path = ISSUES_FILEPATH
		with open(path) as f:
			# Iterate the file lazily rather than materialising readlines().
			stripped_rows = (row.strip() for row in f)
			# Blank lines (e.g. a stray newline at end of file) are skipped;
			# json.loads('') would otherwise abort the whole run.
			return [json.loads(row) for row in stripped_rows if row]


	def _get_whitelist_section(section):
		"""
		Return one section of the whitelist config.

		Args:
			section: Top-level key to look up in the YAML file at
				WHITELIST_FILEPATH (e.g. 'string_prefixes' or 'paths').

		Returns:
			The value stored under `section`, or an empty list when the file
			is empty, the key is absent, or the key has no value.

		Raises:
			OSError: If the whitelist file cannot be opened.
		"""
		with open(WHITELIST_FILEPATH) as f:
			# safe_load only builds plain Python objects. Bare yaml.load(f)
			# can construct arbitrary objects from the file and, on
			# PyYAML >= 6, raises TypeError because Loader is mandatory.
			cfg = yaml.safe_load(f)
		# An empty file parses to None, and a key written without entries
		# (`strings:`) maps to None; treat both as "nothing whitelisted".
		return (cfg or {}).get(section) or []


	def _parse_strings(strings):
		"""
		Normalise a `stringsFound` value to a list.

		Args:
			strings: Either a list or a single str.

		Returns:
			list: `strings` itself (not a copy) when it is a list, otherwise
			`[strings]`.

		Raises:
			TypeError: If `strings` is neither a list nor a str.
		"""
		if isinstance(strings, list):
			return strings
		if isinstance(strings, str):
			return [strings]
		# TypeError is still an Exception, so existing `except Exception`
		# handlers keep working, but the failure now explains itself.
		raise TypeError(
			'stringsFound must be a list or str, got '
			f'{type(strings).__name__}: {strings!r}'
		)


	def is_offending_string(string, prefixes=None, suffixes=None, strings=None):
		"""
		Check to see if string has not been whitelisted.

		Args:
			string: Candidate string to test.
			prefixes: Optional iterable of whitelisted prefixes.
			suffixes: Optional iterable of whitelisted suffixes.
			strings: Optional container of whitelisted exact strings.

		Returns:
			bool: False when `string` starts with a whitelisted prefix, ends
			with a whitelisted suffix, or is contained in `strings`; True
			otherwise.
		"""
		# The whitelist arguments default to None; treat that as "no entries"
		# instead of failing while iterating over None. An empty tuple passed
		# to str.startswith/endswith never matches.
		if string.startswith(tuple(prefixes or ())):
			return False
		if string.endswith(tuple(suffixes or ())):
			return False
		if strings is not None and string in strings:
			return False
		return True


	def file_breakdowns():
		"""
		Return a dict like:
			{path: {unique strings}}
		For all valid offending files.

		Issues come from read_issues() and the whitelist sections from
		_get_whitelist_section(). Issues whose path is whitelisted are
		ignored, strings rejected by is_offending_string() are dropped, and
		paths left with no strings are omitted.

		Returns:
			dict: Maps each offending path to the set of its non-whitelisted
			strings, with keys inserted in sorted path order.
		"""
		data = read_issues()

		# `or []` keeps this function working when a whitelist section exists
		# in the file but has no value (YAML null).
		ok_prefixes = _get_whitelist_section('string_prefixes') or []
		ok_suffixes = _get_whitelist_section('string_suffixes') or []
		ok_strings = _get_whitelist_section('strings') or []
		ok_paths = _get_whitelist_section('paths') or []

		# Group in a single pass over the issues instead of rescanning the
		# whole issue list once per distinct path.
		results = {}
		for issue in data:
			path = issue['path']
			if path in ok_paths:
				continue
			bad_strings = {
				s
				for s in _parse_strings(issue['stringsFound'])
				if is_offending_string(
					string=s,
					prefixes=ok_prefixes,
					suffixes=ok_suffixes,
					strings=ok_strings,
				)
			}
			# Only paths with at least one offending string are reported.
			if bad_strings:
				results.setdefault(path, set()).update(bad_strings)
		return {path: results[path] for path in sorted(results)}


	def main():
		"""
		Main function to output files and their non-whitelisted strings.

		Prints each path returned by file_breakdowns() on its own line,
		followed by that path's strings, one per line.
		"""
		for path, strings in file_breakdowns().items():
			print(path)
			# The strings arrive as a set, whose iteration order can change
			# from run to run; sort so the same input prints the same report.
			for s in sorted(strings, key=str):
				print(f' {s}')


	# Run the report only when this file is executed as a script, not on import.
	if __name__ == '__main__':
		main()