# ThoundsN/snapshot2path.py

## snapshot2path.py
import re
import sys
import os

# Substrings that mark a snapshot line as noise; mightBeGarbage() rejects
# any line containing one of them.
rubbish_list = [
    "access-control-allow-credentials",
    "{",
    "}",
    "chrome",
    "data:image/png;base64",
    "<a",
    "zendesk",
]

def isUrl(line):
    """Return True if *line* looks like it contains a URL path segment.

    A line qualifies when it contains two '/' characters separated only by
    zero or more characters from ``[a-z0-9_-]`` (for example ``/api/``).
    The segment may be empty, so the ``//`` in ``https://`` also qualifies.
    Uppercase letters are not part of the accepted set.

    Always returns a real ``bool`` (never ``None``).
    """
    # Cheap rejection before running the regex.
    if '/' not in line:
        return False
    return re.search(r'/[a-z0-9_-]*/', line) is not None

def mightBeGarbage(line):
    """Return True if *line* should be discarded as noise.

    A line is treated as garbage when either:

    * it contains a '.' immediately followed by one of the listed
      static-asset / media tokens (png, jpg, css, js, ...). The match is
      not anchored, so it fires anywhere in the line and also on longer
      suffixes that merely start with a token (``.json`` matches via
      ``.js``); or
    * it contains any of the substrings in the module-level
      ``rubbish_list``.
    """
    # Raw string: "\." in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    if re.search(r"\.(png|jpg|jpeg|gif|svg|bmp|ttf|avif|wav|mp4|aac|ajax|css|all|woff|js)", line):
        return True
    return any(word in line for word in rubbish_list)

if __name__ == '__main__':
    # Usage: snapshot2path.py <snapshot_infile> <pathlist_outfile>
    # Fail with a readable message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <snapshot_infile> <pathlist_outfile>".format(sys.argv[0]))
    snapshot_infile = sys.argv[1]
    pathlist_outfile = sys.argv[2]

    # The output file is created next to the snapshot: its name is appended
    # to the directory part of the snapshot's resolved path.
    working_dir = os.sep.join(os.path.realpath(snapshot_infile).split(os.sep)[:-1])
    pathlist_outfile = working_dir + os.sep + pathlist_outfile

    # A set drops duplicate lines.
    results = set()
    with open(snapshot_infile, 'r') as r:
        for line in r:
            if isUrl(line) and not mightBeGarbage(line):
                # A final line without a trailing newline would otherwise be
                # glued to whichever entry happens to be written after it.
                if not line.endswith('\n'):
                    line += '\n'
                results.add(line)

    with open(pathlist_outfile, 'w') as w:
        # Set iteration order is arbitrary; sort so repeated runs over the
        # same snapshot produce identical output.
        w.writelines(sorted(results))
	import re
	import sys
	import os

	# Substrings that mark a snapshot line as noise; mightBeGarbage() rejects
	# any line containing one of them.
	rubbish_list = [
		"access-control-allow-credentials",
		"{",
		"}",
		"chrome",
		"data:image/png;base64",
		"<a",
		"zendesk",
	]

	def isUrl(line):
		"""Return True if *line* looks like it contains a URL path segment.

		A line qualifies when it contains two '/' characters separated only
		by zero or more characters from ``[a-z0-9_-]`` (for example
		``/api/``). The segment may be empty, so the ``//`` in ``https://``
		also qualifies. Uppercase letters are not part of the accepted set.

		Always returns a real ``bool`` (never ``None``).
		"""
		# Cheap rejection before running the regex.
		if '/' not in line:
			return False
		return re.search(r'/[a-z0-9_-]*/', line) is not None

	def mightBeGarbage(line):
		"""Return True if *line* should be discarded as noise.

		A line is treated as garbage when either:

		* it contains a '.' immediately followed by one of the listed
		  static-asset / media tokens (png, jpg, css, js, ...). The match
		  is not anchored, so it fires anywhere in the line and also on
		  longer suffixes that merely start with a token (``.json`` matches
		  via ``.js``); or
		* it contains any of the substrings in the module-level
		  ``rubbish_list``.
		"""
		# The alternation must use bare "|": an escaped "\|" is a literal
		# pipe character, which makes the pattern match only the whole
		# "png|jpg|..." text and therefore never a real extension.
		if re.search(r"\.(png|jpg|jpeg|gif|svg|bmp|ttf|avif|wav|mp4|aac|ajax|css|all|woff|js)", line):
			return True
		return any(word in line for word in rubbish_list)

	if __name__ == '__main__':
		# Usage: snapshot2path.py <snapshot_infile> <pathlist_outfile>
		# Fail with a readable message instead of a bare IndexError.
		if len(sys.argv) < 3:
			sys.exit("usage: {} <snapshot_infile> <pathlist_outfile>".format(sys.argv[0]))
		snapshot_infile = sys.argv[1]
		pathlist_outfile = sys.argv[2]

		# The output file is created next to the snapshot: its name is
		# appended to the directory part of the snapshot's resolved path.
		working_dir = os.sep.join(os.path.realpath(snapshot_infile).split(os.sep)[:-1])
		pathlist_outfile = working_dir + os.sep + pathlist_outfile

		# A set drops duplicate lines.
		results = set()
		with open(snapshot_infile, 'r') as r:
			for line in r:
				if isUrl(line) and not mightBeGarbage(line):
					# A final line without a trailing newline would otherwise
					# be glued to whichever entry is written after it.
					if not line.endswith('\n'):
						line += '\n'
					results.add(line)

		with open(pathlist_outfile, 'w') as w:
			# Set iteration order is arbitrary; sort so repeated runs over
			# the same snapshot produce identical output.
			w.writelines(sorted(results))