Skip to content

Instantly share code, notes, and snippets.

@ThoundsN
Created June 20, 2021 10:05
Show Gist options
  • Save ThoundsN/8b4a22e6c4b6ca43f22c8b32c92f3e80 to your computer and use it in GitHub Desktop.
Save ThoundsN/8b4a22e6c4b6ca43f22c8b32c92f3e80 to your computer and use it in GitHub Desktop.
From chrome snapshot file to path list
import re
import sys
import os
# Substrings that mark a line as noise: mightBeGarbage() rejects any line
# containing one of these anywhere in its text.
rubbish_list = [
    "access-control-allow-credentials",
    "{",
    "}",
    "chrome",
    "data:image/png;base64",
    "<a",
    "zendesk",
]
def isUrl(line):
    """Report whether *line* contains something shaped like a URL path.

    A line qualifies when it contains a segment enclosed in slashes whose
    characters are all lowercase ASCII letters, digits, ``_`` or ``-``.
    The segment may be empty, so a bare ``//`` also qualifies.

    Args:
        line: One line of text to inspect.

    Returns:
        ``True`` if such a slash-delimited segment is found, ``False``
        otherwise (always a real ``bool``, never ``None``).
    """
    # Cheap substring test first: the pattern cannot match without a slash.
    if '/' not in line:
        return False
    # bool() makes the no-match case an explicit False instead of the
    # implicit None produced by falling off the end of the function.
    return bool(re.search(r'/[a-z0-9_-]*/', line))
def mightBeGarbage(line):
    """Report whether *line* looks like noise rather than a useful path.

    A line is treated as garbage when either:

    * it contains a dot followed by one of the listed asset/extension
      names (``png``, ``jpg``, ``css``, ``js``, ...). The pattern is not
      anchored, so it matches anywhere in the line and also matches longer
      extensions that merely start with a listed one (``.json`` matches
      through ``.js``); or
    * it contains any substring listed in the module-level ``rubbish_list``.

    Args:
        line: One line of text to inspect.

    Returns:
        ``True`` if the line matches either rule, ``False`` otherwise.
    """
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    if re.search(
        r"\.(png|jpg|jpeg|gif|svg|bmp|ttf|avif|wav|mp4|aac|ajax|css|all|woff|js)",
        line,
    ):
        return True
    return any(word in line for word in rubbish_list)
if __name__ == '__main__':
    # Usage: <script> <snapshot_infile> <pathlist_outfile>
    #
    # Reads the snapshot file line by line, keeps the lines that isUrl()
    # accepts and mightBeGarbage() does not reject, and writes the unique
    # survivors to the output file, one per line.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <snapshot_infile> <pathlist_outfile>" % sys.argv[0])

    snapshot_infile = sys.argv[1]

    # The output file is placed in the directory of the (symlink-resolved)
    # input file. os.path.join also leaves an absolute output path intact
    # instead of gluing it onto the working directory.
    working_dir = os.path.dirname(os.path.realpath(snapshot_infile))
    pathlist_outfile = os.path.join(working_dir, sys.argv[2])

    results = set()
    # errors='replace' keeps stray undecodable bytes in the snapshot from
    # aborting the whole run.
    with open(snapshot_infile, 'r', encoding='utf-8', errors='replace') as r:
        for line in r:
            if isUrl(line) and not mightBeGarbage(line):
                # Store entries without their line terminator so a final
                # line lacking a trailing newline deduplicates correctly and
                # cannot fuse with its neighbour in the output.
                results.add(line.rstrip('\n'))

    # Sorted so that repeated runs over the same input produce identical files.
    with open(pathlist_outfile, 'w', encoding='utf-8') as w:
        w.writelines(entry + '\n' for entry in sorted(results))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment