Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import sys
import re
import os
# Quoted strings must start with this prefix to be extracted; it is taken
# from the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback.
    sys.exit("usage: %s PREFIX" % sys.argv[0])
prefix = sys.argv[1]
print("Extract:" + prefix)
def lreplace(pattern, sub, string):
    """
    Replaces 'pattern' in 'string' with 'sub' if 'pattern' starts 'string'.

    'pattern' is interpolated into a regular expression, so regex
    metacharacters keep their special meaning. It is wrapped in a
    non-capturing group so the start anchor applies to the whole pattern:
    without the group, an alternation such as 'a|b' would anchor only 'a'
    and replace 'b' wherever it occurs. The group is non-capturing, so
    backreferences in 'sub' keep their numbering.

    Returns 'string' unchanged when it does not start with a match.
    """
    return re.sub('^(?:%s)' % pattern, sub, string)
def parseLine(line):
    """
    Returns the quote-delimited pieces of 'line' that start with 'prefix'.

    The line is split once on double quotes and once on single quotes; the
    pieces of both splits are scanned in that order, and those beginning
    with the module-level 'prefix' are kept.
    """
    pieces = [*line.split('"'), *line.split("'")]
    return [piece for piece in pieces if piece.startswith(prefix)]
def _extract_file(infile, outfile, encoding=None):
    """
    Writes the unique prefixed strings found in 'infile' to 'outfile'.

    'infile' is read with 'encoding' (None selects the platform default).
    Every line containing 'prefix' is passed to parseLine(); the collected
    strings are de-duplicated, sorted, and written to 'outfile' as UTF-8,
    one per line.

    Raises UnicodeDecodeError if 'infile' cannot be decoded with 'encoding'.
    """
    with open(infile, encoding=encoding) as inf, \
            open(outfile, "w", encoding='utf8') as outf:
        # Cheap substring test first so only candidate lines get split.
        found = {
            attr
            for line in inf
            if prefix in line
            for attr in parseLine(line)
        }
        # The whole input is read before anything is written, so a decode
        # error leaves the output empty rather than half-written.
        outf.writelines(attr + "\n" for attr in sorted(found))


# Process every regular file in the current directory whose name starts
# with "html.html"; the output name replaces that marker with "output.txt".
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
    if f.startswith("html.html"):
        infile = f
        outfile = f.replace("html.html", "output.txt")
        try:
            _extract_file(infile, outfile, encoding='utf8')
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with the platform default encoding.
            _extract_file(infile, outfile)
print("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment