Skip to content

Instantly share code, notes, and snippets.

@NoxMortem
Created May 19, 2016
Embed
What would you like to do?
A tiny Python script that scans an input file in the current folder for strings enclosed in double ("") or single ('') quotes and extracts those that start with a given prefix. Example usage: python parse.py "html.html" "attr_"
import sys
import re
import os
# Command-line arguments: the input file name, then the prefix that the
# extracted strings have to start with.
inputfile, prefix = sys.argv[1], sys.argv[2]
print(f"Extract:{prefix}")
def lreplace(pattern, sub, string):
    """
    Replaces 'pattern' in 'string' with 'sub' if 'pattern' starts 'string'.

    'pattern' is interpreted as a regular expression. It is wrapped in a
    non-capturing group so that the start-of-string anchor applies to the
    whole pattern; without the group, a top-level alternation such as
    'a|b' would only anchor 'a' and 'b' could match anywhere in 'string'.
    The group is non-capturing, so group numbers used in 'sub' are not
    shifted.

    Returns 'string' unchanged when 'pattern' does not match at its start.
    """
    return re.sub('^(?:%s)' % pattern, sub, string)
def parseLine(line):
    """
    Returns the quoted strings in 'line' that start with the global 'prefix'.

    The line is scanned twice, once for double-quoted and once for
    single-quoted text; double-quoted matches come first in the result.
    Text outside of quotes is ignored even when it starts with the prefix.
    """
    matches = []
    for quote in ('"', "'"):
        # After splitting on the quote character, the odd-indexed pieces are
        # the ones that follow an opening quote, i.e. the quoted text itself.
        quoted = line.split(quote)[1::2]
        matches.extend(part for part in quoted if part.startswith(prefix))
    return matches
def _collect_matches(path, encoding=None):
    """
    Returns the set of prefixed strings that parseLine() finds in 'path'.

    The file is opened in text mode with 'encoding' (None means the
    platform default). Lines that do not contain the global 'prefix' at all
    are skipped without being parsed. A UnicodeDecodeError raised while
    reading is left to the caller.
    """
    with open(path, encoding=encoding) as source:
        return {
            attr
            for line in source
            if prefix in line
            for attr in parseLine(line)
        }


outfile = "output.txt"
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for infile in files:
    # The input name given on the command line is matched as a file-name
    # prefix, so several files may qualify; each one rewrites the output.
    if not infile.startswith(inputfile):
        continue
    try:
        matches = _collect_matches(infile, encoding='utf8')
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with the platform's default encoding.
        matches = _collect_matches(infile)
    # One match per line, de-duplicated and in sorted order.
    with open(outfile, "w", encoding='utf8') as outf:
        outf.writelines(attr + "\n" for attr in sorted(matches))
print("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment