Skip to content

Instantly share code, notes, and snippets.

@alpiepho
Created October 1, 2017 21:58
Show Gist options
  • Save alpiepho/64bdfaea76a5b872c23aa68e4df37fc9 to your computer and use it in GitHub Desktop.
Save alpiepho/64bdfaea76a5b872c23aa68e4df37fc9 to your computer and use it in GitHub Desktop.
Python - tool to parse THE ONE FILE and pull out data as a CSV file
#!/usr/bin/python
import getopt
import os
import sys
def Usage():
    """Print the command-line synopsis and the two file options to stdout."""
    helpLines = (
        "Usage: %s -i <file> -o <file>" % sys.argv[0],
        " -i <file> Input HTM file",
        " -o <file> Output CSV file",
    )
    for text in helpLines:
        print(text)
def getTDStr(contents, index, startTag):
    """Extract the text of the first table cell opened by *startTag*.

    Searches *contents* from *index* for *startTag*, skips to the ``>`` that
    ends that opening tag, and takes everything up to the next ``</td>``.

    Args:
        contents: The whole HTML document as one string.
        index: Offset at which to start looking for *startTag*.
        startTag: Leading text of the cell's opening tag; it does not need
            to include the closing ``>``.

    Returns:
        A ``(closeIndex, text)`` tuple: the offset of the cell's ``</td>``
        and the text between the opening tag and that ``</td>``.

    Raises:
        ValueError: If the opening tag, its ``>`` or the closing ``</td>``
            cannot be found. Returning -1 here would send the caller's scan
            position back to the start of the document.
    """
    tagStart = contents.find(startTag, index)
    if tagStart < 0:
        raise ValueError(
            "start tag %r not found at or after offset %d" % (startTag, index))

    tagEnd = contents.find(TAG_CLOSE, tagStart)
    if tagEnd < 0:
        raise ValueError(
            "opening tag at offset %d is never closed with %r"
            % (tagStart, TAG_CLOSE))

    closeIndex = contents.find(TD_CLOSE, tagEnd)
    if closeIndex < 0:
        raise ValueError(
            "no %r found after the cell opened at offset %d"
            % (TD_CLOSE, tagStart))

    # The cell text starts right after the '>' of the opening tag.
    return closeIndex, contents[tagEnd + 1:closeIndex]
def outputString(outFp, str):
    """Write *str* to *outFp* as one double-quoted CSV field on its own line.

    Double quotes inside the value are doubled so the quoted field stays
    well-formed for CSV readers.

    Args:
        outFp: Writable file-like object; receives a single ``write`` call.
        str: The field text. The name shadows the builtin but is kept
            because it is part of the existing signature.
    """
    outFp.write('"' + str.replace('"', '""') + '"\n')
def outputList(outFp, list):
    """Write the items of *list* to *outFp* as one quoted CSV row.

    Every item is wrapped in double quotes (embedded double quotes are
    doubled), the fields are joined with commas and a newline is appended.
    An empty list produces a line holding only the newline.

    Args:
        outFp: Writable file-like object; receives a single ``write`` call.
        list: Sequence of field strings. It is left unmodified. The name
            shadows the builtin but is kept because it is part of the
            existing signature.
    """
    # Build a new list rather than rewriting the caller's items in place;
    # this also avoids the Python 2-only xrange().
    quoted = ['"' + field.replace('"', '""') + '"' for field in list]
    outFp.write(','.join(quoted) + "\n")
# Markers that getTableIndexes() searches for in the input HTML.
TABLE_OPENTAG = '<table id="isi-report">'
TABLE_CLOSETAG = '</table>'
# Only a prefix of the row's opening tag: the literal ends inside the
# class attribute value.
TR_DATA_HEADER = '<tr class="dataHeader'
# Row holding a single empty cell. getGroupInfo() also tests for it and
# writes an empty field when it is found.
BLANK_TRTDTDTR = '<tr><td></td></tr>'
def getTableIndexes(contents):
    """Locate the report table and its data section inside *contents*.

    The closing ``</table>`` and the data header row are searched for
    starting at the report table's opening tag, so markup that appears
    earlier in the document cannot be picked up by mistake.

    Args:
        contents: The whole HTML document as one string.

    Returns:
        A 4-tuple of offsets
        ``(tableOpenIndex, tableCloseIndex, dataOpenIndex, dataCloseIndex)``:
        the report table's opening tag, the first ``</table>`` after it, the
        first data header row after it, and the first blank row after that
        header. Any marker that cannot be found is reported as -1.
    """
    tableOpenIndex = contents.find(TABLE_OPENTAG)
    # If the table itself is missing, fall back to scanning from the start
    # so the remaining offsets are still reported consistently.
    searchFrom = max(tableOpenIndex, 0)
    tableCloseIndex = contents.find(TABLE_CLOSETAG, searchFrom)
    dataOpenIndex = contents.find(TR_DATA_HEADER, searchFrom)
    if dataOpenIndex < 0:
        # find() treats a negative start as an offset from the end of the
        # string, so never feed -1 into the next search.
        dataCloseIndex = -1
    else:
        dataCloseIndex = contents.find(BLANK_TRTDTDTR, dataOpenIndex)
    return tableOpenIndex, tableCloseIndex, dataOpenIndex, dataCloseIndex
# Cell openers that getGroupInfo() looks for. Both are prefixes that stop
# at the attribute's "=" sign.
TD_ISI_GROUP = '<td isi-group='
TD_ISI_GROUP_MEMBER = '<td isi-group-member='
# Delimiters getTDStr() uses to cut the text out of a cell.
TD_CLOSE = '</td>'
TAG_CLOSE = '>'
# NOTE(review): no use of the three span markers is visible in this
# script; confirm before removing them.
SPAN_LABEL = '<span isi-label="">'
SPAN_VALUE = '<span isi-value="">'
SPAN_CLOSE = '</span>'
def getGroupInfo(outFp, contents, startIndex, endIndex):
    """Write the group cells found between two offsets, one per line.

    Walks *contents* one position at a time from *startIndex* up to (not
    including) *endIndex*. A blank row produces an empty field; a group or
    group-member cell produces that cell's text. Each value is written
    with outputString().

    Args:
        outFp: Writable file-like object that receives the output lines.
        contents: The whole HTML document as one string.
        startIndex: Offset at which scanning begins.
        endIndex: Offset at which scanning stops.

    Returns:
        The scan position when the loop ends. It can lie beyond *endIndex*,
        because getTDStr() jumps to the end of the cell it extracted and
        that lookup is not limited by *endIndex*.
    """
    groupTags = (TD_ISI_GROUP, TD_ISI_GROUP_MEMBER)
    pos = startIndex
    while pos < endIndex:
        if contents.startswith(BLANK_TRTDTDTR, pos):
            outputString(outFp, "")
        else:
            for tag in groupTags:
                if contents.startswith(tag, pos):
                    # Continue scanning from the cell's closing tag.
                    pos, cellText = getTDStr(contents, pos, tag)
                    outputString(outFp, cellText)
                    break
        pos += 1
    return pos
# Tags that getDataAndValues() tests at each scan position.
TD_DATA_HEADER = '<td isi-data-column-header='
# NOTE(review): TR_DATA is not referenced by any code visible in this
# script; confirm before removing it.
TR_DATA = '<tr class="data"'
TR_CLOSETAG = '</tr>'
TD_DATA1 = '<td class='
TD_DATAN = '<td>'
def getDataAndValues(outFp, contents, startIndex, endIndex):
    """Write the data section between two offsets as CSV rows.

    Walks *contents* one position at a time from *startIndex* up to (not
    including) *endIndex*. The text of every header or data cell is
    collected into the current row; each ``</tr>`` writes the collected
    row with outputList() (even when it is empty) and starts a new one.
    Cells collected after the last ``</tr>`` are not written.

    Args:
        outFp: Writable file-like object that receives the CSV rows.
        contents: The whole HTML document as one string.
        startIndex: Offset at which scanning begins.
        endIndex: Offset at which scanning stops.

    Returns:
        The scan position when the loop ends, matching what getGroupInfo()
        returns. It can lie beyond *endIndex* because getTDStr() jumps to
        the end of the cell it extracted.
    """
    cellTags = (TD_DATA_HEADER, TD_DATA1, TD_DATAN)
    index = startIndex
    row = []
    while index < endIndex:
        if contents.startswith(TR_CLOSETAG, index):
            outputList(outFp, row)
            row = []
        else:
            for tag in cellTags:
                if contents.startswith(tag, index):
                    # Continue scanning from the cell's closing tag.
                    index, cellText = getTDStr(contents, index, tag)
                    row.append(cellText)
                    break
        index += 1
    return index
def _openRaw(path, mode):
    """Open *path* ('r' or 'w') so file bytes pass through unchanged as str."""
    if sys.version_info[0] < 3:
        # Python 2 str is already a byte string, so binary mode is enough.
        return open(path, mode + 'b')
    # latin-1 maps every byte to exactly one code point and newline=''
    # disables line-ending translation, so reading and then writing is
    # lossless whatever encoding the page really uses.
    return open(path, mode, encoding='latin-1', newline='')


def main():
    """Parse the command line and convert the HTM report to a CSV file.

    Options: ``-i <file>`` input HTM file, ``-o <file>`` output CSV file,
    ``-h`` print usage and exit.

    Exit status: 0 on success or ``-h``; 2 for an invalid or missing option;
    1 when the report table markers cannot be found in the input.
    """
    inFilename = ''
    outFilename = ''
    try:
        # process command arguments
        ouropts, _ = getopt.getopt(sys.argv[1:], "i:o:h")
    except getopt.GetoptError as e:
        print(str(e))
        Usage()
        sys.exit(2)
    for o, a in ouropts:
        if o == '-i':
            inFilename = a
        elif o == '-o':
            outFilename = a
        elif o == '-h':
            Usage()
            sys.exit(0)

    # A missing required option is an error, so exit with the same non-zero
    # status as an invalid option.
    if not inFilename:
        print("please use -i for input HTM log file")
        Usage()
        sys.exit(2)
    if not outFilename:
        print("please use -o for output CSV log file")
        Usage()
        sys.exit(2)

    # Read the input completely before touching the output file, so a bad
    # input path (or using the same path for both) cannot leave a truncated
    # file behind.
    with _openRaw(inFilename, 'r') as inFp:
        contents = inFp.read()

    tableOpenIndex, tableCloseIndex, dataOpenIndex, dataCloseIndex = \
        getTableIndexes(contents)
    # A -1 offset means a marker was not found; scanning with it would start
    # from the wrong place and emit garbage.
    if min(tableOpenIndex, tableCloseIndex,
           dataOpenIndex, dataCloseIndex) < 0:
        print("could not find the report table markers in %s" % inFilename)
        sys.exit(1)

    with _openRaw(outFilename, 'w') as outFp:
        getGroupInfo(outFp, contents, tableOpenIndex, dataOpenIndex)
        getDataAndValues(outFp, contents, dataOpenIndex, dataCloseIndex)
        getGroupInfo(outFp, contents, dataCloseIndex, tableCloseIndex)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment