Skip to content

Instantly share code, notes, and snippets.

@mastry
Last active November 11, 2015 21:41
Show Gist options
  • Save mastry/8d1c9344344d63d039dd to your computer and use it in GitHub Desktop.
Save mastry/8d1c9344344d63d039dd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# stripxml.py
#
# Usage: stripxml.py <file.xml>
# or
# python stripxml.py <file.xml>
# Prints the text from the specified XML file (all tags are filtered out). The input file is unaltered.
#
import sys
from enum import Enum
class State(Enum):
    """
    States of the tag-stripping parser.

    While scanning the input, the parser is either processing a TAG or
    processing raw TEXT. UNKNOWN is a catch-all value that is not one of
    the two regular parsing states.
    """

    # Currently processing a tag.
    TAG = 0
    # Currently processing raw text.
    TEXT = 1
    # Neither of the regular states.
    UNKNOWN = 100
def handleTAG(character, acc):
    """
    Process one character while in the TAG state.

    A '>' character ends the tag and moves the parser to the TEXT state;
    every other character leaves it in the TAG state.

    Returns a (next state, accumulator) tuple. The accumulator is passed
    through untouched, so characters seen in this state are dropped.
    """
    next_state = State.TEXT if character == '>' else State.TAG
    return next_state, acc
def handleTEXT(character, acc):
    """
    Process one character while in the TEXT state.

    A '<' character starts a tag: the parser moves to the TAG state and a
    newline is appended to the accumulator so that text runs separated by
    tags end up on separate lines. Any other character is appended to the
    accumulator and the parser stays in the TEXT state.

    Returns a (next state, accumulator) tuple.
    """
    if character == '<':
        return State.TAG, acc + "\n"
    # Append the character that was passed in; do not rely on any
    # module-level variable holding the current character.
    return State.TEXT, acc + character
def handleUnknown(character, acc):
    """
    Process one character while in the UNKNOWN state.

    This is a fallback that is not expected to be reached; if it is, the
    character is kept (appended to the accumulator) and the parser stays
    in the UNKNOWN state.

    Returns a (next state, accumulator) tuple.
    """
    extended = acc + character
    return State.UNKNOWN, extended
def proc(state, c, acc):
    """
    Feed a single character to the handler for the current state.

    state -- the current parser state
    c     -- the character to process
    acc   -- the text accumulated so far

    Returns whatever the selected handler returns: a
    (next state, accumulator) tuple.
    """
    # Dispatch table; any state without an entry falls back to handleUnknown.
    handlers = {
        State.TAG: handleTAG,
        State.TEXT: handleTEXT,
    }
    handle = handlers.get(state, handleUnknown)
    return handle(c, acc)
# Script body: strip the tags from the XML file named on the command line
# and print the remaining text, one non-empty line at a time.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("Usage: stripxml.py <file.xml>")

# The parser starts in the TAG state, so nothing is accumulated until the
# first '>' has been seen.
currentState = State.TAG
acc = ""

# The context manager closes the file even if read() raises.
with open(sys.argv[1], 'r', encoding='utf-8') as file:
    xml = file.read()

# Run the state machine over the input one character at a time.
for c in xml:
    currentState, acc = proc(currentState, c, acc)

# Trim surrounding whitespace from each line and drop lines that end up empty.
lines = [s.strip() for s in acc.splitlines()]
output = [l for l in lines if l]
for s in output:
    print(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment