Last active
November 11, 2015 21:41
-
-
Save mastry/8d1c9344344d63d039dd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# stripxml.py | |
# | |
# Usage: stripxml.py <xml-file>
# or
# python stripxml.py <xml-file>
# Prints the text from the specified XML file (all tags are filtered out). The input file is unaltered. | |
# | |
import sys | |
from enum import Enum | |
class State(Enum):
    """
    States of the tag-stripping parser.

    While scanning the input the parser is either inside markup (TAG) or
    inside character data between tags (TEXT). UNKNOWN is a fallback value
    that the normal transitions never produce.
    """
    # Inside a tag: between '<' and the matching '>'.
    TAG = 0
    # Outside any tag: characters here are collected as output.
    TEXT = 1
    # Fallback for a state the dispatcher does not recognise.
    UNKNOWN = 100
def handleTAG(character, acc):
    """
    Process one character while the parser is inside a tag.

    A '>' closes the tag, so the parser switches to State.TEXT; every other
    character keeps it in State.TAG. Tag contents are discarded, which is
    why the accumulator is handed back exactly as it was received.

    Returns a (next state, accumulator) tuple.
    """
    next_state = State.TEXT if character == '>' else State.TAG
    return next_state, acc
def handleTEXT(character, acc):
    """
    Handle the TEXT state.

    character -- the single input character being processed.
    acc -- the text accumulated so far.

    Returns a (next state, accumulator) tuple. A '<' opens a tag: the parser
    moves to State.TAG and a newline is appended so that text on either side
    of the tag ends up on separate lines. Any other character is appended to
    the accumulator and the parser stays in State.TEXT.
    """
    if character == '<':
        return State.TAG, acc + "\n"
    # Append the character that was passed in rather than relying on a
    # same-valued name from the enclosing module scope.
    return State.TEXT, acc + character
def handleUnknown(character, acc):
    """
    Process one character when the parser state is not recognised.

    This is a safety net that is not expected to run. It keeps the input by
    appending the character to the accumulator and reports State.UNKNOWN as
    the next state.

    Returns a (next state, accumulator) tuple.
    """
    extended = acc + character
    return State.UNKNOWN, extended
def proc(state, c, acc):
    """
    Dispatch one input character to the handler for the current state.

    state -- the parser state before this character.
    c -- the character to process.
    acc -- the text accumulated so far.

    States without a dedicated handler are routed to handleUnknown.
    Returns the (next state, accumulator) tuple produced by the handler.
    """
    dispatch = {
        State.TAG: handleTAG,
        State.TEXT: handleTEXT,
    }
    chosen = dispatch[state] if state in dispatch else handleUnknown
    return chosen(c, acc)
# Script body: strip the tags from the file named on the command line and
# print the remaining text, one non-blank line at a time.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("Usage: stripxml.py <xml-file>")

# Parsing starts in the TAG state, so nothing is collected until the first
# '>' has been seen.
currentState = State.TAG
acc = ""

# The context manager closes the file even if reading or decoding fails.
with open(sys.argv[1], 'r', encoding='utf-8') as file:
    xml = file.read()

# Feed the input through the state machine one character at a time.
for c in xml:
    currentState, acc = proc(currentState, c, acc)

# Trim surrounding whitespace from each collected line and drop the lines
# that end up empty.
lines = [s.strip() for s in acc.splitlines()]
output = [line for line in lines if line]
for s in output:
    print(s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment