Skip to content

Instantly share code, notes, and snippets.

@axfelix
Last active May 28, 2020 18:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save axfelix/fe5e1435243b607bbae1446971334744 to your computer and use it in GitHub Desktop.
Save axfelix/fe5e1435243b607bbae1446971334744 to your computer and use it in GitHub Desktop.
import sys
import io
import csv
import re
import lxml.etree as le
class RegexDict(dict):
    """A dict whose keys are treated as regular-expression patterns.

    Lookup by pattern is done with :meth:`get_matching`; everything else
    behaves exactly like a plain ``dict``.
    """

    def get_matching(self, event):
        """Return the values whose key, used as a pattern, matches *event*.

        Each key is passed to ``re.match`` as the pattern and *event* as the
        subject string, so a key matches when it matches at the START of
        *event* (``re.match`` anchors at the beginning only, not the end).
        Note the direction: the keys are the patterns, *event* is plain text;
        passing a pattern as *event* will not search the keys with it.

        Args:
            event: The string every key is tried against.

        Returns:
            A lazy generator over the values of all matching keys, in the
            dict's iteration order. It yields nothing when no key matches.
        """
        return (self[key] for key in self if re.match(key, event))
# Namespace-stripping stylesheet, taken from
# http://wiki.tei-c.org/index.php/Remove-Namespaces.xsl
#
# The document root, comments and processing instructions are copied as-is.
# Every element and every attribute is re-created under its local-name(),
# i.e. without any namespace, so later code can address nodes by bare tag
# name.
xslt = '''<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" indent="no"/>
<xsl:template match="/|comment()|processing-instruction()">
<xsl:copy>
<xsl:apply-templates/>
</xsl:copy>
</xsl:template>
<xsl:template match="*">
<xsl:element name="{local-name()}">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<xsl:template match="@*">
<xsl:attribute name="{local-name()}">
<xsl:value-of select="."/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>
'''
# Compile the stylesheet once; `transform` is a callable that takes a parsed
# document and returns the namespace-free result tree.
transform = le.XSLT(le.fromstring(xslt))
# Convert the XML file named by sys.argv[1] into a CSV file next to it, writing
# one CSV row per <FolioRecord> element.

# Highest numbered address column group reserved in the CSV header. A record
# with more <FolioAddress> elements than this makes csv.DictWriter raise
# ValueError, because its extra keys are not among the header columns.
_MAX_ADDRESSES = 99

# Columns filled from attributes, from ancestor elements, or from elements
# stored under a key other than their own tag name.
_FIXED_COLUMNS = [
    'ID',
    'JurisdictionCode',
    'JurisdictionDescription',
    'AssessmentAreaCode',
    'AssessmentAreaDescription',
    'SaleID',
    'SaleAction',
    'LegalDescriptionID',
    'FolioAddressID',
    'FolioAddressAction',
    'FolioAdd',
    'RegionalHospitalDistrictDescription',
    'RegionalDistrictCode',
    'SchoolDistrictCode',
    'RegionalDistrictDescription',
    'SchoolDistrictDescription',
    'RegionalHospitalDistrictCode',
]

# Base names of the per-address columns; each is emitted as
# "<name><n>A" for n in 1.._MAX_ADDRESSES.
_ADDRESS_FIELDS = [
    'PostalZip',
    'StreetNumber',
    'StreetType',
    'FolioAddressAction',
    'PrimaryFlag',
    'StreetName',
    'UnitNumber',
    'ProvinceState',
    'FolioAddress',
    'City',
    'FolioAddressID',
    'MapReferenceNumber',
    'StreetDirectionPrefix',
    'StreetDirectionSuffix',
]

# District elements repeat under differently named parents, so they are stored
# under "<parent tag>Code" / "<parent tag>Description" instead of their own tag.
_DISTRICT_KEY_SUFFIX = {
    'DistrictCode': 'Code',
    'DistrictDescription': 'Description',
}


def _csv_path_for(xml_path):
    """Return the output CSV path for *xml_path*.

    A trailing ``.xml`` extension (any letter case) is replaced by ``.csv``;
    any other name just gets ``.csv`` appended. The result therefore never
    equals *xml_path*, so the input file cannot be overwritten.
    """
    if xml_path.lower().endswith('.xml'):
        return xml_path[:-len('.xml')] + '.csv'
    return xml_path + '.csv'


def _build_columns(root):
    """Return the CSV header for the document rooted at *root*.

    The header is: every element tag that carries text (sorted, so the column
    order is the same on every run), then the fixed columns, then the numbered
    address columns. Names that occur more than once are kept only at their
    first position so the header has no duplicate columns.
    """
    discovered = sorted({
        node.tag
        for node in root.iter()
        # Comments and processing instructions have a non-string tag.
        if isinstance(node.tag, str) and node.text is not None
    })
    address_columns = []
    for slot in range(1, _MAX_ADDRESSES + 1):
        for name in _ADDRESS_FIELDS:
            address_columns.append(name + str(slot) + 'A')
    return list(dict.fromkeys(discovered + _FIXED_COLUMNS + address_columns))


def _grandparent(element):
    """Return the element two levels above *element*, or None if absent."""
    parent = element.getparent() if element is not None else None
    return parent.getparent() if parent is not None else None


def _copy_child_text(parent, tags, row):
    """Copy the text of *parent*'s direct children named in *tags* into *row*."""
    if parent is None:
        return
    for child in parent:
        # Compare the tag name; an element object never equals a string.
        if child.tag in tags:
            row[child.tag] = child.text


def _drop_duplicated_last_address(row, address_count):
    """Remove last-address keys that merely repeat an un-suffixed key.

    The children of every <FolioAddress> are stored twice: under
    "<tag><n>A" and, when they have text, under the plain "<tag>" (where a
    later address overwrites an earlier one). For the last address both
    copies usually hold the same value, so the suffixed copy is dropped.
    Keys with no identical un-suffixed twin are kept, so no data is lost.
    """
    if not address_count:
        return
    suffix = str(address_count) + 'A'
    for key in [k for k in row if k.endswith(suffix)]:
        base = key[:-len(suffix)]
        if base in row and row[base] == row[key]:
            del row[key]


def _folio_record_to_row(record):
    """Build the CSV row dict for one <FolioRecord> element.

    Raises:
        KeyError: if the record, a <Sale>, a <LegalDescription> or a
            <FolioAddress> lacks its ``ID`` attribute.
    """
    row = {'ID': record.attrib['ID']}

    # The record's grandparent is treated as its jurisdiction, and the
    # jurisdiction's grandparent as its assessment area.
    jurisdiction = _grandparent(record)
    _copy_child_text(
        jurisdiction, ('JurisdictionCode', 'JurisdictionDescription'), row)
    _copy_child_text(
        _grandparent(jurisdiction),
        ('AssessmentAreaCode', 'AssessmentAreaDescription'), row)

    address_count = 0
    for element in record.iter():
        tag = element.tag
        if not isinstance(tag, str):
            # Comment / processing instruction: nothing to export.
            continue

        if tag in _DISTRICT_KEY_SUFFIX:
            row[element.getparent().tag + _DISTRICT_KEY_SUFFIX[tag]] = element.text
            continue

        if tag == 'Sale':
            row['SaleID'] = element.attrib['ID']
            # 'Action' is optional.
            if 'Action' in element.attrib:
                row['SaleAction'] = element.attrib['Action']
        elif tag == 'LegalDescription':
            row['LegalDescriptionID'] = element.attrib['ID']
        elif tag == 'FolioAddress':
            # Addresses may repeat, so each one gets its own numbered columns.
            address_count += 1
            suffix = str(address_count) + 'A'
            row['FolioAddressID' + suffix] = element.attrib['ID']
            if 'Action' in element.attrib:
                row['FolioAddressAction' + suffix] = element.attrib['Action']
            for address_field in element.iter():
                if isinstance(address_field.tag, str):
                    row[address_field.tag + suffix] = address_field.text

        # Every element with non-empty text is also exported under its own tag.
        if element.text:
            row[tag] = element.text

    _drop_duplicated_last_address(row, address_count)
    return row


if len(sys.argv) < 2:
    sys.exit('usage: ' + sys.argv[0] + ' FILE.xml')

xml = le.parse(sys.argv[1])
# Strip namespaces so elements can be matched by bare tag name.
xml = transform(xml)
root = xml.getroot()
columns = _build_columns(root)

# newline='' is what the csv module requires for files it writes to.
with open(_csv_path_for(sys.argv[1]), 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()
    for record in root.xpath("//FolioRecord"):
        writer.writerow(_folio_record_to_row(record))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment