# axfelix/bca.py

## bca.py
import sys
import io
import csv
import re
import lxml.etree as le

class RegexDict(dict):
    """Dictionary whose keys are used as regular-expression patterns."""

    def get_matching(self, event):
        """Return a lazy iterator over the values whose key matches *event*.

        Every key is applied as the pattern in ``re.match(key, event)``, so a
        key counts as a hit when it matches at the start of ``event``.
        """
        return (
            value
            for pattern, value in self.items()
            if re.match(pattern, event)
        )

# Stylesheet that rebuilds every element and attribute under its local name,
# i.e. with the namespace removed; comments and processing instructions are
# copied through unchanged.
# Source: http://wiki.tei-c.org/index.php/Remove-Namespaces.xsl
xslt='''<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" indent="no"/>

<xsl:template match="/|comment()|processing-instruction()">
    <xsl:copy>
      <xsl:apply-templates/>
    </xsl:copy>
</xsl:template>

<xsl:template match="*">
    <xsl:element name="{local-name()}">
      <xsl:apply-templates select="@*|node()"/>
    </xsl:element>
</xsl:template>

<xsl:template match="@*">
    <xsl:attribute name="{local-name()}">
      <xsl:value-of select="."/>
    </xsl:attribute>
</xsl:template>
</xsl:stylesheet>
'''

# Compile the stylesheet once into a callable transform.
xslt_doc = le.fromstring(xslt)
transform = le.XSLT(xslt_doc)
# Parse the file named by the first command-line argument, run it through the
# transform, and keep the root of the result for the code below.
xml = le.parse(sys.argv[1])
xml = transform(xml)
root = xml.getroot()

# Build the CSV header (``columns``) in three parts:
#   1. every element tag in the document that carries text,
#   2. a fixed set of attribute-derived / district columns,
#   3. numbered address columns "<Field><n>A" for n = 1..99.
# Names are de-duplicated in first-seen order. A plain set() would give a
# different column order on every run, and a tag that is also in the fixed
# list (e.g. JurisdictionCode) would otherwise appear twice in the header.
_fixed_columns = [
    'ID', 'JurisdictionCode', 'JurisdictionDescription',
    'AssessmentAreaCode', 'AssessmentAreaDescription', 'SaleID',
    'SaleAction', 'LegalDescriptionID', 'FolioAddressID',
    'FolioAddressAction', 'FolioAdd', 'RegionalHospitalDistrictDescription',
    'RegionalDistrictCode', 'SchoolDistrictCode',
    'RegionalDistrictDescription', 'SchoolDistrictDescription',
    'RegionalHospitalDistrictCode',
]
_address_field_names = [
    'PostalZip', 'StreetNumber', 'StreetType', 'FolioAddressAction',
    'PrimaryFlag', 'StreetName', 'UnitNumber', 'ProvinceState',
    'FolioAddress', 'City', 'FolioAddressID', 'MapReferenceNumber',
    'StreetDirectionPrefix', 'StreetDirectionSuffix',
]

# Comments and processing instructions survive the namespace-stripping
# transform and have a non-string .tag, so only real element tags are kept.
_candidates = [
    x.tag for x in root.iter()
    if isinstance(x.tag, str) and x.text is not None
]
_candidates += _fixed_columns
for x in range(1, 100):
    _candidates += [name + str(x) + 'A' for name in _address_field_names]

columns = []
_seen = set()
for _name in _candidates:
    if _name not in _seen:
        _seen.add(_name)
        columns.append(_name)

# Write the CSV next to the input file: a trailing ".xml" is replaced by
# ".csv". The pattern is a raw string anchored at the end, and ".csv" is
# appended even when there is no ".xml" suffix, so the output path can never
# be the input path itself.
csv_path = re.sub(r'\.xml$', '', sys.argv[1]) + '.csv'
with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns)
    writer.writeheader()

    # One CSV row per FolioRecord element, wherever it sits in the document.
    folio_records = root.xpath("//FolioRecord")
    for record in folio_records:
        address_fields = 0  # count of FolioAddress elements seen in this record
        row = {}
        row['ID'] = record.attrib['ID']

        # The record's grandparent is treated as its jurisdiction. The check
        # is on child.tag: comparing the element itself with a string is
        # always False, which left these columns permanently empty.
        jurisdiction = record.getparent().getparent()
        for child in jurisdiction:
            if child.tag in ('JurisdictionCode', 'JurisdictionDescription'):
                row[child.tag] = child.text

        # Same pattern two more levels up for the assessment area.
        assessment_area = jurisdiction.getparent().getparent()
        for child in assessment_area:
            if child.tag in ('AssessmentAreaCode', 'AssessmentAreaDescription'):
                row[child.tag] = child.text

        for element in record.iter():
            # Comments and processing instructions have a non-string tag and
            # cannot become column names.
            if not isinstance(element.tag, str):
                continue

            if element.tag == 'Sale':
                row['SaleID'] = element.attrib['ID']
                # Action is optional; leave the column empty when absent.
                if 'Action' in element.attrib:
                    row['SaleAction'] = element.attrib['Action']

            elif element.tag == 'LegalDescription':
                row['LegalDescriptionID'] = element.attrib['ID']

            # Allow address_field tags to repeat, this is XML after all
            elif element.tag == 'FolioAddress':
                address_fields += 1
                suffix = str(address_fields) + 'A'
                row['FolioAddressID' + suffix] = element.attrib['ID']
                if 'Action' in element.attrib:
                    row['FolioAddressAction' + suffix] = element.attrib['Action']
                # iter() yields the FolioAddress element itself and then all
                # of its descendants.
                for address_field in element.iter():
                    if isinstance(address_field.tag, str):
                        row[address_field.tag + suffix] = address_field.text

            # These can repeat as well; the parent tag tells them apart.
            elif element.tag == 'DistrictCode':
                row[element.getparent().tag + 'Code'] = element.text
            elif element.tag == 'DistrictDescription':
                row[element.getparent().tag + 'Description'] = element.text

            # Every other element with non-empty text is stored under its own
            # tag; a later element with the same tag overwrites an earlier one.
            if element.text and element.tag not in ('DistrictCode', 'DistrictDescription'):
                row[element.tag] = element.text

        # Remove the last address_field tags as those are now duplicated
        # NOTE(review): this loop does not remove anything in practice.
        # RegexDict.get_matching() uses each *key* of the row as the regex and
        # matches it against the string passed in, so tag-like keys never
        # match the literal '.*<n>A' text; it also yields values rather than
        # keys. Left unchanged on purpose: making it work as the comment
        # describes would also drop FolioAddressID<n>A / FolioAddressAction<n>A,
        # which have no unnumbered copy in the row. Confirm the intended output
        # before changing it.
        rd = RegexDict(row)
        for o in rd.get_matching('.*' + str(address_fields) + 'A'):
            row.pop(o)

        writer.writerow(row)
	import sys
	import io
	import csv
	import re
	import lxml.etree as le

	class RegexDict(dict):
	    """Dictionary whose keys are used as regular-expression patterns."""

	    def get_matching(self, event):
	        """Return a lazy iterator over the values whose key matches *event*.

	        Every key is applied as the pattern in ``re.match(key, event)``, so
	        a key counts as a hit when it matches at the start of ``event``.
	        """
	        return (self[key] for key in self if re.match(key, event))

	# Stylesheet that rebuilds every element and attribute under its local name,
	# i.e. with the namespace removed; comments and processing instructions are
	# copied through unchanged.
	# Source: http://wiki.tei-c.org/index.php/Remove-Namespaces.xsl
	# The union operator in the match/select patterns must be a bare "|"; a
	# backslash in front of it becomes part of the pattern.
	xslt='''<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
	<xsl:output method="xml" indent="no"/>

	<xsl:template match="/|comment()|processing-instruction()">
	    <xsl:copy>
	      <xsl:apply-templates/>
	    </xsl:copy>
	</xsl:template>

	<xsl:template match="*">
	    <xsl:element name="{local-name()}">
	      <xsl:apply-templates select="@*|node()"/>
	    </xsl:element>
	</xsl:template>

	<xsl:template match="@*">
	    <xsl:attribute name="{local-name()}">
	      <xsl:value-of select="."/>
	    </xsl:attribute>
	</xsl:template>
	</xsl:stylesheet>
	'''

	# Compile the stylesheet once into a callable transform.
	xslt_doc = le.fromstring(xslt)
	transform = le.XSLT(xslt_doc)
	# Parse the file named by the first command-line argument, run it through
	# the transform, and keep the root of the result for the code below.
	xml = le.parse(sys.argv[1])
	xml = transform(xml)
	root = xml.getroot()

	# Build the CSV header (``columns``) in three parts:
	#   1. every element tag in the document that carries text,
	#   2. a fixed set of attribute-derived / district columns,
	#   3. numbered address columns "<Field><n>A" for n = 1..99.
	# Names are de-duplicated in first-seen order. A plain set() would give a
	# different column order on every run, and a tag that is also in the fixed
	# list (e.g. JurisdictionCode) would otherwise appear twice in the header.
	_fixed_columns = [
	    'ID', 'JurisdictionCode', 'JurisdictionDescription',
	    'AssessmentAreaCode', 'AssessmentAreaDescription', 'SaleID',
	    'SaleAction', 'LegalDescriptionID', 'FolioAddressID',
	    'FolioAddressAction', 'FolioAdd', 'RegionalHospitalDistrictDescription',
	    'RegionalDistrictCode', 'SchoolDistrictCode',
	    'RegionalDistrictDescription', 'SchoolDistrictDescription',
	    'RegionalHospitalDistrictCode',
	]
	_address_field_names = [
	    'PostalZip', 'StreetNumber', 'StreetType', 'FolioAddressAction',
	    'PrimaryFlag', 'StreetName', 'UnitNumber', 'ProvinceState',
	    'FolioAddress', 'City', 'FolioAddressID', 'MapReferenceNumber',
	    'StreetDirectionPrefix', 'StreetDirectionSuffix',
	]

	# Comments and processing instructions survive the namespace-stripping
	# transform and have a non-string .tag, so only real element tags are kept.
	_candidates = [
	    x.tag for x in root.iter()
	    if isinstance(x.tag, str) and x.text is not None
	]
	_candidates += _fixed_columns
	for x in range(1, 100):
	    _candidates += [name + str(x) + 'A' for name in _address_field_names]

	columns = []
	_seen = set()
	for _name in _candidates:
	    if _name not in _seen:
	        _seen.add(_name)
	        columns.append(_name)

	# Write the CSV next to the input file: a trailing ".xml" is replaced by
	# ".csv". The pattern is a raw string anchored at the end, and ".csv" is
	# appended even when there is no ".xml" suffix, so the output path can
	# never be the input path itself.
	csv_path = re.sub(r'\.xml$', '', sys.argv[1]) + '.csv'
	with open(csv_path, 'w', newline='') as csvfile:
	    writer = csv.DictWriter(csvfile, fieldnames=columns)
	    writer.writeheader()

	    # One CSV row per FolioRecord element, wherever it sits in the document.
	    folio_records = root.xpath("//FolioRecord")
	    for record in folio_records:
	        address_fields = 0  # count of FolioAddress elements seen in this record
	        row = {}
	        row['ID'] = record.attrib['ID']

	        # The record's grandparent is treated as its jurisdiction. The check
	        # is on child.tag: comparing the element itself with a string is
	        # always False, which left these columns permanently empty.
	        jurisdiction = record.getparent().getparent()
	        for child in jurisdiction:
	            if child.tag in ('JurisdictionCode', 'JurisdictionDescription'):
	                row[child.tag] = child.text

	        # Same pattern two more levels up for the assessment area.
	        assessment_area = jurisdiction.getparent().getparent()
	        for child in assessment_area:
	            if child.tag in ('AssessmentAreaCode', 'AssessmentAreaDescription'):
	                row[child.tag] = child.text

	        for element in record.iter():
	            # Comments and processing instructions have a non-string tag and
	            # cannot become column names.
	            if not isinstance(element.tag, str):
	                continue

	            if element.tag == 'Sale':
	                row['SaleID'] = element.attrib['ID']
	                # Action is optional; leave the column empty when absent.
	                if 'Action' in element.attrib:
	                    row['SaleAction'] = element.attrib['Action']

	            elif element.tag == 'LegalDescription':
	                row['LegalDescriptionID'] = element.attrib['ID']

	            # Allow address_field tags to repeat, this is XML after all
	            elif element.tag == 'FolioAddress':
	                address_fields += 1
	                suffix = str(address_fields) + 'A'
	                row['FolioAddressID' + suffix] = element.attrib['ID']
	                if 'Action' in element.attrib:
	                    row['FolioAddressAction' + suffix] = element.attrib['Action']
	                # iter() yields the FolioAddress element itself and then all
	                # of its descendants.
	                for address_field in element.iter():
	                    if isinstance(address_field.tag, str):
	                        row[address_field.tag + suffix] = address_field.text

	            # These can repeat as well; the parent tag tells them apart.
	            elif element.tag == 'DistrictCode':
	                row[element.getparent().tag + 'Code'] = element.text
	            elif element.tag == 'DistrictDescription':
	                row[element.getparent().tag + 'Description'] = element.text

	            # Every other element with non-empty text is stored under its own
	            # tag; a later element with the same tag overwrites an earlier one.
	            if element.text and element.tag not in ('DistrictCode', 'DistrictDescription'):
	                row[element.tag] = element.text

	        # Remove the last address_field tags as those are now duplicated
	        # NOTE(review): this loop does not remove anything in practice.
	        # RegexDict.get_matching() uses each *key* of the row as the regex
	        # and matches it against the string passed in, so tag-like keys never
	        # match the literal '.*<n>A' text; it also yields values rather than
	        # keys. Left unchanged on purpose: making it work as the comment
	        # describes would also drop FolioAddressID<n>A /
	        # FolioAddressAction<n>A, which have no unnumbered copy in the row.
	        # Confirm the intended output before changing it.
	        rd = RegexDict(row)
	        for o in rd.get_matching('.*' + str(address_fields) + 'A'):
	            row.pop(o)

	        writer.writerow(row)