Skip to content

Instantly share code, notes, and snippets.

@yoki
Last active June 17, 2022 17:44
Show Gist options
  • Save yoki/fbff44f79e7f93b8d9c8b0bc11fd3d75 to your computer and use it in GitHub Desktop.
Python XML
# xml document
# edit.py
# encode.py
# extract.py
# namespace.py
# search.py

# edit.py -- add and remove elements of an XML document in place.
# http://chimera.labs.oreilly.com/books/1230000000393/ch06.html#_solution_99
from xml.etree.ElementTree import parse, Element

doc = parse('pred.xml')
root = doc.getroot()

# Remove a few elements
root.remove(root.find('sri'))

# Insert a new element after <nm>...</nm>.
# NOTE: Element.getchildren() was deprecated in Python 3.2 and REMOVED in
# Python 3.9 -- list the element directly to get its children instead.
pos = list(root).index(root.find('nm'))  # e.g. 1

e = Element('spam')
e.text = 'This is a test'
root.insert(pos + 1, e)  # insert right after <nm>

# Write back to a file
doc.write('newpred.xml', xml_declaration=True)
# encode.py -- escape and unescape XML/HTML entities in text.
# http://chimera.labs.oreilly.com/books/1230000000393/ch02.html#_solution_37
import html

# Escape the special characters <, >, & (and quotes) as entities.
s = 'Elements are written as "<tag>text</tag>".'
escaped = html.escape(s)
# -> 'Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'

# Emit non-ASCII characters as numeric character references.
s2 = 'Spicy Jalapeño'
encoded = s2.encode('ascii', errors='xmlcharrefreplace')
# -> b'Spicy Jalape&#241;o'

# Turn entities back into characters.
# NOTE: HTMLParser.unescape() was deprecated in Python 3.4 and REMOVED in
# Python 3.9 -- html.unescape() is the supported replacement.
s3 = 'Spicy &quot;Jalape&#241;o&quot.'
unescaped = html.unescape(s3)
# -> 'Spicy "Jalapeño".'
# extract.py -- parse an RSS feed and pull out the fields of interest.
# http://chimera.labs.oreilly.com/books/1230000000393/ch06.html#_solution_96
from urllib.request import urlopen
from xml.etree.ElementTree import parse

# Download the RSS feed and parse it (network I/O)
u = urlopen('http://planet.python.org/rss20.xml')
doc = parse(u)

# Extract and output tags of interest.
# NOTE: the loop body must be indented under the for statement
# (the original snippet had it at column 0, a SyntaxError).
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')
    print(title)
    print(date)
    print(link)
    print()

# --------------- Extract tag, text, attributes
e = doc.find('channel/title')
# e     -> <Element 'title' at 0x10135b310>
# e.tag -> 'title'
# e.text -> 'Planet Python'
# e.get('some_attribute') -> None when the attribute is absent
# XPath queries with ElementTree.
# http://chimera.labs.oreilly.com/books/1230000000393/ch06.html#_solution_100
# ---------XPATH--------------------
# https://docs.python.org/3.5/library/xml.etree.elementtree.html#supported-xpath-syntax
import xml.etree.ElementTree as ET

# Sample data from the ElementTree docs -- the original snippet referenced
# an undefined name `countrydata`, which made it unrunnable (NameError).
countrydata = """<data>
  <country name="Liechtenstein">
    <year>2008</year>
    <neighbor name="Austria" direction="E"/>
    <neighbor name="Switzerland" direction="W"/>
  </country>
  <country name="Singapore">
    <year>2011</year>
    <neighbor name="Malaysia" direction="N"/>
  </country>
</data>"""

root = ET.fromstring(countrydata)

# Top-level elements
top = root.findall(".")

# All 'neighbor' grand-children of 'country' children of the top-level
# elements
neighbors = root.findall("./country/neighbor")

# Nodes with name='Singapore' that have a 'year' child
singapore = root.findall(".//year/..[@name='Singapore']")

# 'year' nodes that are children of nodes with name='Singapore'
years = root.findall(".//*[@name='Singapore']/year")

# All 'neighbor' nodes that are the second child of their parent
second_neighbors = root.findall(".//neighbor[2]")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment