3.4. Serialization XML, XSLT, XPath¶
3.4.1. xml
module from standard library¶
xml
module from standard library:
from xml.etree.ElementTree import parse

FILE = r'_temporary.xml'
# Expected content of FILE:
# <execute>
#     <command timeout="2">/bin/ls -la /etc/</command>
#     <command>/bin/ls -l /home/ /tmp/</command>
#     <command timeout="1">/bin/sleep 2</command>
#     <command timeout="2">/bin/echo 'hello'</command>
# </execute>

root = parse(FILE).getroot()

# './command' selects the <command> elements that are direct children of root
for command in root.findall('./command'):
    print(command.tag)      # element name
    print(command.text)     # text between the opening and closing tag
    print(command.attrib)   # XML attributes as a dict (empty when none)
    print()
# command
# /bin/ls -la /etc/
# {'timeout': '2'}
#
# command
# /bin/ls -l /home/ /tmp/
# {}
#
# command
# /bin/sleep 2
# {'timeout': '1'}
#
# command
# /bin/echo 'hello'
# {'timeout': '2'}
3.4.2. lxml
module¶
pip install lxml
3.4.3. Creating elements¶
Creating elements:
from lxml.etree import tostring, Element
root = Element("iris")
print(tostring(root))
# b'<iris/>'
Adding elements using list interface:
from lxml.etree import tostring, Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
print(tostring(root))
# b'<iris><setosa/><versicolor/><virginica/></iris>'
3.4.4. Length of a subtree¶
Length of a subtree:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
print(len(root))
# 3
3.4.5. Selecting subtree¶
Selecting subtree:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
selected = root[2]
print(selected.tag)
# virginica
Where is selected element:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
selected = root[1]
root.index(selected)
# 1
selected = root[2]
root.index(selected)
# 2
3.4.6. Element tree as a list¶
Elements are lists:
from lxml.etree import tostring, Element

root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))

# Converting an element to a list yields its direct children, in order
children = list(root)
print(children)
# [
# <Element setosa at 0x113cd4048>,
# <Element versicolor at 0x113cd4188>,
# <Element virginica at 0x113cd41c8>
# ]
Iterating over elements:
from lxml.etree import Element

root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))

# Iterating over an element yields its direct children, in document order
for child in root:
    print(child.tag)
# setosa
# versicolor
# virginica
Slicing elements:
from lxml.etree import Element
root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))
root.insert(0, Element("arctica"))
start = root[:1]
end = root[-1:]
print(start[0].tag) # arctica
print(end[0].tag) # virginica
3.4.7. Elements as a dict¶
Create element using dict
interface:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
print(tostring(tag))
# b'<iris kingdom="plantae"/>'
Get element attributes and values:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
print(tag.get("kingdom")) # plantae
print(tag.get("not-existing")) # None
Set element attributes and values:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
print(tag.get("kind"))
# flower
print(tostring(tag))
# b'<iris kingdom="plantae" kind="flower"/>'
Elements carry attributes as a dict:
from lxml.etree import Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
tag.keys()
# ['kingdom', 'kind']
tag.values()
# ['plantae', 'flower']
tag.items()
# [('kingdom', 'plantae'), ('kind', 'flower')]
Iterating over element attributes and values:
from lxml.etree import Element

tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")

# items() yields (attribute name, attribute value) pairs
for key, value in tag.items():
    print(f'{key} -> {value}')
# kingdom -> plantae
# kind -> flower
Accessing attributes through the dict-like `attrib` property:
from lxml.etree import Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
tag.attrib['kingdom']
# 'plantae'
tag.attrib['not-existing']
# Traceback (most recent call last):
# KeyError: 'not-existing'
tag.attrib['species'] = 'Setosa'
tag.attrib.get('species')
# 'Setosa'
tag.attrib
# {'kingdom': 'plantae', 'kind': 'flower', 'species': 'Setosa'}
tag.attrib.items()
# [('kingdom', 'plantae'), ('kind', 'flower'), ('species', 'Setosa')]
3.4.8. Elements contain text¶
from lxml.etree import tostring, Element
tag = Element("iris")
tag.text = "Setosa"
tag.text
# 'Setosa'
tostring(tag)
# b'<iris>Setosa</iris>'
3.4.9. Tree iteration¶
from lxml.etree import tostring, Element, SubElement

root = Element("iris")
SubElement(root, "species").text = "Setosa"
SubElement(root, "species").text = "Virginica"
SubElement(root, "flower").text = "Versicolor"

# tostring() returns bytes, so print() shows the newlines escaped
print(tostring(root, pretty_print=True))
# b'<iris>\n  <species>Setosa</species>\n  <species>Virginica</species>\n  <flower>Versicolor</flower>\n</iris>\n'

# Without arguments iter() walks the whole tree, starting with root itself
for element in root.iter():
    print(f'{element.tag} -> {element.text}')
# iris -> None
# species -> Setosa
# species -> Virginica
# flower -> Versicolor

# Passing a tag name restricts the iteration to elements with that tag
for element in root.iter("species"):
    print(f'{element.tag} -> {element.text}')
# species -> Setosa
# species -> Virginica

# Several tag names can be passed at once
for element in root.iter("species", "flower"):
    print(f'{element.tag} -> {element.text}')
# species -> Setosa
# species -> Virginica
# flower -> Versicolor
3.4.10. Entities¶
from lxml.etree import tostring, Element, SubElement, Entity
root = Element("iris")
print(tostring(root))
# b'<iris/>'
root.append(Entity("#234"))
print(tostring(root))
# b'<iris>&#234;</iris>'
3.4.11. Comments¶
from lxml.etree import tostring, Element, SubElement, Comment
root = Element("iris")
print(tostring(root))
# b'<iris/>'
root.append(Comment("Hello World"))
print(tostring(root))
# b'<iris><!--Hello World--></iris>'
from lxml.etree import tostring, Element, SubElement
root = Element('iris')
SubElement(root, 'species').text = 'setosa'
SubElement(root, 'species').text = 'virginica'
SubElement(root, 'flower').text = 'versicolor'
print(tostring(root))
# b'<iris><species>setosa</species><species>virginica</species><flower>versicolor</flower></iris>'
from lxml.etree import tostring, Element, Entity
root = Element('iris')
root.append(Entity('#234'))
print(tostring(root))
# b'<iris>&#234;</iris>'
from lxml.etree import tostring, Element, Comment
root = Element('iris')
root.append(Comment('Hello World'))
print(tostring(root))
# b'<iris><!--Hello World--></iris>'
from lxml.etree import tostring, Element, Entity, Comment

root = Element('iris')
root.append(Element('species'))
root.append(Element('species'))
root.append(Element('flower'))
root.append(Entity('#234'))
root.append(Comment('Hello World'))

print(tostring(root))
# b'<iris><species/><species/><flower/>&#234;<!--Hello World--></iris>'

# iter() also yields entities and comments; only real elements have a
# string tag, so the isinstance check tells the two kinds apart
for element in root.iter():
    if isinstance(element.tag, str):
        print(f'Tag: {element.tag} -> {element.text}')
    else:
        print(f'Special: {element} -> {element.text}')
# Tag: iris -> None
# Tag: species -> None
# Tag: species -> None
# Tag: flower -> None
# Special: &#234; -> &#234;
# Special: <!--Hello World--> -> Hello World

# Passing the Element factory as tag keeps only real elements
for element in root.iter(tag=Element):
    print(f'{element.tag} -> {element.text}')
# iris -> None
# species -> None
# species -> None
# flower -> None

# Passing the Entity factory keeps only entity references
for element in root.iter(tag=Entity):
    print(element.text)
# &#234;

# Passing the Comment factory keeps only comments
for element in root.iter(tag=Comment):
    print(element.text)
# Hello World
3.4.12. Serialization¶
from lxml.etree import tostring, XML
root = XML('<root><a><b/></a></root>')
tostring(root)
# b'<root><a><b/></a></root>'
print(tostring(root, xml_declaration=True))
# b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"
print(tostring(root, encoding='utf-8'))
# b'<root><a><b/></a></root>'
print(tostring(root, encoding='iso-8859-2'))
# b"<?xml version='1.0' encoding='iso-8859-2'?>\n<root><a><b/></a></root>"
print(tostring(root, pretty_print=True))
# b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'
print(tostring(root, pretty_print=True).decode())
# <root>
#   <a>
#     <b/>
#   </a>
# </root>
from lxml.etree import tostring, XML
root = XML('<html><head/><body><p>Hello<br/>World</p></body></html>')
# default: method = 'xml'
tostring(root)
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'
tostring(root, method='xml')
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'
tostring(root, method='html')
# b'<html><head></head><body><p>Hello<br>World</p></body></html>'
print(tostring(root, method='html', pretty_print=True))
# b'<html>\n<head></head>\n<body><p>Hello<br>World</p></body>\n</html>\n'
print(tostring(root, method='html', pretty_print=True).decode())
# <html>
# <head></head>
# <body><p>Hello<br>World</p></body>
# </html>
tostring(root, method='text')
# b'HelloWorld'
3.4.13. Working with HTML¶
Using
lxml
module
<html><body>Iris<br/>Setosa</body></html>
from lxml.etree import tostring, Element, SubElement
html = Element("html")
body = SubElement(html, "body")
body.text = "Iris"
tostring(html)
# b'<html><body>Iris</body></html>'
br = SubElement(body, "br")
tostring(html)
# b'<html><body>Iris<br/></body></html>'
br.tail = 'Setosa'
tostring(html)
# b'<html><body>Iris<br/>Setosa</body></html>'
3.4.14. XPATH¶
Using
lxml
module
print(html.xpath("string()")) # lxml.etree only!
# IrisSetosa
print(html.xpath("//text()")) # lxml.etree only!
# ['Iris', 'Setosa']
3.4.15. XSLT¶
Using
lxml
module
3.4.16. Example 1¶
from io import StringIO
from lxml.etree import XML, XSLT, parse
TEMPLATE = """
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
<my_tag>
<xsl:value-of select="/outer/inner/text()" />
</my_tag>
</xsl:template>
</xsl:stylesheet>
"""
DATA = """
<outer>
<inner>Hello World</inner>
</outer>
"""
transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)
print(result)
# <?xml version="1.0"?>
# <my_tag>Hello World</my_tag>
3.4.17. Example 2¶
from io import StringIO
from lxml.etree import XML, XSLT, parse
DATA = """
<astronauts>
<astro>
<firstname>Jan</firstname>
<lastname>Twardowski</lastname>
</astro>
<astro>
<firstname>Mark</firstname>
<lastname>Watney</lastname>
</astro>
</astronauts>
"""
TEMPLATE = """
<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<table>
<thead>
<tr>
<th>First Name</th>
<th>Last Name</th>
</tr>
</thead>
<tbody>
<xsl:for-each select="astronauts/astro">
<tr>
<td><xsl:value-of select="firstname"/></td>
<td><xsl:value-of select="lastname"/></td>
</tr>
</xsl:for-each>
</tbody>
</table>
</html>
"""
transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)
print(result)
# <html><table>
# <thead><tr>
# <th>First Name</th>
# <th>Last Name</th>
# </tr></thead>
# <tbody>
# <tr>
# <td>Jan</td>
# <td>Twardowski</td>
# </tr>
# <tr>
# <td>Mark</td>
# <td>Watney</td>
# </tr>
# </tbody>
# </table></html>
3.4.18. Example 3¶
from io import StringIO
from lxml.etree import XML, XSLT, parse
DATA = """
<CATALOG>
<PLANT>
<COMMON>Bloodroot</COMMON>
<BOTANICAL>Sanguinaria canadensis</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$2.44</PRICE>
<AVAILABILITY>031599</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Columbine</COMMON>
<BOTANICAL>Aquilegia canadensis</BOTANICAL>
<ZONE>3</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.37</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
</CATALOG>
"""
TEMPLATE = """
<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<style>
body {font-family: Arial; font-size: 1em; background-color: #EEEEEE}
div.title {background-color: teal; color: white; padding: 4px}
div.description {margin-left:20px;margin-bottom:1em;font-size:10pt}
span {font-weight: bold}
</style>
<body>
<xsl:for-each select="CATALOG/PLANT">
<div class="title">
<span><xsl:value-of select="BOTANICAL"/></span>
<xsl:value-of select="PRICE"/>
</div>
<div class="description">
<xsl:value-of select="description"/>
<span> (<xsl:value-of select="AVAILABILITY"/> will be available)</span>
</div>
</xsl:for-each>
</body>
</html>
"""
transform = XSLT(XML(TEMPLATE))
data = parse(StringIO(DATA))
result = transform(data)
print(result)
# <html>
# <style>
# body {font-family: Arial; font-size: 1em; background-color: #EEEEEE}
# div.title {background-color: teal; color: white; padding: 4px}
# div.description {margin-left:20px;margin-bottom:1em;font-size:10pt}
# span {font-weight: bold}
# </style>
# <body>
# <div class="title">
# <span>Sanguinaria canadensis</span>$2.44</div>
# <div class="description"><span> (031599 will be available)</span></div>
# <div class="title">
# <span>Aquilegia canadensis</span>$9.37</div>
# <div class="description"><span> (030699 will be available)</span></div>
# </body>
# </html>
3.4.19. Assignments¶
"""
* Assignment: Serialization XML Parsing
* Complexity: easy
* Lines of code: 20 lines
* Time: 13 min
English:
1. Use data from "Given" section (see below)
2. Convert input data to `list[dict]`
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Przekonwertuj dane wejściowe do `list[dict]`
"""
# Given
# Plant catalog used as the assignment input. The root element must be
# closed with </CATALOG>, otherwise XML parsers reject the document.
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<CATALOG>
<PLANT>
<COMMON>Bloodroot</COMMON>
<BOTANICAL>Sanguinaria canadensis</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$2.44</PRICE>
<AVAILABILITY>031599</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Columbine</COMMON>
<BOTANICAL>Aquilegia canadensis</BOTANICAL>
<ZONE>3</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.37</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Marsh Marigold</COMMON>
<BOTANICAL>Caltha palustris</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Sunny</LIGHT>
<PRICE>$6.81</PRICE>
<AVAILABILITY>051799</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Cowslip</COMMON>
<BOTANICAL>Caltha palustris</BOTANICAL>
<ZONE>4</ZONE>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.90</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
</CATALOG>"""
"""
* Assignment: Serialization XSLT Transformation
* Complexity: medium
* Lines of code: 5 lines
* Time: 13 min
English:
1. Use data from "Given" section (see below)
2. Convert input data to `list[dict]`
Polish:
1. Użyj danych z sekcji "Given" (patrz poniżej)
2. Przekonwertuj dane wejściowe do `list[dict]`
"""
# Given
DATA = """<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
<food>
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
<calories>650</calories>
</food>
<food>
<name>Strawberry Belgian Waffles</name>
<price>$7.95</price>
<description>Light Belgian waffles covered with strawberries and whipped cream</description>
<calories>900</calories>
</food>
<food>
<name>Berry-Berry Belgian Waffles</name>
<price>$8.95</price>
<description>Light Belgian waffles covered with an assortment of fresh berries and whipped cream</description>
<calories>900</calories>
</food>
<food>
<name>French Toast</name>
<price>$4.50</price>
<description>Thick slices made from our homemade sourdough bread</description>
<calories>600</calories>
</food>
<food>
<name>Homestyle Breakfast</name>
<price>$6.95</price>
<description>Two eggs, bacon or sausage, toast, and our ever-popular hash browns</description>
<calories>950</calories>
</food>
</breakfast_menu>
"""