Skip to content

Commit ceeca03

Browse files
committed
Support for unions
1 parent 46000d0 commit ceeca03

File tree

12 files changed

+712
-181
lines changed

12 files changed

+712
-181
lines changed

README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,19 @@ if __name__ == "__main__":
6565
* Lists of child elements are supported
6666
* Inheritance does work, but has the same limitations as dataclasses. Inheriting from base classes with required fields and declaring optional fields doesn't work due to field order. This isn't recommended
6767
* Namespace support is decent as long as correctly declared. I've tried on several real-world examples, although they were known to be valid. `lxml` does a great job at expanding namespace information when loading and simplifying it when saving
68+
* Union child types are supported. When loading XML, each type in the union is tried in order until one parses successfully
69+
70+
## Gotchas
71+
72+
### Whitespace
73+
74+
If you are able to, it is strongly recommended you strip whitespace from the input via `lxml`:
75+
76+
```python
77+
parser = etree.XMLParser(remove_blank_text=True)
78+
```
79+
80+
By default, `lxml` preserves whitespace. This can cause a problem when checking if elements have no text. The library does attempt to strip such whitespace (literally via Python's `strip()`), but letting `lxml` do it is likely faster and more robust.
6881

6982
## Limitations and Assumptions
7083

@@ -77,8 +90,8 @@ Most of these limitations/assumptions are enforced. They may make this project u
7790
* It isn't possible to pass any parameters to the wrapped `@dataclass` decorator
7891
* Some properties of dataclass `field`s are not exposed: `default_factory`, `repr`, `hash`, `init`, `compare`. For most, it is because I don't understand the implications fully or how that would be useful for XML. `default_factory` is hard only because of [the overloaded type signatures](https://github.com/python/typeshed/blob/master/stdlib/3.7/dataclasses.pyi), and getting that to work with `mypy`
7992
* Deserialisation is strict; missing required attributes and child elements will cause an error. I want this to be the default behaviour, but it should be straightforward to add a parameter to `load` for lenient operation
80-
* Unions of types aren't yet supported
8193
* Dataclasses must be written by hand; no tools are provided to generate these from DTDs, XML schema definitions, or RELAX NG schemas
94+
* All types in a union must share the same element/tag name and namespace. If they differ, use two separate dataclass attributes (XML child fields) instead
8295

8396
## Development
8497

functional/__init__.py

Whitespace-only changes.

functional/container_test.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from pathlib import Path
22
from typing import List
3-
3+
import pytest
44
from lxml import etree
55

66
from xml_dataclasses import attr, child, dump, load, xml_dataclass
77

8+
from .utils import lmxl_dump
9+
810
BASE = Path(__file__).resolve(strict=True).parent
911

1012
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
@@ -32,15 +34,10 @@ class Container:
3234
# (it's missing links)
3335

3436

35-
def lmxl_dump(el):
36-
encoded = etree.tostring(
37-
el, encoding="utf-8", pretty_print=True, xml_declaration=True
38-
)
39-
return encoded.decode("utf-8")
40-
41-
42-
def test_functional_container():
43-
el = etree.parse(str(BASE / "container.xml")).getroot()
37+
@pytest.mark.parametrize("remove_blank_text", [True, False])
38+
def test_functional_container_no_whitespace(remove_blank_text):
39+
parser = etree.XMLParser(remove_blank_text=remove_blank_text)
40+
el = etree.parse(str(BASE / "container.xml"), parser).getroot()
4441
original = lmxl_dump(el)
4542
container = load(Container, el, "container")
4643
assert container == Container(

functional/package.xml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version='1.0' encoding='utf-8'?>
2+
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="13b2938e-05ad-4096-abde-0f377b895d61" xml:lang="en">
3+
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
<dc:identifier id="13b2938e-05ad-4096-abde-0f377b895d61">13b2938e-05ad-4096-abde-0f377b895d61</dc:identifier>
5+
<dc:title>XML dataclass test</dc:title>
6+
<dc:language>en</dc:language>
7+
<meta content="cover-image" name="cover"/>
8+
<meta property="dcterms:modified">2020-02-02T12:00:00Z</meta>
9+
</metadata>
10+
<manifest>
11+
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
12+
<item id="cover" href="xhtml/cover.html" media-type="application/xhtml+xml"/>
13+
<item id="title" href="xhtml/title.html" media-type="application/xhtml+xml"/>
14+
<item id="dedication" href="xhtml/dedication.html" media-type="application/xhtml+xml"/>
15+
<item id="contents" href="xhtml/contents.html" media-type="application/xhtml+xml"/>
16+
<item id="intro" href="xhtml/intro.html" media-type="application/xhtml+xml"/>
17+
<item id="ch01" href="xhtml/ch01.html" media-type="application/xhtml+xml"/>
18+
<item id="ch02" href="xhtml/ch02.html" media-type="application/xhtml+xml"/>
19+
<item id="ch03" href="xhtml/ch03.html" media-type="application/xhtml+xml"/>
20+
<item id="ch04" href="xhtml/ch04.html" media-type="application/xhtml+xml"/>
21+
<item id="ch05" href="xhtml/ch05.html" media-type="application/xhtml+xml"/>
22+
<item id="ch06" href="xhtml/ch06.html" media-type="application/xhtml+xml"/>
23+
<item id="ch07" href="xhtml/ch07.html" media-type="application/xhtml+xml"/>
24+
<item id="ch08" href="xhtml/ch08.html" media-type="application/xhtml+xml"/>
25+
<item id="ch09" href="xhtml/ch09.html" media-type="application/xhtml+xml"/>
26+
<item id="ch10" href="xhtml/ch10.html" media-type="application/xhtml+xml"/>
27+
<item id="ch11" href="xhtml/ch11.html" media-type="application/xhtml+xml"/>
28+
<item id="ch12" href="xhtml/ch12.html" media-type="application/xhtml+xml"/>
29+
<item id="ch13" href="xhtml/ch13.html" media-type="application/xhtml+xml"/>
30+
<item id="ch14" href="xhtml/ch14.html" media-type="application/xhtml+xml"/>
31+
<item id="ch15" href="xhtml/ch15.html" media-type="application/xhtml+xml"/>
32+
<item id="ch16" href="xhtml/ch16.html" media-type="application/xhtml+xml"/>
33+
<item id="ch17" href="xhtml/ch17.html" media-type="application/xhtml+xml"/>
34+
<item id="ch18" href="xhtml/ch18.html" media-type="application/xhtml+xml"/>
35+
<item id="ch19" href="xhtml/ch19.html" media-type="application/xhtml+xml"/>
36+
<item id="ch20" href="xhtml/ch20.html" media-type="application/xhtml+xml"/>
37+
</manifest>
38+
<spine toc="ncx">
39+
<itemref idref="cover"/>
40+
<itemref idref="title"/>
41+
<itemref idref="dedication"/>
42+
<itemref idref="contents"/>
43+
<itemref idref="intro"/>
44+
<itemref idref="ch01"/>
45+
<itemref idref="ch02"/>
46+
<itemref idref="ch03"/>
47+
<itemref idref="ch04"/>
48+
<itemref idref="ch05"/>
49+
<itemref idref="ch06"/>
50+
<itemref idref="ch07"/>
51+
<itemref idref="ch08"/>
52+
<itemref idref="ch09"/>
53+
<itemref idref="ch10"/>
54+
<itemref idref="ch11"/>
55+
<itemref idref="ch12"/>
56+
<itemref idref="ch13"/>
57+
<itemref idref="ch14"/>
58+
<itemref idref="ch15"/>
59+
<itemref idref="ch16"/>
60+
<itemref idref="ch17"/>
61+
<itemref idref="ch18"/>
62+
<itemref idref="ch19"/>
63+
<itemref idref="ch20"/>
64+
</spine>
65+
</package>

functional/package_test.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from enum import Enum
2+
from pathlib import Path
3+
from typing import List, Mapping, Optional, Union
4+
5+
from lxml import etree
6+
7+
from xml_dataclasses import attr, child, dump, load, text, xml_dataclass
8+
9+
from .utils import lmxl_dump
10+
11+
BASE = Path(__file__).resolve(strict=True).parent
12+
13+
14+
class NsMap(Enum):
    """XML namespaces used by the OPF package document, keyed by prefix.

    Each member's name is the namespace prefix and its value is the
    namespace URI.
    """

    xml = "http://www.w3.org/XML/1998/namespace"
    opf = "http://www.idpf.org/2007/opf"
    dc = "http://purl.org/dc/elements/1.1/"
    dcterms = "http://purl.org/dc/terms/"

    @classmethod
    def to_dict(cls, default_ns: Optional[str] = None) -> Mapping[Optional[str], str]:
        """Return a prefix -> namespace URI mapping for all members.

        Args:
            default_ns: Optional URI to register as the default namespace
                (stored under the ``None`` key). Omitted when falsy.

        Returns:
            A new dict mapping each prefix name to its URI string, plus a
            ``None`` entry when ``default_ns`` is given.
        """
        # ``__members__`` maps names to Enum *members*; unwrap ``.value`` so
        # the mapping holds plain URI strings, as the return type promises.
        nsmap = {name: member.value for name, member in cls.__members__.items()}
        if default_ns:
            nsmap[None] = default_ns
        return nsmap
26+
27+
28+
@xml_dataclass
29+
class DublinCoreMd:
30+
__ns__ = NsMap.dc.value
31+
value: str = text()
32+
id: Optional[str] = attr(default=None)
33+
34+
35+
@xml_dataclass
36+
class MdMeta3:
37+
__ns__ = NsMap.opf.value
38+
39+
property: str = attr()
40+
value: str = text()
41+
42+
43+
@xml_dataclass
44+
class MdMeta2:
45+
__ns__ = NsMap.opf.value
46+
content: str = attr()
47+
name: str = attr()
48+
49+
50+
@xml_dataclass
51+
class Metadata3:
52+
__ns__ = NsMap.opf.value
53+
54+
identifier: List[DublinCoreMd] = child()
55+
title: List[DublinCoreMd] = child()
56+
language: List[DublinCoreMd] = child()
57+
meta: List[Union[MdMeta3, MdMeta2]] = child()
58+
59+
60+
@xml_dataclass
61+
class Item3:
62+
__ns__ = NsMap.opf.value
63+
id: str = attr()
64+
href: str = attr()
65+
media_type: str = attr(rename="media-type")
66+
67+
68+
@xml_dataclass
69+
class Manifest3:
70+
__ns__ = NsMap.opf.value
71+
item: List[Item3] = child()
72+
73+
74+
@xml_dataclass
75+
class ItemRef3:
76+
__ns__ = NsMap.opf.value
77+
idref: str = attr()
78+
properties: Optional[str] = attr(default=None)
79+
80+
81+
@xml_dataclass
82+
class Spine3:
83+
__ns__ = NsMap.opf.value
84+
itemref: List[ItemRef3] = child()
85+
toc: Optional[str] = attr(default=None)
86+
87+
88+
@xml_dataclass
89+
class Package3:
90+
__ns__ = NsMap.opf.value
91+
version: str = attr()
92+
unique_identifier: str = attr(rename="unique-identifier")
93+
metadata: Metadata3 = child()
94+
manifest: Manifest3 = child()
95+
spine: Spine3 = child()
96+
id: Optional[str] = attr(default=None)
97+
lang: Optional[str] = attr(default=None, namespace=NsMap.xml.value)
98+
dir: Optional[str] = attr(default=None)
99+
100+
101+
def test_functional_package():
102+
parser = etree.XMLParser(remove_blank_text=True)
103+
el = etree.parse(str(BASE / "package.xml"), parser).getroot()
104+
original = lmxl_dump(el)
105+
package = load(Package3, el, "package")
106+
el = dump(package, "package", NsMap.to_dict(NsMap.opf.value))
107+
roundtrip = lmxl_dump(el)
108+
assert original == roundtrip

functional/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from lxml import etree
2+
3+
4+
def lmxl_dump(el):
    """Serialise ``el`` with ``etree.tostring`` and return it as a ``str``.

    The element is rendered as pretty-printed UTF-8 with an XML declaration,
    then decoded from bytes back to text.
    """
    raw = etree.tostring(
        el,
        xml_declaration=True,
        pretty_print=True,
        encoding="utf-8",
    )
    return raw.decode("utf-8")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "xml_dataclasses"
3-
version = "0.0.2"
3+
version = "0.0.3"
44
description = "(De)serialize XML documents into specially-annotated dataclasses"
55
authors = ["Toby Fleming <tobywf@users.noreply.github.com>"]
66
license = "MPL-2.0"

src/xml_dataclasses/serde.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from collections import defaultdict
4-
from typing import Any, Dict, List, Mapping, Optional, Type, TypeVar
4+
from typing import Any, Dict, List, Mapping, Optional, Type, TypeVar, Union
55

66
from lxml.builder import ElementMaker # type: ignore
77

@@ -11,16 +11,6 @@
1111
_T = TypeVar("_T")
1212

1313

14-
def _unpack_child(info: ChildInfo[_T], value: Any, el_tag: str) -> Any:
15-
if info.is_list:
16-
return [load(info.base_type, v) for v in value]
17-
18-
if len(value) != 1:
19-
raise ValueError(f"Multiple child elements '{info.xml_name}' in '{el_tag}'")
20-
21-
return load(info.base_type, value[0])
22-
23-
2414
def _load_attributes(cls: Type[XmlDataclass], el: Any) -> Mapping[str, str]:
2515
values = {}
2616
processed = set()
@@ -62,6 +52,9 @@ def _load_text(info: TextInfo[Any], el: Any) -> Mapping[str, str]:
6252

6353

6454
def _load_children(cls: Type[XmlDataclass], el: Any) -> Mapping[str, XmlDataclass]:
55+
if el.text and el.text.strip():
56+
raise ValueError(f"Element '{el.tag}' has text (expected child elements only)")
57+
6558
# child elements can be duplicated
6659
el_children: Dict[str, List[Any]] = defaultdict(list)
6760
for e in el.iterchildren():
@@ -70,16 +63,54 @@ def _load_children(cls: Type[XmlDataclass], el: Any) -> Mapping[str, XmlDataclas
7063
values = {}
7164
processed = set()
7265

73-
for child in cls.__children__:
66+
def _unpack_union_child(
67+
child: ChildInfo[Any], value: Any
68+
) -> Union[XmlDataclass, List[XmlDataclass]]:
69+
exceptions = []
70+
# try to find one matching type
71+
for base_type in child.base_types:
72+
try:
73+
return load(base_type, value)
74+
except ValueError as e:
75+
exceptions.append(e)
76+
77+
raise ValueError(
78+
f"Invalid child elements found for '{child.dt_name}' in '{el.tag}':\n"
79+
+ "\n".join(str(e) for e in exceptions)
80+
)
81+
82+
def _get_one_child_value(child: ChildInfo[Any]) -> Any:
7483
# defaultdict can't raise KeyError
7584
if child.xml_name in el_children:
76-
child_value = _unpack_child(child, el_children[child.xml_name], el.tag)
85+
value = el_children[child.xml_name]
7786
else:
78-
if child.is_required:
87+
if not child.is_required:
88+
return child.field.default
89+
90+
raise ValueError(
91+
f"Required child element '{child.xml_name}' not found in '{el.tag}'"
92+
)
93+
94+
if not child.is_list:
95+
if len(value) != 1:
7996
raise ValueError(
80-
f"Required child element '{child.xml_name}' not found in '{el.tag}'"
97+
f"Multiple child elements '{child.xml_name}' in '{el.tag}'"
8198
)
82-
child_value = child.field.default
99+
value = value[0]
100+
101+
if len(child.base_types) == 1:
102+
# nice path for default use-case
103+
base_type = child.base_types[0]
104+
if child.is_list:
105+
return [load(base_type, v) for v in value]
106+
return load(base_type, value)
107+
108+
if child.is_list:
109+
return [_unpack_union_child(child, v) for v in value]
110+
return _unpack_union_child(child, value)
111+
112+
for child in cls.__children__:
113+
child_value = _get_one_child_value(child)
83114
processed.add(child.xml_name)
84115
values[child.dt_name] = child_value
85116

0 commit comments

Comments
 (0)