| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
'''
XML file parser.
Returns the data as a flat dictionary of the form
{'child1.child2.node': 'value'}
(keys are dotted tag paths relative to the root element).
@author: olivier.massot, Sept. 2017
'''
- import re
- from lxml import etree # @UnresolvedImport
- from path import Path
# Parsed trees, keyed by the file path they were loaded from.
CACHE = dict()

# Matches a "{http://...}" or "{https://...}" prefix and captures whatever
# follows it.
_regex = re.compile(
    r"(?:\{https?:\/\/.*\})(.*)"
)
class XmlError(Exception):
    """Error raised by this module when an XML file cannot be parsed."""
def _clean_tag(tag):
    """Remove a leading ``{http://...}`` / ``{https://...}`` schema from a tag.

    :param tag: tag value to clean; usually a string.
    :return: the text following the schema prefix when ``_regex`` matches,
        otherwise ``tag`` unchanged. Values the regex cannot search
        (anything that is not a string) are also returned unchanged.
    """
    try:
        match = _regex.search(tag)
    except TypeError:
        # Not a string: nothing to strip, hand the value back as-is.
        return tag
    if match is None:
        return tag
    return match.group(1)
def _parse(filepath):
    """Return the parsed tree for ``filepath``, parsing each file only once.

    Parsed trees are stored in the module-level ``CACHE`` dictionary, keyed
    by ``filepath``; later calls with the same key return the cached tree.

    :param filepath: path of the XML file, passed to ``etree.parse``.
    :return: the tree object returned by ``etree.parse``.
    :raises XmlError: if lxml reports an ``XMLSyntaxError`` for the file.
    """
    if filepath not in CACHE:
        try:
            CACHE[filepath] = etree.parse(filepath)
        except etree.XMLSyntaxError:
            # A plain string path has no ``name`` attribute; fall back to the
            # path itself so the handler still raises XmlError rather than
            # an AttributeError.
            name = getattr(filepath, "name", filepath)
            raise XmlError("Le fichier XML contient des erreurs! ('{}')".format(name))
    return CACHE[filepath]
def getroottag(filepath):
    """Return the cleaned tag of the root element of the XML file."""
    root = _parse(filepath).getroot()
    return _clean_tag(root.tag)
def first_children_tags(filepath):
    """Return the cleaned tags of the root's direct children, in iteration order."""
    tags = []
    for child in _parse(filepath).getroot():
        tags.append(_clean_tag(child.tag))
    return tags
def parse(filepath, start=""):
    """Parse an XML file into a flat ``{"node1.node2.tag": text}`` dictionary.

    Only leaf elements (elements without children) produce an entry. Keys are
    the cleaned tag names joined with dots, relative to the root element when
    ``start`` is empty, or relative to each element whose cleaned tag equals
    ``start`` otherwise (the root / start tag itself is not part of the key).
    When the same key occurs several times, the last value wins.

    :param filepath: path of the XML file to read.
    :param start: optional tag name; when given, only the content of
        elements with that tag is collected.
    :return: dictionary mapping dotted tag paths to the element text
        (``None`` for elements without text).
    """
    root = _parse(filepath).getroot()
    data = {}

    def _walk(elt, in_start_node=False, breadcrumb=()):
        tag = _clean_tag(elt.tag)
        # The start node itself is not part of the path: the breadcrumb only
        # grows for elements visited once we are already inside it.
        if in_start_node:
            breadcrumb += (tag,)
        if not start or tag == start:
            in_start_node = True

        for child in elt:
            # Comments, processing instructions and entities expose a factory
            # function as their ``tag`` instead of a string; they carry no
            # data and would break the "."-join below, so skip them.
            if callable(child.tag):
                continue
            if in_start_node and len(child) == 0:
                key = ".".join(breadcrumb + (_clean_tag(child.tag),))
                data[key] = child.text
            else:
                _walk(child, in_start_node, breadcrumb)

    _walk(root)
    return data
if __name__ == "__main__":
    # Command-line entry point: print every key found in the XML file given
    # as first argument, one per line.
    import sys

    try:
        filepath = Path(sys.argv[1])
    except IndexError:
        # The backslash is escaped explicitly: "\d" is not a valid escape
        # sequence and triggers a warning on recent Python versions.
        print("Erreur: vous devez passer le chemin d'un fichier en parametre\nExemple:\n python xmlparser.py c:\\dict.xml")
        sys.exit(1)

    # Iterating a dict yields its keys.
    print("\n".join(parse(filepath)))
|