"""XML file parser.

Provides the content of an XML file as a flat dictionary of the form
{'child1.child2.node': 'value'}, where each key is the dotted path of a
leaf element.

@author: olivier.massot, sept. 2017
"""
import re

from lxml import etree  # @UnresolvedImport
from path import Path

# Captures whatever follows a leading "{http://...}" / "{https://...}"
# namespace prefix in a qualified tag name.
_regex = re.compile(r"(?:\{https?:\/\/.*\})(.*)")

# Parsed trees, keyed by the file path they were loaded from.
CACHE = {}


class XmlError(Exception):
    """Raised when an XML file cannot be parsed because of syntax errors."""


def _clean_tag(tag):
    """Strip a leading ``{http://*}`` namespace prefix from a tag name.

    Values that are not strings (lxml uses factory functions as the ``tag``
    of comments and processing instructions) and strings without such a
    prefix are returned unchanged.
    """
    if not isinstance(tag, str):
        return tag
    match = _regex.search(tag)
    if match is None:
        return tag
    return match.group(1)


def _parse(filepath):
    """Return the parsed tree for ``filepath``, parsing it on first use.

    Successfully parsed trees are kept in ``CACHE`` so that each file is
    parsed only once.

    Raises:
        XmlError: if lxml reports a syntax error in the file.
    """
    if filepath not in CACHE:
        try:
            CACHE[filepath] = etree.parse(filepath)
        except etree.XMLSyntaxError as err:
            # Plain string paths have no ``name`` attribute; fall back to
            # the path itself so the original error is not masked by an
            # AttributeError.
            name = getattr(filepath, "name", filepath)
            raise XmlError(
                "Le fichier XML contient des erreurs! ('{}')".format(name)
            ) from err
    return CACHE[filepath]


def getroottag(filepath):
    """Return the tag of the root element, without its namespace prefix."""
    return _clean_tag(_parse(filepath).getroot().tag)


def first_children_tags(filepath):
    """Return the cleaned tags of the direct children of the root element."""
    return [_clean_tag(elt.tag) for elt in _parse(filepath).getroot()]


def parse(filepath, start=""):
    """Parse an XML file and return a flat ``{"node1.node2.tag": text}`` dict.

    Every element without child elements becomes one entry whose key is the
    dot-joined path of cleaned tags leading to it and whose value is the
    element's ``text``. The path is relative to the collecting element: the
    root when ``start`` is empty, otherwise any element whose cleaned tag
    equals ``start``; the collecting element's own tag is not part of the
    key. When several leaves share the same path, the last one wins.

    Comments and processing instructions are ignored.

    Args:
        filepath: path of the XML file to read.
        start: optional tag name; only leaves located below an element with
            this tag are collected.

    Returns:
        dict mapping dotted paths to element text (``None`` for elements
        without text).

    Raises:
        XmlError: if the file contains XML syntax errors.
    """
    tree = _parse(filepath)
    root = tree.getroot()
    data = {}

    def _iter(elt, in_start_node=False, breadcrumb=()):
        tag = _clean_tag(elt.tag)
        # The breadcrumb only grows once we are already inside the
        # collecting element, which keeps that element's own tag out of
        # the generated keys.
        if in_start_node:
            breadcrumb += (tag,)
        if not start or tag == start:
            in_start_node = True

        for child in elt:
            if not isinstance(child.tag, str):
                # Comments and processing instructions expose a callable
                # as ``tag``; they carry no data and cannot be joined into
                # a key.
                continue
            if len(child) == 0 and in_start_node:
                key = ".".join(breadcrumb + (_clean_tag(child.tag),))
                data[key] = child.text
            else:
                _iter(child, in_start_node, breadcrumb)

    _iter(root)
    return data


if __name__ == "__main__":
    import sys

    try:
        filepath = Path(sys.argv[1])
    except IndexError:
        print("Erreur: vous devez passer le chemin d'un fichier en parametre\nExemple:\n python xmlparser.py c:\\dict.xml")
        sys.exit(1)

    # Iterating the dict yields its keys, one flattened path per line.
    print("\n".join(parse(filepath)))