xmlparser.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
'''
XML file parser.
Provides the data as a dictionary of the form
{'root.child1.child2.node': 'value'}
@author: olivier.massot, Sept. 2017
'''
  7. import re
  8. from lxml import etree # @UnresolvedImport
  9. from path import Path
# Captures whatever follows a "{http://...}" or "{https://...}" prefix in a
# tag name; used by _clean_tag() to drop that prefix.
_regex = re.compile(r"(?:\{https?:\/\/.*\})(.*)")
# Parsed trees, keyed by the filepath value handed to _parse().
CACHE = {}
  12. class XmlError(Exception):
  13. pass
  14. def _clean_tag(tag):
  15. """ retire un eventuel schema {http://*} en debut de chaine """
  16. try:
  17. return _regex.search(tag).group(1)
  18. except:
  19. return tag
  20. def _parse(filepath):
  21. if not filepath in CACHE:
  22. try:
  23. tree = etree.parse(filepath)
  24. CACHE[filepath] = tree
  25. except etree.XMLSyntaxError:
  26. raise XmlError("Le fichier XML contient des erreurs! ('{}')".format(filepath.name))
  27. return CACHE[filepath]
  28. def getroottag(filepath):
  29. return _clean_tag(_parse(filepath).getroot().tag)
  30. def first_children_tags(filepath):
  31. return [_clean_tag(elt.tag) for elt in _parse(filepath).getroot()]
  32. def parse(filepath, start=""):
  33. """ parse un fichier Xml et retourne un dictionnaire
  34. de la forme {"node1.node2.element.tag": value}
  35. """
  36. tree = _parse(filepath)
  37. root = tree.getroot()
  38. data = {}
  39. def _iter(elt, in_start_node=False, breadcrumb=tuple()):
  40. if in_start_node:
  41. breadcrumb += (_clean_tag(elt.tag),)
  42. if not start or _clean_tag(elt.tag) == start:
  43. in_start_node = True
  44. for child in elt:
  45. if not len(child) > 0 and in_start_node:
  46. key = ".".join(breadcrumb + (_clean_tag(child.tag),))
  47. data[key] = child.text
  48. else:
  49. _iter(child, in_start_node, breadcrumb)
  50. _iter(root)
  51. return data
  52. if __name__ == "__main__":
  53. import sys
  54. try:
  55. filepath = Path(sys.argv[1])
  56. except IndexError:
  57. print("Erreur: vous devez passer le chemin d'un fichier en parametre\nExemple:\n python xmlparser.py c:\dict.xml")
  58. sys.exit(1)
  59. print("\n".join(list(parse(filepath).keys())))