# datachecker.py
  1. '''
  2. Python 3.7+
  3. @author: olivier.massot, sept 2018
  4. '''
  5. from datetime import datetime
  6. import json
  7. import logging
  8. import subprocess # @UnusedImport
  9. import tempfile # @UnusedImport
  10. import zipfile
  11. from path import Path, TempDir
  12. import pypyodbc
  13. import shapefile
  14. import yaml
  15. from core import logconf
  16. from core.constants import MAIN
# Module-level logger, configured through the project's logconf helper.
logger = logging.getLogger("datachecker")
logconf.start("datachecker", logging.INFO)
pypyodbc.lowercase = False  # keep database column names' original case
logger.disabled = True  # silent by default; re-enabled when run as a script
# TODO: check the projection (requires GDAL/OGR)
# TODO: special control functions
  23. class ReportField():
  24. def __init__(self, name, value = "", error=""):
  25. self.name = name
  26. self.value = value
  27. self._error = error
  28. self._valid = True
  29. @property
  30. def error(self):
  31. return self._error
  32. @error.setter
  33. def error(self, error):
  34. self._valid = False
  35. logger.error("%s - %s ('%s')", self.name, error, self.value)
  36. self._error = error
  37. @property
  38. def valid(self):
  39. return self._valid
  40. class ReportRecord():
  41. def __init__(self, index):
  42. self.index = index
  43. self._valid = True
  44. self._errors = []
  45. self._fields = []
  46. @property
  47. def errors(self):
  48. return self._errors
  49. def add_error(self, error):
  50. self._valid = False
  51. logger.error("Ligne %s - %s", self.index, error)
  52. self._errors.append(error)
  53. @property
  54. def valid(self):
  55. return self._valid
  56. @property
  57. def fields(self):
  58. return self._fields
  59. def add_field(self, field):
  60. if not field.valid:
  61. self._valid=False
  62. self._fields.append(field)
  63. class ReportFile():
  64. def __init__(self, file):
  65. self.file = file
  66. self.headers = []
  67. self._valid = True
  68. self._errors = []
  69. self._records = []
  70. @property
  71. def valid(self):
  72. return self._valid
  73. @property
  74. def records(self):
  75. return self._records
  76. def add_record(self, row):
  77. if not row.valid:
  78. self._valid=False
  79. self._records.append(row)
  80. @property
  81. def errors(self):
  82. return self._errors
  83. def add_error(self, error):
  84. self._valid=False
  85. logger.error("Fichier %s - %s", self.file, error)
  86. self._errors.append(error)
  87. class Report():
  88. def __init__(self, title, report_files=[]):
  89. self.title = title
  90. self.report_files = report_files
  91. @property
  92. def valid(self):
  93. return all([r.valid for r in self.report_files])
  94. def to_json(self):
  95. return json.dumps(self)
  96. def check(subject, checker):
  97. """ prends un dossier ou une archive en entier et vérifie son contenu selon les règles données par le fichier de config """
  98. subject, checker = Path(subject), Path(checker)
  99. if subject.isfile():
  100. with TempDir() as dirname:
  101. zip_ref = zipfile.ZipFile(subject, 'r')
  102. zip_ref.extractall(dirname)
  103. zip_ref.close()
  104. if Path(dirname / subject.stem).isdir(): # cas où l'archive contient un dossier qui lui-même contient les fichiers
  105. dirname /= subject.stem
  106. return check_folder(dirname, checker)
  107. elif subject.isdir():
  108. return check_folder(subject, checker)
  109. else:
  110. raise IOError(f"Impossible de trouver le fichier ou répertoire: {subject}")
  111. def check_folder(folder, checker):
  112. logger.info("***** Traitement de '%s' *****", folder.name)
  113. logger.info("> Controlleur: '%s'", checker.name)
  114. report = Report("Contrôle des données de {} au format {}".format(folder.name, checker.stem))
  115. db_cache = {}
  116. with open(checker, "r") as cf:
  117. config = yaml.load(cf)
  118. databases = {}
  119. for dbname, dsn in config.get("databases", {}).items():
  120. cnn = pypyodbc.connect(dsn)
  121. databases[dbname] = cnn
  122. for filename, model in config["files"].items():
  123. path_ = folder / filename
  124. logger.info("* Traitement de %s", path_.name)
  125. report_file = ReportFile(path_.name)
  126. if not path_.isfile():
  127. report_file.add_error("Fichier introuvable")
  128. continue
  129. try:
  130. sf = shapefile.Reader(path_)
  131. except shapefile.ShapefileException:
  132. report_file.add_error("Fichier illisible")
  133. continue
  134. if "srid" in config:
  135. pass
  136. xmin, xmax, ymin, ymax = (int(config.get("xmin", 0)),
  137. int(config.get("xmax", float("inf"))),
  138. int(config.get("ymin", 0)),
  139. int(config.get("ymax", float("inf")))
  140. )
  141. if "shape_type" in model:
  142. shape_names = {1:"Point", 3:"Polyligne", 5:"Polygone"}
  143. if sf.shapeType != model["shape_type"]:
  144. report_file.add_error("Le fichier shapefile n'est pas de type {}".format(shape_names[model["shape_type"]]))
  145. del sf
  146. continue
  147. records = sf.shapeRecords()
  148. if not records and not model["can_be_empty"]:
  149. report_file.add_error("Le fichier shapefile ne contient aucune donnees")
  150. del sf, records
  151. continue
  152. if not "fields" in model:
  153. continue
  154. fields = [f[0] for f in sf.fields if f[0] != 'DeletionFlag']
  155. report_file.headers = list(model["fields"].keys())
  156. # parcours et controle des enregistrements
  157. for i, record in enumerate(records):
  158. logger.info("\n> Enregistrement n°%s\n", i)
  159. report_record = ReportRecord(i)
  160. record_data = {field: record.record[i] for i, field in enumerate(fields)}
  161. try:
  162. x1, y1, x2, y2 = sf.shapes()[i].bbox
  163. except AttributeError:
  164. x1, y1 = sf.shapes()[i].points[0]
  165. x2, y2 = x1, y1
  166. if not xmin <= x1 <= xmax or not xmin <= x2 <= xmax or \
  167. not ymin <= y1 <= ymax or not ymin <= y2 <= ymax:
  168. report_record.add_error("L'élément est situé hors de la zone géographique autorisée")
  169. for fieldname, fieldmodel in model["fields"].items():
  170. report_field = ReportField(fieldname)
  171. try:
  172. val = record_data[fieldname]
  173. except KeyError:
  174. if fieldmodel.get("required", True):
  175. report_field.error = "Champs manquant"
  176. report_record.add_field(report_field)
  177. continue
  178. report_field.value = val
  179. type_ = fieldmodel.get("type", "str")
  180. if type_ == "int":
  181. try:
  182. _ = int(val)
  183. except (TypeError, ValueError):
  184. report_field.error = "Valeur Invalide, un nombre entier est attendu"
  185. report_record.add_field(report_field)
  186. continue
  187. elif type_ == "float":
  188. try:
  189. _ = float(val)
  190. except (TypeError, ValueError):
  191. report_field.error = "Valeur Invalide, un nombre décimal est attendu"
  192. report_record.add_field(report_field)
  193. continue
  194. elif type_ == "datetime":
  195. try:
  196. _ = datetime.strptime(val, fieldmodel.get("date_format", "%d/%m/%Y"))
  197. except ValueError:
  198. report_field.error = "Valeur Invalide, une date est attendue"
  199. report_record.add_field(report_field)
  200. continue
  201. else:
  202. if not fieldmodel.get("allow_empty", False) and not val:
  203. report_field.error = "Ce champs ne peut pas être vide"
  204. report_record.add_field(report_field)
  205. continue
  206. if type_ == "str" and "max_len" in fieldmodel:
  207. if len(str(val)) > fieldmodel["max_len"]:
  208. report_field.error = "Trop long, la longueur max. est de {}".format(fieldmodel["max_len"])
  209. report_record.add_field(report_field)
  210. continue
  211. try:
  212. if not val in fieldmodel["in_list"]:
  213. report_field.error = "Valeur invalide, pas dans la liste"
  214. report_record.add_field(report_field)
  215. continue
  216. except KeyError:
  217. pass
  218. if "in_table" in fieldmodel:
  219. key = tuple([fieldmodel["in_table"]["db"], fieldmodel["in_table"]["table"], fieldmodel["in_table"]["field"]])
  220. if not key in db_cache:
  221. db = databases[fieldmodel["in_table"]["db"]]
  222. cursor = db.cursor()
  223. cursor.execute("SELECT DISTINCT {} FROM {};".format(fieldmodel["in_table"]["field"], fieldmodel["in_table"]["table"]))
  224. rows = [val[0] for val in cursor.fetchall()]
  225. db_cache[key] = rows
  226. if not val in db_cache[key]:
  227. report_field.error = "Valeur invalide, pas dans la liste"
  228. report_record.add_field(report_field)
  229. continue
  230. report_record.add_field(report_field)
  231. report_file.add_record(report_record)
  232. report.report_files.append(report_file)
  233. del sf, records
  234. return report
if __name__ == "__main__":
    # Manual entry point: check one delivery archive against the
    # NetGeo v2.2 DOE checker; logging is re-enabled for the run.
    logger.disabled = False
    subject = MAIN / "work" / "SCOPELEC_CAP_097AP0_REC_180829_OK.zip"
    checker = MAIN / "checkers" / "netgeo_v2-2_doe.yaml"
    report = check(subject, checker)
    logger.info("-- Fin --")