# datachecker.py
  1. '''
  2. Python 3.7+
  3. @author: olivier.massot, sept 2018
  4. '''
  5. from datetime import datetime
  6. import json
  7. import logging
  8. import subprocess # @UnusedImport
  9. import tempfile # @UnusedImport
  10. import zipfile
  11. from path import Path, TempDir
  12. import pypyodbc
  13. import shapefile
  14. import yaml
  15. from core import logconf
  16. from core.constants import MAIN
# Module-level logging setup; logging goes through the project's logconf helper.
logger = logging.getLogger("datachecker")
logconf.start("datachecker", logging.INFO)
# Keep column names as returned by the driver (pypyodbc lowercases them by default).
pypyodbc.lowercase = False
# Silenced when imported as a library; re-enabled in the __main__ block below.
logger.disabled = True
# TODO: check the projection / SRID (needs GDAL/OGR)
# TODO: special-purpose validation functions
  23. class ReportField():
  24. def __init__(self, name, value = "", error=""):
  25. self.name = name
  26. self.value = value
  27. self._error = error
  28. self._valid = True
  29. @property
  30. def error(self):
  31. return self._error
  32. @error.setter
  33. def error(self, error):
  34. self._valid = False
  35. logger.error("%s - %s ('%s')", self.name, error, self.value)
  36. self._error = error
  37. @property
  38. def valid(self):
  39. return self._valid
  40. def to_dict(self):
  41. return {
  42. "name": self.name,
  43. "value": self.value,
  44. "error": self._error,
  45. "valid": self._valid
  46. }
  47. class ReportRecord():
  48. def __init__(self, index):
  49. self.index = index
  50. self._valid = True
  51. self._errors = []
  52. self._fields = []
  53. @property
  54. def errors(self):
  55. return self._errors
  56. def add_error(self, error):
  57. self._valid = False
  58. logger.error("Ligne %s - %s", self.index, error)
  59. self._errors.append(error)
  60. @property
  61. def valid(self):
  62. return self._valid
  63. @property
  64. def fields(self):
  65. return self._fields
  66. def add_field(self, field):
  67. if not field.valid:
  68. self._valid=False
  69. self._fields.append(field)
  70. def to_dict(self):
  71. return {
  72. "index": self.index,
  73. "valid": self._valid,
  74. "errors": self._errors,
  75. "fields": [f.to_dict() for f in self._fields]
  76. }
  77. class ReportFile():
  78. def __init__(self, file):
  79. self.file = file
  80. self.headers = []
  81. self._valid = True
  82. self._errors = []
  83. self._records = []
  84. @property
  85. def valid(self):
  86. return self._valid
  87. @property
  88. def records(self):
  89. return self._records
  90. def add_record(self, row):
  91. if not row.valid:
  92. self._valid=False
  93. self._records.append(row)
  94. @property
  95. def errors(self):
  96. return self._errors
  97. def add_error(self, error):
  98. self._valid=False
  99. logger.error("Fichier %s - %s", self.file, error)
  100. self._errors.append(error)
  101. def to_dict(self):
  102. return {
  103. "file": self.file,
  104. "headers": self.headers,
  105. "valid": self.valid,
  106. "errors": self._errors,
  107. "records": [r.to_dict() for r in self._records]
  108. }
  109. class Report():
  110. def __init__(self, title, report_files=[]):
  111. self.title = title
  112. self.report_files = report_files
  113. @property
  114. def valid(self):
  115. return all([r.valid for r in self.report_files])
  116. def to_dict(self):
  117. return {
  118. "title": self.title,
  119. "report_files": [rf.to_dict() for rf in self.report_files],
  120. "valid": self.valid
  121. }
  122. def to_json(self):
  123. return json.dumps(self.to_dict())
  124. def check(subject, checker):
  125. """ prends un dossier ou une archive en entier et vérifie son contenu selon les règles données par le fichier de config """
  126. subject, checker = Path(subject), Path(checker)
  127. if subject.isfile():
  128. with TempDir() as dirname:
  129. zip_ref = zipfile.ZipFile(subject, 'r')
  130. zip_ref.extractall(dirname)
  131. zip_ref.close()
  132. if Path(dirname / subject.stem).isdir(): # cas où l'archive contient un dossier qui lui-même contient les fichiers
  133. dirname /= subject.stem
  134. return check_folder(dirname, checker)
  135. elif subject.isdir():
  136. return check_folder(subject, checker)
  137. else:
  138. raise IOError(f"Impossible de trouver le fichier ou répertoire: {subject}")
  139. def check_folder(folder, checker):
  140. logger.info("***** Traitement de '%s' *****", folder.name)
  141. logger.info("> Controlleur: '%s'", checker.name)
  142. report = Report("Contrôle des données de {} au format {}".format(folder.name, checker.stem))
  143. db_cache = {}
  144. with open(checker, "r") as cf:
  145. config = yaml.load(cf)
  146. databases = {}
  147. for dbname, dsn in config.get("databases", {}).items():
  148. cnn = pypyodbc.connect(dsn)
  149. databases[dbname] = cnn
  150. for filename, model in config["files"].items():
  151. path_ = folder / filename
  152. logger.info("* Traitement de %s", path_.name)
  153. report_file = ReportFile(path_.name)
  154. if not path_.isfile():
  155. report_file.add_error("Fichier introuvable")
  156. continue
  157. try:
  158. sf = shapefile.Reader(path_)
  159. except shapefile.ShapefileException:
  160. report_file.add_error("Fichier illisible")
  161. continue
  162. if "srid" in config:
  163. pass
  164. xmin, xmax, ymin, ymax = (int(config.get("xmin", 0)),
  165. int(config.get("xmax", float("inf"))),
  166. int(config.get("ymin", 0)),
  167. int(config.get("ymax", float("inf")))
  168. )
  169. if "shape_type" in model:
  170. shape_names = {1:"Point", 3:"Polyligne", 5:"Polygone"}
  171. if sf.shapeType != model["shape_type"]:
  172. report_file.add_error("Le fichier shapefile n'est pas de type {}".format(shape_names[model["shape_type"]]))
  173. del sf
  174. continue
  175. records = sf.shapeRecords()
  176. if not records and not model["can_be_empty"]:
  177. report_file.add_error("Le fichier shapefile ne contient aucune donnees")
  178. del sf, records
  179. continue
  180. if not "fields" in model:
  181. continue
  182. fields = [f[0] for f in sf.fields if f[0] != 'DeletionFlag']
  183. report_file.headers = list(model["fields"].keys())
  184. # parcours et controle des enregistrements
  185. for i, record in enumerate(records):
  186. logger.info("\n> Enregistrement n°%s\n", i)
  187. report_record = ReportRecord(i)
  188. record_data = {field: record.record[i] for i, field in enumerate(fields)}
  189. try:
  190. x1, y1, x2, y2 = sf.shapes()[i].bbox
  191. except AttributeError:
  192. x1, y1 = sf.shapes()[i].points[0]
  193. x2, y2 = x1, y1
  194. if not xmin <= x1 <= xmax or not xmin <= x2 <= xmax or \
  195. not ymin <= y1 <= ymax or not ymin <= y2 <= ymax:
  196. report_record.add_error("L'élément est situé hors de la zone géographique autorisée")
  197. for fieldname, fieldmodel in model["fields"].items():
  198. report_field = ReportField(fieldname)
  199. try:
  200. val = record_data[fieldname]
  201. except KeyError:
  202. if fieldmodel.get("required", True):
  203. report_field.error = "Champs manquant"
  204. report_record.add_field(report_field)
  205. continue
  206. report_field.value = val
  207. type_ = fieldmodel.get("type", "str")
  208. if type_ == "int":
  209. try:
  210. _ = int(val)
  211. except (TypeError, ValueError):
  212. report_field.error = "Valeur Invalide, un nombre entier est attendu"
  213. report_record.add_field(report_field)
  214. continue
  215. elif type_ == "float":
  216. try:
  217. _ = float(val)
  218. except (TypeError, ValueError):
  219. report_field.error = "Valeur Invalide, un nombre décimal est attendu"
  220. report_record.add_field(report_field)
  221. continue
  222. elif type_ == "datetime":
  223. try:
  224. _ = datetime.strptime(val, fieldmodel.get("date_format", "%d/%m/%Y"))
  225. except ValueError:
  226. report_field.error = "Valeur Invalide, une date est attendue"
  227. report_record.add_field(report_field)
  228. continue
  229. else:
  230. if not fieldmodel.get("allow_empty", False) and not val:
  231. report_field.error = "Ce champs ne peut pas être vide"
  232. report_record.add_field(report_field)
  233. continue
  234. if type_ == "str" and "max_len" in fieldmodel:
  235. if len(str(val)) > fieldmodel["max_len"]:
  236. report_field.error = "Trop long, la longueur max. est de {}".format(fieldmodel["max_len"])
  237. report_record.add_field(report_field)
  238. continue
  239. try:
  240. if not val in fieldmodel["in_list"]:
  241. report_field.error = "Valeur invalide, pas dans la liste"
  242. report_record.add_field(report_field)
  243. continue
  244. except KeyError:
  245. pass
  246. if "in_table" in fieldmodel:
  247. key = tuple([fieldmodel["in_table"]["db"], fieldmodel["in_table"]["table"], fieldmodel["in_table"]["field"]])
  248. if not key in db_cache:
  249. db = databases[fieldmodel["in_table"]["db"]]
  250. cursor = db.cursor()
  251. cursor.execute("SELECT DISTINCT {} FROM {};".format(fieldmodel["in_table"]["field"], fieldmodel["in_table"]["table"]))
  252. rows = [val[0] for val in cursor.fetchall()]
  253. db_cache[key] = rows
  254. if not val in db_cache[key]:
  255. report_field.error = "Valeur invalide, pas dans la liste"
  256. report_record.add_field(report_field)
  257. continue
  258. report_record.add_field(report_field)
  259. report_file.add_record(report_record)
  260. report.report_files.append(report_file)
  261. del sf, records
  262. return report
if __name__ == "__main__":
    # Ad-hoc manual run: validate one delivery archive against one checker
    # configuration and dump the JSON report under MAIN.
    logger.disabled = False
    subject = MAIN / "work" / "STURNO_192AP0_REC_COMPLEMENT_180822_OK.zip"
    checker = MAIN / "checkers" / "netgeo_v2-2_doe.yaml"
    report = check(subject, checker)
    with open(MAIN / "report.json", "w+") as fp:
        json.dump(report.to_dict(), fp)
    logger.info("-- Fin --")