''' Python 3.7+ @author: olivier.massot, sept 2018

Checks a delivery (folder or zip archive of shapefiles) against the rules
declared in a YAML "checker" configuration, and builds a JSON-serializable
Report of every error found.
'''
from datetime import datetime
import json
import logging
import subprocess  # @UnusedImport
import tempfile  # @UnusedImport
import zipfile

from path import Path, TempDir
import pypyodbc
import shapefile
import yaml

from core import logconf
from core.constants import MAIN

logger = logging.getLogger("datachecker")
logconf.start("datachecker", logging.INFO)

pypyodbc.lowercase = False
logger.disabled = True

# TODO: check the projection (requires GDAL/OGR)
# TODO: special custom-check functions


class ReportField():
    """Validation result for a single field of a record."""

    def __init__(self, name, value="", error=""):
        self.name = name
        self.value = value
        self._error = error
        self._valid = True

    @property
    def error(self):
        return self._error

    @error.setter
    def error(self, error):
        # Setting an error marks the field invalid and logs it immediately.
        self._valid = False
        logger.error("%s - %s ('%s')", self.name, error, self.value)
        self._error = error

    @property
    def valid(self):
        return self._valid

    def to_dict(self):
        """Return a JSON-serializable representation of the field."""
        return {
            "name": self.name,
            "value": self.value,
            "error": self._error,
            "valid": self._valid,
        }


class ReportRecord():
    """Validation result for one record (row) of a file."""

    def __init__(self, index):
        self.index = index
        self._valid = True
        self._errors = []   # record-level errors (e.g. geometry out of bounds)
        self._fields = []   # per-field ReportField results

    @property
    def errors(self):
        return self._errors

    def add_error(self, error):
        """Record a record-level error; invalidates the record and logs it."""
        self._valid = False
        logger.error("Ligne %s - %s", self.index, error)
        self._errors.append(error)

    @property
    def valid(self):
        return self._valid

    @property
    def fields(self):
        return self._fields

    def add_field(self, field):
        """Attach a ReportField; an invalid field invalidates the record."""
        if not field.valid:
            self._valid = False
        self._fields.append(field)

    def to_dict(self):
        """Return a JSON-serializable representation of the record."""
        return {
            "index": self.index,
            "valid": self._valid,
            "errors": self._errors,
            "fields": [f.to_dict() for f in self._fields],
        }


class ReportFile():
    """Validation result for one checked file."""

    def __init__(self, file):
        self.file = file
        self.headers = []    # expected field names, from the checker model
        self._valid = True
        self._errors = []    # file-level errors (missing, unreadable, ...)
        self._records = []   # per-record ReportRecord results

    @property
    def valid(self):
        return self._valid

    @property
    def records(self):
        return self._records

    def add_record(self, row):
        """Attach a ReportRecord; an invalid record invalidates the file."""
        if not row.valid:
            self._valid = False
        self._records.append(row)

    @property
    def errors(self):
        return self._errors

    def add_error(self, error):
        """Record a file-level error; invalidates the file and logs it."""
        self._valid = False
        logger.error("Fichier %s - %s", self.file, error)
        self._errors.append(error)

    def to_dict(self):
        """Return a JSON-serializable representation of the file report."""
        return {
            "file": self.file,
            "headers": self.headers,
            "valid": self.valid,
            "errors": self._errors,
            "records": [r.to_dict() for r in self._records],
        }


class Report():
    """Top-level report aggregating the per-file reports of one check run."""

    def __init__(self, title, report_files=None):
        # BUG FIX: 'report_files=[]' was a mutable default argument, shared
        # between every Report instance created without the argument.
        self.title = title
        self.report_files = [] if report_files is None else report_files

    @property
    def valid(self):
        return all([r.valid for r in self.report_files])

    def to_dict(self):
        """Return a JSON-serializable representation of the whole report."""
        return {
            "title": self.title,
            "report_files": [rf.to_dict() for rf in self.report_files],
            "valid": self.valid,
        }

    def to_json(self):
        return json.dumps(self.to_dict())


def check(subject, checker):
    """Check a folder or a zip archive against the rules of a checker file.

    *subject* is either a directory of shapefiles or a ``.zip`` archive;
    *checker* is the YAML configuration. Returns a Report.
    Raises IOError when *subject* is neither a file nor a directory.
    """
    subject, checker = Path(subject), Path(checker)
    if subject.isfile():
        with TempDir() as dirname:
            # Context manager guarantees the archive handle is closed even
            # if extraction fails (the original leaked it on error).
            with zipfile.ZipFile(subject, 'r') as zip_ref:
                zip_ref.extractall(dirname)
            if Path(dirname / subject.stem).isdir():
                # the archive contains a folder which itself contains the files
                dirname /= subject.stem
            return check_folder(dirname, checker)
    elif subject.isdir():
        return check_folder(subject, checker)
    else:
        raise IOError(f"Impossible de trouver le fichier ou répertoire: {subject}")


def check_folder(folder, checker):
    """Check every file declared in the checker config inside *folder*.

    Returns a Report; errors never abort the run, they are recorded.
    """
    logger.info("***** Traitement de '%s' *****", folder.name)
    logger.info("> Controlleur: '%s'", checker.name)
    report = Report("Contrôle des données de {} au format {}".format(folder.name, checker.stem))

    db_cache = {}
    with open(checker, "r") as cf:
        # safe_load: a config file must not be able to instantiate arbitrary
        # Python objects (yaml.load without Loader is deprecated and unsafe).
        config = yaml.safe_load(cf)

    # one ODBC connection per database declared in the config
    databases = {dbname: pypyodbc.connect(dsn)
                 for dbname, dsn in config.get("databases", {}).items()}
    try:
        for filename, model in config["files"].items():
            report_file = _check_file(folder / filename, model, config,
                                      databases, db_cache)
            # BUG FIX: the original 'continue'd on early errors (missing file,
            # unreadable file, wrong shape type, ...) BEFORE appending, so
            # those errors were silently dropped from the final report.
            report.report_files.append(report_file)
    finally:
        for cnn in databases.values():
            cnn.close()
    return report


def _check_file(path_, model, config, databases, db_cache):
    """Check a single shapefile against its *model*; always returns a ReportFile."""
    logger.info("* Traitement de %s", path_.name)
    report_file = ReportFile(path_.name)

    if not path_.isfile():
        report_file.add_error("Fichier introuvable")
        return report_file
    try:
        sf = shapefile.Reader(path_)
    except shapefile.ShapefileException:
        report_file.add_error("Fichier illisible")
        return report_file

    if "srid" in config:
        pass  # TODO: projection check requires GDAL/OGR

    # BUG FIX: the original used int(...) on the defaults, and
    # int(float("inf")) raises OverflowError whenever xmax/ymax is absent
    # from the config. Floats compare correctly with the coordinates.
    bounds = (float(config.get("xmin", 0)),
              float(config.get("xmax", float("inf"))),
              float(config.get("ymin", 0)),
              float(config.get("ymax", float("inf"))))

    if "shape_type" in model:
        shape_names = {1: "Point", 3: "Polyligne", 5: "Polygone"}
        if sf.shapeType != model["shape_type"]:
            report_file.add_error("Le fichier shapefile n'est pas de type {}".format(shape_names[model["shape_type"]]))
            return report_file

    records = sf.shapeRecords()
    # ROBUSTNESS: .get() instead of model["can_be_empty"], which raised
    # KeyError (a crash) when the key was absent and the file was empty.
    if not records and not model.get("can_be_empty", False):
        report_file.add_error("Le fichier shapefile ne contient aucune donnees")
        return report_file
    if "fields" not in model:
        return report_file

    fields = [f[0] for f in sf.fields if f[0] != 'DeletionFlag']
    report_file.headers = list(model["fields"].keys())

    for i, shape_record in enumerate(records):
        logger.info("\n> Enregistrement n°%s\n", i)
        report_file.add_record(
            _check_record(i, shape_record, fields, model, bounds,
                          databases, db_cache))
    return report_file


def _check_record(index, shape_record, fields, model, bounds, databases, db_cache):
    """Check one shape record: geometry bounds, then every declared field."""
    xmin, xmax, ymin, ymax = bounds
    report_record = ReportRecord(index)
    record_data = {field: shape_record.record[j] for j, field in enumerate(fields)}

    # PERF FIX: the original called sf.shapes()[i] (re-reading every shape of
    # the file) twice per record — accidental O(n²). shapeRecord.shape is the
    # same geometry, already loaded.
    try:
        x1, y1, x2, y2 = shape_record.shape.bbox
    except AttributeError:
        # point geometries have no bbox; use the single point for both corners
        x1, y1 = shape_record.shape.points[0]
        x2, y2 = x1, y1
    if not xmin <= x1 <= xmax or not xmin <= x2 <= xmax or \
            not ymin <= y1 <= ymax or not ymin <= y2 <= ymax:
        report_record.add_error("L'élément est situé hors de la zone géographique autorisée")

    for fieldname, fieldmodel in model["fields"].items():
        report_record.add_field(
            _check_field(fieldname, fieldmodel, record_data, databases, db_cache))
    return report_record


def _check_field(fieldname, fieldmodel, record_data, databases, db_cache):
    """Check one field value against its model; always returns a ReportField."""
    report_field = ReportField(fieldname)
    try:
        val = record_data[fieldname]
    except KeyError:
        # absent field: an error only when required (default: required)
        if fieldmodel.get("required", True):
            report_field.error = "Champs manquant"
        return report_field
    report_field.value = val

    type_ = fieldmodel.get("type", "str")
    if type_ == "int":
        try:
            int(val)
        except (TypeError, ValueError):
            report_field.error = "Valeur Invalide, un nombre entier est attendu"
            return report_field
    elif type_ == "float":
        try:
            float(val)
        except (TypeError, ValueError):
            report_field.error = "Valeur Invalide, un nombre décimal est attendu"
            return report_field
    elif type_ == "datetime":
        try:
            datetime.strptime(val, fieldmodel.get("date_format", "%d/%m/%Y"))
        except ValueError:
            report_field.error = "Valeur Invalide, une date est attendue"
            return report_field
    else:
        # only non-numeric, non-date fields get the emptiness check
        if not fieldmodel.get("allow_empty", False) and not val:
            report_field.error = "Ce champs ne peut pas être vide"
            return report_field

    if type_ == "str" and "max_len" in fieldmodel:
        if len(str(val)) > fieldmodel["max_len"]:
            report_field.error = "Trop long, la longueur max. est de {}".format(fieldmodel["max_len"])
            return report_field

    if "in_list" in fieldmodel:
        if val not in fieldmodel["in_list"]:
            report_field.error = "Valeur invalide, pas dans la liste"
            return report_field

    if "in_table" in fieldmodel:
        ref = fieldmodel["in_table"]
        key = (ref["db"], ref["table"], ref["field"])
        if key not in db_cache:
            # cache the distinct values per (db, table, field) — one query
            # for the whole run instead of one per record.
            cursor = databases[ref["db"]].cursor()
            # NOTE: table/field names come from the trusted checker config,
            # not from the checked data, so string formatting is acceptable.
            cursor.execute("SELECT DISTINCT {} FROM {};".format(ref["field"], ref["table"]))
            db_cache[key] = [row[0] for row in cursor.fetchall()]
        if val not in db_cache[key]:
            report_field.error = "Valeur invalide, pas dans la liste"
            return report_field
    return report_field


if __name__ == "__main__":
    logger.disabled = False
    subject = MAIN / "work" / "STURNO_192AP0_REC_COMPLEMENT_180822_OK.zip"
    checker = MAIN / "checkers" / "netgeo_v2-2_doe.yaml"
    report = check(subject, checker)
    with open(MAIN / "report.json", "w+") as fp:
        json.dump(report.to_dict(), fp)
    logger.info("-- Fin --")