Explorar el Código

Suite du debroussaillage

olivier.massot hace 7 años
padre
commit
6d65f5881f

+ 0 - 0
memory/__init__.py


+ 80 - 0
memory/train.py

@@ -0,0 +1,80 @@
+"""Example of training spaCy's named entity recognizer, starting off with an
+existing model or a blank model.
+For more details, see the documentation:
+* Training: https://spacy.io/usage/training
+* NER: https://spacy.io/usage/linguistic-features#named-entities
+Compatible with: spaCy v2.0.0+
+"""
+import random
+
+import fr_core_news_md
+from path import Path
+import spacy
+
+
+# training data
+TRAIN_DATA = [
+    ('Qui est Georges Brassens?', { 'entities': [(8, 24, 'PERSON')] }),
+    ("J'aime Strasbourg et Avignon.", { 'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')] }),
+    ("J'aime Strasbourg et Avignon.", { 'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')] }),
+]
+
+MODEL_DIR = Path(__file__).parent / "data"
+
+
+def main(n_iter=100):
+    """Load the model, set up the pipeline and train the entity recognizer."""
+    nlp = spacy.load(MODEL_DIR)  # load existing spaCy model @UndefinedVariable
+    print("Model loaded")
+
+    # create the built-in pipeline components and add them to the pipeline
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'ner' not in nlp.pipe_names:
+        ner = nlp.create_pipe('ner')
+        nlp.add_pipe(ner, last=True)
+    # otherwise, get it so we can add labels
+    else:
+        ner = nlp.get_pipe('ner')
+
+    # add labels
+    for _, annotations in TRAIN_DATA:
+        for ent in annotations.get('entities'):
+            ner.add_label(ent[2])
+
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
+    with nlp.disable_pipes(*other_pipes):  # only train NER
+        optimizer = nlp.begin_training()
+        for _ in range(n_iter):
+            random.shuffle(TRAIN_DATA)
+            losses = {}
+            for text, annotations in TRAIN_DATA:
+                nlp.update(
+                    [text],  # batch of texts
+                    [annotations],  # batch of annotations
+                    drop=0.5,  # dropout - make it harder to memorise data
+                    sgd=optimizer,  # callable to update weights
+                    losses=losses)
+            print(losses)
+
+    # test the trained model
+    for text, _ in TRAIN_DATA:
+        doc = nlp(text)
+        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
+        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
+
+    # save model to output directory
+    if not MODEL_DIR.exists():
+        MODEL_DIR.mkdir()
+    nlp.to_disk(MODEL_DIR)
+    print("Saved model to", MODEL_DIR)
+
+
+if __name__ == '__main__':
+    # Run the training with the default number of iterations.
+    main()
+
+    # Expected output, as recorded by the author: main() prints one
+    # Entities/Tokens pair per training text; each token triple is
+    # (text, ent_type_, ent_iob).
+    # Entities [('Georges Brassens', 'PERSON')]
+    # Tokens [('Qui', '', 2), ('est', '', 2), ('Georges', 'PERSON', 3), ('Brassens', 'PERSON', 1), ('?', '', 2)]
+    # Entities [('Strasbourg', 'LOC'), ('Avignon', 'LOC')]
+    # Tokens [("J'", '', 2), ('aime', '', 2), ('Strasbourg', 'LOC', 3), ('et', '', 2), ('Avignon', 'LOC', 3), ('.', '', 2)]

+ 641 - 0
memory/training_data_example.json

@@ -0,0 +1,641 @@
+[
+    {
+      "id": "wsj_0200",
+      "paragraphs": [
+        {
+          "raw": "In an Oct. 19 review of \"The Misanthrope\" at Chicago's Goodman Theatre (\"Revitalized Classics Take the Stage in Windy City,\" Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag. Ms. Haag plays Elianti.",
+          "sentences": [
+            {
+              "tokens": [
+                {
+                  "head": 44,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "In",
+                  "ner": "O",
+                  "id": 0
+                },
+                {
+                  "head": 3,
+                  "dep": "det",
+                  "tag": "DT",
+                  "orth": "an",
+                  "ner": "O",
+                  "id": 1
+                },
+                {
+                  "head": 2,
+                  "dep": "nmod",
+                  "tag": "NNP",
+                  "orth": "Oct.",
+                  "ner": "B-DATE",
+                  "id": 2
+                },
+                {
+                  "head": -1,
+                  "dep": "nummod",
+                  "tag": "CD",
+                  "orth": "19",
+                  "ner": "L-DATE",
+                  "id": 3
+                },
+                {
+                  "head": -4,
+                  "dep": "pobj",
+                  "tag": "NN",
+                  "orth": "review",
+                  "ner": "O",
+                  "id": 4
+                },
+                {
+                  "head": -1,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "of",
+                  "ner": "O",
+                  "id": 5
+                },
+                {
+                  "head": 2,
+                  "dep": "punct",
+                  "tag": "``",
+                  "orth": "``",
+                  "ner": "O",
+                  "id": 6
+                },
+                {
+                  "head": 1,
+                  "dep": "det",
+                  "tag": "DT",
+                  "orth": "The",
+                  "ner": "B-WORK_OF_ART",
+                  "id": 7
+                },
+                {
+                  "head": -3,
+                  "dep": "pobj",
+                  "tag": "NN",
+                  "orth": "Misanthrope",
+                  "ner": "L-WORK_OF_ART",
+                  "id": 8
+                },
+                {
+                  "head": -1,
+                  "dep": "punct",
+                  "tag": "''",
+                  "orth": "''",
+                  "ner": "O",
+                  "id": 9
+                },
+                {
+                  "head": -2,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "at",
+                  "ner": "O",
+                  "id": 10
+                },
+                {
+                  "head": 3,
+                  "dep": "poss",
+                  "tag": "NNP",
+                  "orth": "Chicago",
+                  "ner": "U-GPE",
+                  "id": 11
+                },
+                {
+                  "head": -1,
+                  "dep": "case",
+                  "tag": "POS",
+                  "orth": "'s",
+                  "ner": "O",
+                  "id": 12
+                },
+                {
+                  "head": 1,
+                  "dep": "compound",
+                  "tag": "NNP",
+                  "orth": "Goodman",
+                  "ner": "B-FAC",
+                  "id": 13
+                },
+                {
+                  "head": -4,
+                  "dep": "pobj",
+                  "tag": "NNP",
+                  "orth": "Theatre",
+                  "ner": "L-FAC",
+                  "id": 14
+                },
+                {
+                  "head": 4,
+                  "dep": "punct",
+                  "tag": "-LRB-",
+                  "orth": "(",
+                  "ner": "O",
+                  "id": 15
+                },
+                {
+                  "head": 3,
+                  "dep": "punct",
+                  "tag": "``",
+                  "orth": "``",
+                  "ner": "O",
+                  "id": 16
+                },
+                {
+                  "head": 1,
+                  "dep": "amod",
+                  "tag": "VBN",
+                  "orth": "Revitalized",
+                  "ner": "B-WORK_OF_ART",
+                  "id": 17
+                },
+                {
+                  "head": 1,
+                  "dep": "nsubj",
+                  "tag": "NNS",
+                  "orth": "Classics",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 18
+                },
+                {
+                  "head": -15,
+                  "dep": "appos",
+                  "tag": "VBP",
+                  "orth": "Take",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 19
+                },
+                {
+                  "head": 1,
+                  "dep": "det",
+                  "tag": "DT",
+                  "orth": "the",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 20
+                },
+                {
+                  "head": -2,
+                  "dep": "dobj",
+                  "tag": "NN",
+                  "orth": "Stage",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 21
+                },
+                {
+                  "head": -3,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "in",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 22
+                },
+                {
+                  "head": 1,
+                  "dep": "compound",
+                  "tag": "NNP",
+                  "orth": "Windy",
+                  "ner": "I-WORK_OF_ART",
+                  "id": 23
+                },
+                {
+                  "head": -2,
+                  "dep": "pobj",
+                  "tag": "NNP",
+                  "orth": "City",
+                  "ner": "L-WORK_OF_ART",
+                  "id": 24
+                },
+                {
+                  "head": -6,
+                  "dep": "punct",
+                  "tag": ",",
+                  "orth": ",",
+                  "ner": "O",
+                  "id": 25
+                },
+                {
+                  "head": -7,
+                  "dep": "punct",
+                  "tag": "''",
+                  "orth": "''",
+                  "ner": "O",
+                  "id": 26
+                },
+                {
+                  "head": -8,
+                  "dep": "npadvmod",
+                  "tag": "NN",
+                  "orth": "Leisure",
+                  "ner": "B-ORG",
+                  "id": 27
+                },
+                {
+                  "head": -1,
+                  "dep": "cc",
+                  "tag": "CC",
+                  "orth": "&",
+                  "ner": "I-ORG",
+                  "id": 28
+                },
+                {
+                  "head": -2,
+                  "dep": "conj",
+                  "tag": "NNS",
+                  "orth": "Arts",
+                  "ner": "L-ORG",
+                  "id": 29
+                },
+                {
+                  "head": -11,
+                  "dep": "punct",
+                  "tag": "-RRB-",
+                  "orth": ")",
+                  "ner": "O",
+                  "id": 30
+                },
+                {
+                  "head": 13,
+                  "dep": "punct",
+                  "tag": ",",
+                  "orth": ",",
+                  "ner": "O",
+                  "id": 31
+                },
+                {
+                  "head": 1,
+                  "dep": "det",
+                  "tag": "DT",
+                  "orth": "the",
+                  "ner": "O",
+                  "id": 32
+                },
+                {
+                  "head": 11,
+                  "dep": "nsubjpass",
+                  "tag": "NN",
+                  "orth": "role",
+                  "ner": "O",
+                  "id": 33
+                },
+                {
+                  "head": -1,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "of",
+                  "ner": "O",
+                  "id": 34
+                },
+                {
+                  "head": -1,
+                  "dep": "pobj",
+                  "tag": "NNP",
+                  "orth": "Celimene",
+                  "ner": "U-PERSON",
+                  "id": 35
+                },
+                {
+                  "head": -3,
+                  "dep": "punct",
+                  "tag": ",",
+                  "orth": ",",
+                  "ner": "O",
+                  "id": 36
+                },
+                {
+                  "head": -4,
+                  "dep": "acl",
+                  "tag": "VBN",
+                  "orth": "played",
+                  "ner": "O",
+                  "id": 37
+                },
+                {
+                  "head": -1,
+                  "dep": "agent",
+                  "tag": "IN",
+                  "orth": "by",
+                  "ner": "O",
+                  "id": 38
+                },
+                {
+                  "head": 1,
+                  "dep": "compound",
+                  "tag": "NNP",
+                  "orth": "Kim",
+                  "ner": "B-PERSON",
+                  "id": 39
+                },
+                {
+                  "head": -2,
+                  "dep": "pobj",
+                  "tag": "NNP",
+                  "orth": "Cattrall",
+                  "ner": "L-PERSON",
+                  "id": 40
+                },
+                {
+                  "head": -8,
+                  "dep": "punct",
+                  "tag": ",",
+                  "orth": ",",
+                  "ner": "O",
+                  "id": 41
+                },
+                {
+                  "head": 2,
+                  "dep": "auxpass",
+                  "tag": "VBD",
+                  "orth": "was",
+                  "ner": "O",
+                  "id": 42
+                },
+                {
+                  "head": 1,
+                  "dep": "advmod",
+                  "tag": "RB",
+                  "orth": "mistakenly",
+                  "ner": "O",
+                  "id": 43
+                },
+                {
+                  "head": 0,
+                  "dep": "root",
+                  "tag": "VBN",
+                  "orth": "attributed",
+                  "ner": "O",
+                  "id": 44
+                },
+                {
+                  "head": -1,
+                  "dep": "prep",
+                  "tag": "IN",
+                  "orth": "to",
+                  "ner": "O",
+                  "id": 45
+                },
+                {
+                  "head": 1,
+                  "dep": "compound",
+                  "tag": "NNP",
+                  "orth": "Christina",
+                  "ner": "B-PERSON",
+                  "id": 46
+                },
+                {
+                  "head": -2,
+                  "dep": "pobj",
+                  "tag": "NNP",
+                  "orth": "Haag",
+                  "ner": "L-PERSON",
+                  "id": 47
+                },
+                {
+                  "head": -4,
+                  "dep": "punct",
+                  "tag": ".",
+                  "orth": ".",
+                  "ner": "O",
+                  "id": 48
+                }
+              ],
+              "brackets": [
+                {
+                  "first": 2,
+                  "last": 3,
+                  "label": "NML"
+                },
+                {
+                  "first": 1,
+                  "last": 4,
+                  "label": "NP"
+                },
+                {
+                  "first": 7,
+                  "last": 8,
+                  "label": "NP-TTL"
+                },
+                {
+                  "first": 11,
+                  "last": 12,
+                  "label": "NP"
+                },
+                {
+                  "first": 11,
+                  "last": 14,
+                  "label": "NP"
+                },
+                {
+                  "first": 10,
+                  "last": 14,
+                  "label": "PP-LOC"
+                },
+                {
+                  "first": 6,
+                  "last": 14,
+                  "label": "NP"
+                },
+                {
+                  "first": 5,
+                  "last": 14,
+                  "label": "PP"
+                },
+                {
+                  "first": 1,
+                  "last": 14,
+                  "label": "NP"
+                },
+                {
+                  "first": 17,
+                  "last": 18,
+                  "label": "NP-SBJ"
+                },
+                {
+                  "first": 20,
+                  "last": 21,
+                  "label": "NP"
+                },
+                {
+                  "first": 23,
+                  "last": 24,
+                  "label": "NP"
+                },
+                {
+                  "first": 22,
+                  "last": 24,
+                  "label": "PP-LOC"
+                },
+                {
+                  "first": 19,
+                  "last": 24,
+                  "label": "VP"
+                },
+                {
+                  "first": 17,
+                  "last": 24,
+                  "label": "S-HLN"
+                },
+                {
+                  "first": 27,
+                  "last": 29,
+                  "label": "NP-TMP"
+                },
+                {
+                  "first": 15,
+                  "last": 30,
+                  "label": "NP"
+                },
+                {
+                  "first": 1,
+                  "last": 30,
+                  "label": "NP"
+                },
+                {
+                  "first": 0,
+                  "last": 30,
+                  "label": "PP-LOC"
+                },
+                {
+                  "first": 32,
+                  "last": 33,
+                  "label": "NP"
+                },
+                {
+                  "first": 35,
+                  "last": 35,
+                  "label": "NP"
+                },
+                {
+                  "first": 34,
+                  "last": 35,
+                  "label": "PP"
+                },
+                {
+                  "first": 32,
+                  "last": 35,
+                  "label": "NP"
+                },
+                {
+                  "first": 39,
+                  "last": 40,
+                  "label": "NP-LGS"
+                },
+                {
+                  "first": 38,
+                  "last": 40,
+                  "label": "PP"
+                },
+                {
+                  "first": 37,
+                  "last": 40,
+                  "label": "VP"
+                },
+                {
+                  "first": 32,
+                  "last": 41,
+                  "label": "NP-SBJ-2"
+                },
+                {
+                  "first": 43,
+                  "last": 43,
+                  "label": "ADVP-MNR"
+                },
+                {
+                  "first": 46,
+                  "last": 47,
+                  "label": "NP"
+                },
+                {
+                  "first": 45,
+                  "last": 47,
+                  "label": "PP-CLR"
+                },
+                {
+                  "first": 44,
+                  "last": 47,
+                  "label": "VP"
+                },
+                {
+                  "first": 42,
+                  "last": 47,
+                  "label": "VP"
+                },
+                {
+                  "first": 0,
+                  "last": 48,
+                  "label": "S"
+                }
+              ]
+            },
+            {
+              "tokens": [
+                {
+                  "head": 1,
+                  "dep": "compound",
+                  "tag": "NNP",
+                  "orth": "Ms.",
+                  "ner": "O",
+                  "id": 0
+                },
+                {
+                  "head": 1,
+                  "dep": "nsubj",
+                  "tag": "NNP",
+                  "orth": "Haag",
+                  "ner": "U-PERSON",
+                  "id": 1
+                },
+                {
+                  "head": 0,
+                  "dep": "root",
+                  "tag": "VBZ",
+                  "orth": "plays",
+                  "ner": "O",
+                  "id": 2
+                },
+                {
+                  "head": -1,
+                  "dep": "dobj",
+                  "tag": "NNP",
+                  "orth": "Elianti",
+                  "ner": "U-PERSON",
+                  "id": 3
+                },
+                {
+                  "head": -2,
+                  "dep": "punct",
+                  "tag": ".",
+                  "orth": ".",
+                  "ner": "O",
+                  "id": 4
+                }
+              ],
+              "brackets": [
+                {
+                  "first": 0,
+                  "last": 1,
+                  "label": "NP-SBJ"
+                },
+                {
+                  "first": 3,
+                  "last": 3,
+                  "label": "NP"
+                },
+                {
+                  "first": 2,
+                  "last": 3,
+                  "label": "VP"
+                },
+                {
+                  "first": 0,
+                  "last": 4,
+                  "label": "S"
+                }
+              ]
+            }
+          ]
+        }
+      ]
+    }
+  ]

+ 0 - 8
requirements.txt

@@ -5,13 +5,5 @@ spacy
 ## Package moyen (env 130mo)
 # python -m spacy download fr_core_news_md
 
-## OU
-
-## Big package (env. 1.3go, pas testé avec spacy 2.0)
-# python -m spacy download fr_depvec_web_lg
-
-## Named entities
-# python -m spacy download xx_ent_wiki_sm
-
 # Microsoft Visual C++ 14.0
 

+ 14 - 0
resources/notes.txt

@@ -29,6 +29,20 @@ https://medium.com/artists-and-machine-intelligence/adventures-in-narrated-reali
 Apis
 http://developer.wordnik.com/
 
+
+CORPUS:
+https://www.ortolang.fr
+http://www.resourcebook.eu/
+https://vlo.clarin.eu
+
+VISUALISEURS:
+https://explosion.ai/demos/displacy-ent
+
+Training Tool:
+https://prodi.gy/
+
+
+
 DIVERS
 
 Nestor: taches de base

+ 0 - 0
visualizer/__init__.py


BIN
visualizer/exemple.PNG


+ 10 - 0
visualizer/vizualizer.py

@@ -0,0 +1,10 @@
+from spacy import displacy
+
+from core.nlp import nlp
+
+
+doc = nlp(u"J'ai trois beau enfants qui se nomment Nahé, Ancelo et Lino")
+
+options = {'compact': True, 'bg': '#09a3d5',
+           'color': 'white', 'font': 'Source Sans Pro'}
+displacy.serve(doc, style='dep', options=options)