# train.py
  1. """Example of training spaCy's named entity recognizer, starting off with an
  2. existing model or a blank model.
  3. For more details, see the documentation:
  4. * Training: https://spacy.io/usage/training
  5. * NER: https://spacy.io/usage/linguistic-features#named-entities
  6. Compatible with: spaCy v2.0.0+
  7. """
  8. import random
  9. import fr_core_news_md
  10. from path import Path
  11. import spacy
  12. # training data
  13. TRAIN_DATA = [
  14. ('Qui est Georges Brassens?', { 'entities': [(8, 24, 'PERSON')] }),
  15. ("J'aime Strasbourg et Avignon.", { 'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')] }),
  16. ("J'aime Strasbourg et Avignon.", { 'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')] }),
  17. ]
# Directory next to this script that holds the spaCy model to load and
# where the updated model is written back after training.
MODEL_DIR = Path(__file__).parent / "data"
  19. def main(n_iter=100):
  20. """Load the model, set up the pipeline and train the entity recognizer."""
  21. nlp = spacy.load(MODEL_DIR) # load existing spaCy model @UndefinedVariable
  22. print("Model loaded")
  23. # create the built-in pipeline components and add them to the pipeline
  24. # nlp.create_pipe works for built-ins that are registered with spaCy
  25. if 'ner' not in nlp.pipe_names:
  26. ner = nlp.create_pipe('ner')
  27. nlp.add_pipe(ner, last=True)
  28. # otherwise, get it so we can add labels
  29. else:
  30. ner = nlp.get_pipe('ner')
  31. # add labels
  32. for _, annotations in TRAIN_DATA:
  33. for ent in annotations.get('entities'):
  34. ner.add_label(ent[2])
  35. # get names of other pipes to disable them during training
  36. other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  37. with nlp.disable_pipes(*other_pipes): # only train NER
  38. optimizer = nlp.begin_training()
  39. for _ in range(n_iter):
  40. random.shuffle(TRAIN_DATA)
  41. losses = {}
  42. for text, annotations in TRAIN_DATA:
  43. nlp.update(
  44. [text], # batch of texts
  45. [annotations], # batch of annotations
  46. drop=0.5, # dropout - make it harder to memorise data
  47. sgd=optimizer, # callable to update weights
  48. losses=losses)
  49. print(losses)
  50. # test the trained model
  51. for text, _ in TRAIN_DATA:
  52. doc = nlp(text)
  53. print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
  54. print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
  55. # save model to output directory
  56. if not MODEL_DIR.exists():
  57. MODEL_DIR.mkdir()
  58. nlp.to_disk(MODEL_DIR)
  59. print("Saved model to", MODEL_DIR)
  60. if __name__ == '__main__':
  61. main()
# Expected output:
# Entities [('Georges Brassens', 'PERSON')]
# Tokens [('Qui', '', 2), ('est', '', 2), ('Georges', 'PERSON', 3), ('Brassens', 'PERSON', 1), ('?', '', 2)]
# Entities [('Strasbourg', 'LOC'), ('Avignon', 'LOC')]
# Tokens [("J'", '', 2), ('aime', '', 2), ('Strasbourg', 'LOC', 3), ('et', '', 2), ('Avignon', 'LOC', 3), ('.', '', 2)]