"""Example of training spaCy's named entity recognizer on French sentences.

The script loads the existing model stored in ``MODEL_DIR``, makes sure the
pipeline contains an ``ner`` component, trains that component on
``TRAIN_DATA`` and writes the updated model back to ``MODEL_DIR``.

For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.0.0+
"""
import random

import fr_core_news_md  # not referenced below; kept because the original imports it
from path import Path
import spacy

# training data: (text, {'entities': [(start_char, end_char, label), ...]})
# Offsets are character positions into the text; the end offset is exclusive.
# NOTE(review): the last two examples are identical, so that sentence is seen
# twice per epoch and printed twice during evaluation -- confirm it is intended.
TRAIN_DATA = [
    ('Qui est Georges Brassens?', {
        'entities': [(8, 24, 'PERSON')]
    }),
    ("J'aime Strasbourg et Avignon.", {
        'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')]
    }),
    ("J'aime Strasbourg et Avignon.", {
        'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')]
    }),
]

# Directory the model is loaded from and saved back to.
MODEL_DIR = Path(__file__).parent / "data"


def main(n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        n_iter: Number of passes over ``TRAIN_DATA``.

    Side effects:
        Prints the losses after every pass, prints the entities and tokens
        predicted for each training text, and saves the model to
        ``MODEL_DIR``.
    """
    nlp = spacy.load(MODEL_DIR)  # load existing spaCy model @UndefinedVariable
    print("Model loaded")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels; an example without an 'entities' key contributes none
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order:
    # the evaluation loop below then prints in the documented order.
    examples = list(TRAIN_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): on newer spaCy v2 releases begin_training() may
        # re-initialise the weights of a loaded model -- confirm whether
        # resume_training() is wanted here.
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if not MODEL_DIR.exists():
        MODEL_DIR.mkdir()
    nlp.to_disk(MODEL_DIR)
    print("Saved model to", MODEL_DIR)


if __name__ == '__main__':
    main()

# Expected output:
# Entities [('Georges Brassens', 'PERSON')]
# Tokens [('Qui', '', 2), ('est', '', 2), ('Georges', 'PERSON', 3), ('Brassens', 'PERSON', 1), ('?', '', 2)]
# Entities [('Strasbourg', 'LOC'), ('Avignon', 'LOC')]
# Tokens [("J'", '', 2), ('aime', '', 2), ('Strasbourg', 'LOC', 3), ('et', '', 2), ('Avignon', 'LOC', 3), ('.', '', 2)]