- """Example of training spaCy's named entity recognizer, starting off with an
- existing model or a blank model.
- For more details, see the documentation:
- * Training: https://spacy.io/usage/training
- * NER: https://spacy.io/usage/linguistic-features#named-entities
- Compatible with: spaCy v2.0.0+
- """
import random
from pathlib import Path

import spacy
import fr_core_news_md

# training data
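# each annotation maps "entities" to (start_char, end_char, label) character
# offsets into the example's text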
TRAIN_DATA = [
    ('Qui est Georges Brassens?', {'entities': [(8, 24, 'PERSON')]}),
    ("J'aime Strasbourg et Avignon.", {'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')]}),
- ("J'aime Strasbourg et Avignon.", { 'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')] }),
]
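# optional sanity check (not part of the original example): each offset pair
# must slice out exactly the entity text, e.g.:
#     text, annotations = TRAIN_DATA[0]
#     start, end, label = annotations['entities'][0]
#     assert text[start:end] == 'Georges Brassens'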

MODEL_DIR = Path(__file__).parent / "data"


def main(n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if MODEL_DIR.exists():
        nlp = spacy.load(MODEL_DIR)  # load the previously saved spaCy model
    else:
        nlp = fr_core_news_md.load()  # first run: start from the packaged model
    print("Model loaded")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
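        # caution: begin_training reinitializes the model's weights; spaCy's
        # own examples only call it when starting from a blank model, and use
        # nlp.resume_training() (v2.1+) when fine-tuning an existing one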
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
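            # note: for anything beyond a toy dataset, spaCy's examples batch
            # these updates with spacy.util.minibatch, e.g.
            # minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)),
            # instead of calling nlp.update once per example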
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],         # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,       # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
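    # t.ent_iob is the integer IOB code: 3 = B(egins an entity),
    # 1 = I(nside an entity), 2 = O(utside any entity)
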
    # save model to output directory
    if not MODEL_DIR.exists():
        MODEL_DIR.mkdir()
    nlp.to_disk(MODEL_DIR)
    print("Saved model to", MODEL_DIR)


if __name__ == '__main__':
    main()

# Expected output:
# Entities [('Georges Brassens', 'PERSON')]
# Tokens [('Qui', '', 2), ('est', '', 2), ('Georges', 'PERSON', 3), ('Brassens', 'PERSON', 1), ('?', '', 2)]
# Entities [('Strasbourg', 'LOC'), ('Avignon', 'LOC')]
# Tokens [("J'", '', 2), ('aime', '', 2), ('Strasbourg', 'LOC', 3), ('et', '', 2), ('Avignon', 'LOC', 3), ('.', '', 2)]