|
|
@@ -0,0 +1,80 @@
|
|
|
+"""Example of training spaCy's named entity recognizer, starting off with an
|
|
|
+existing model or a blank model.
|
|
|
+For more details, see the documentation:
|
|
|
+* Training: https://spacy.io/usage/training
|
|
|
+* NER: https://spacy.io/usage/linguistic-features#named-entities
|
|
|
+Compatible with: spaCy v2.0.0+
|
|
|
+"""
|
|
|
+import random
|
|
|
+
|
|
|
+import fr_core_news_md
|
|
|
+from path import Path
|
|
|
+import spacy
|
|
|
+
|
|
|
+
|
|
|
+# training data
|
|
|
# Training data: (text, {'entities': [(start_char, end_char, label), ...]}).
# Offsets are character indices into the text; labels follow spaCy's scheme
# (PERSON, LOC, ...).
# NOTE: the original file listed the Strasbourg/Avignon sentence twice — an
# accidental copy-paste duplicate (the "Expected output" comments at the
# bottom of the file show each sentence only once), removed here.
TRAIN_DATA = [
    ('Qui est Georges Brassens?', {'entities': [(8, 24, 'PERSON')]}),
    ("J'aime Strasbourg et Avignon.", {'entities': [(7, 17, 'LOC'), (21, 28, 'LOC')]}),
]

# Use the stdlib pathlib instead of the third-party `path` package; this
# import shadows the earlier `from path import Path` for the rest of the file.
from pathlib import Path

# Directory the model is loaded from and saved back to, next to this script.
MODEL_DIR = Path(__file__).parent / "data"
|
|
|
+
|
|
|
+
|
|
|
def main(n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    n_iter -- number of passes over TRAIN_DATA (default 100).

    Loads the spaCy model stored in MODEL_DIR, trains only the 'ner'
    component on TRAIN_DATA, prints the entities it then predicts for the
    training sentences, and saves the updated model back to MODEL_DIR.
    """
    nlp = spacy.load(MODEL_DIR)  # load existing spaCy model from disk
    print("Model loaded")

    # Create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # The component already exists — get it so we can add labels.
        ner = nlp.get_pipe('ner')

    # Register every entity label present in the training data.
    # Default to [] so an annotation dict without 'entities' cannot crash
    # the loop with a TypeError on None.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # Get names of other pipes to disable them during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Shuffle a copy, not TRAIN_DATA itself: shuffling in place would mutate
    # the module-level constant as a side effect of calling main().
    train_data = list(TRAIN_DATA)

    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],          # batch of texts
                    [annotations],   # batch of annotations
                    drop=0.5,        # dropout - make it harder to memorise data
                    sgd=optimizer,   # callable to update weights
                    losses=losses)
            print(losses)

    # Test the trained model on the training sentences.
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Save the updated model back to the output directory.
    if not MODEL_DIR.exists():
        MODEL_DIR.mkdir()
    nlp.to_disk(MODEL_DIR)
    print("Saved model to", MODEL_DIR)
|
|
|
+
|
|
|
+
|
|
|
# Script entry point: train for the default 100 iterations.
if __name__ == '__main__':
    main()

    # Expected output (after training on TRAIN_DATA):
    # Entities [('Georges Brassens', 'PERSON')]
    # Tokens [('Qui', '', 2), ('est', '', 2), ('Georges', 'PERSON', 3), ('Brassens', 'PERSON', 1), ('?', '', 2)]
    # Entities [('Strasbourg', 'LOC'), ('Avignon', 'LOC')]
    # Tokens [("J'", '', 2), ('aime', '', 2), ('Strasbourg', 'LOC', 3), ('et', '', 2), ('Avignon', 'LOC', 3), ('.', '', 2)]
|