PoS tagger model for a specific domain

  nlp, python-3.x, spacy-3

I am trying to build a tagger model in spaCy v3.1 with .pos_ attributes for a specific domain. The code below manages to compile, however, it is not returning the .pos_ attributes. How I could extract the them?

import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example

TAG_MAP = {
'N': {'pos': 'NOUN'},
'V': {'pos': 'VERB'},
'J': {'pos': 'ADJ'}
}

 TRAIN_DATA = [
('Eu gosto ovos cozidos', {'tags': ['N', 'V', 'N', 'J']}),
('Comer presunto azul', {'tags': ['V', 'N', 'J']})
]
@plac.annotations(
lang=("ISO Code of language to use", "option", "1", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),)

def main(lang="pt", output_dir="./output_2", n_iter=25):
    """Main function to create a new model, set up the pipeline and train 
    the tagger. In order to train the tagger with a custom tag map, 
    we're creating a new Language instance with a custom vocab.
    """
    nlp = spacy.blank(lang)
    tagger = nlp.add_pipe("tagger")

    for tag, values in TAG_MAP.items():
        tagger.add_label(tag) # tagger.add_label(tag, values) -> gives erro

    optimizer = nlp.begin_training()
    #optimizer = nlp.initialize()   
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:    
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)
    
    test_text = "Eu gosto ovos passados"        

    # Save model to output directory        
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
    
        # test the save model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

if __name__ == "__main__":
    plac.call(main)

The last print returns:

Tags [('Eu', 'N', ''), ('gosto', 'V', ''), ('ovos', 'N', ''), ('passados', 'J', '')]

Source: Python-3x Questions

LEAVE A COMMENT