Here is sample code showing how to load a Hugging Face tokenizer, add some custom tokens, and save it.

"""Load a Hugging Face XLM-RoBERTa tokenizer, add custom special tokens,
save it to disk, then reload it and verify the added tokens persisted."""
from transformers import XLMRobertaTokenizer
import os, shutil


# Directory containing a pretrained xlm-roberta-base checkpoint.
t_dir = "/some/path/xlm-roberta-base"

tokenizer = XLMRobertaTokenizer.from_pretrained(t_dir)

original_length = len(tokenizer)

print(f"before mod length: {original_length}")

# Tokens to register as *special* tokens: the tokenizer will never split them.
special_tokens_to_add = ["[custom1]", "[custom2]"]


# add_tokens returns the number of tokens actually added
# (0 if they were already present in the vocabulary).
num_added = tokenizer.add_tokens(special_tokens_to_add, special_tokens=True)

print(f"after modification length: {len(tokenizer)}")


# Recreate the output directory from scratch so stale files don't linger.
# ignore_errors=True makes this a no-op when the directory doesn't exist,
# avoiding the exists()/rmtree() race of checking first.
outputdir = "testoutput/pretrain_tokenizer"

shutil.rmtree(outputdir, ignore_errors=True)

os.makedirs(outputdir)

tokenizer.save_pretrained(outputdir)

# Reload and verify the added tokens survived the save/load round trip.

tokenizer = XLMRobertaTokenizer.from_pretrained(outputdir)

print(f"> reload tokenizer length: {len(tokenizer)}")

if len(tokenizer) != original_length + num_added:
    raise RuntimeError("reloaded tokenizer is missing the added tokens")

print("done")

0 Comments

Leave a Reply

Your email address will not be published.