Chadrick Blog

add tokens to huggingface tokenizer

Here’s sample code showing how to load a Hugging Face tokenizer, add some custom tokens, and save it.

"""Load a pretrained XLM-RoBERTa tokenizer, add custom special tokens,
save the modified tokenizer to disk, and reload it to verify the change."""
from transformers import XLMRobertaTokenizer
import os
import shutil

# Directory containing the pretrained xlm-roberta-base tokenizer files.
t_dir = "/some/path/xlm-roberta-base"

tokenizer = XLMRobertaTokenizer.from_pretrained(t_dir)

original_length = len(tokenizer)
print(f"before mod length: {original_length}")

# Register the new tokens as special tokens so the underlying
# SentencePiece model never splits them into sub-pieces.
special_tokens_to_add = ["[custom1]", "[custom2]"]
tokenizer.add_tokens(special_tokens_to_add, special_tokens=True)

# len(tokenizer) now reflects the enlarged vocabulary.
# NOTE: any model paired with this tokenizer must call
# model.resize_token_embeddings(len(tokenizer)) before use.
print(f"after modification length: {len(tokenizer)}")

# Recreate the output directory from scratch so stale files from a
# previous save cannot linger alongside the new ones.
outputdir = "testoutput/pretrain_tokenizer"
if os.path.exists(outputdir):
    shutil.rmtree(outputdir)
os.makedirs(outputdir)

tokenizer.save_pretrained(outputdir)

# Reload from disk and confirm the added tokens survived the round trip
# (the reloaded length should match the post-modification length above).
tokenizer = XLMRobertaTokenizer.from_pretrained(outputdir)
print(f"> reload tokenizer length: {len(tokenizer)}")

print("done")