Here’s a code sample showing how to load a Hugging Face tokenizer, add some custom tokens, and save it.
from transformers import XLMRobertaTokenizer
import os
import shutil

# Load the pretrained tokenizer from a local directory
t_dir = "/some/path/xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(t_dir)
original_length = len(tokenizer)
print(f"before mod length: {original_length}")

# Add custom tokens to the vocabulary; special_tokens=True marks them
# as special tokens, so the tokenizer never splits them apart
special_tokens_to_add = ["[custom1]", "[custom2]"]
tokenizer.add_tokens(special_tokens_to_add, special_tokens=True)
print(f"after modification length: {len(tokenizer)}")

# Create a clean output dir
outputdir = "testoutput/pretrain_tokenizer"
if os.path.exists(outputdir):
    shutil.rmtree(outputdir)
os.makedirs(outputdir)
tokenizer.save_pretrained(outputdir)

# Reload from disk and check that the added tokens persisted
tokenizer = XLMRobertaTokenizer.from_pretrained(outputdir)
print(f"> reload tokenizer length: {len(tokenizer)}")
print("done")
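If you plan to fine-tune a model with the extended tokenizer, the model's embedding matrix must also be resized to cover the new vocabulary entries. Here is a minimal sketch of that step, assuming the model weights live at the same local path used for the tokenizer above; the model class and checkpoint are illustrative assumptions, not part of the original snippet.

from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizer

# Reload the tokenizer saved above
tokenizer = XLMRobertaTokenizer.from_pretrained("testoutput/pretrain_tokenizer")

# Sanity check: each custom token should now map to a single vocabulary ID
for tok in ["[custom1]", "[custom2]"]:
    print(f"{tok} -> id {tokenizer.convert_tokens_to_ids(tok)}")

# Assumed checkpoint path, reusing t_dir from the snippet above.
# resize_token_embeddings grows the embedding matrix to match the new
# vocabulary size; the added rows start randomly initialized and are
# learned during fine-tuning.
model = XLMRobertaForMaskedLM.from_pretrained("/some/path/xlm-roberta-base")
model.resize_token_embeddings(len(tokenizer))

Without this resize, any input containing the new token IDs would index past the end of the original embedding matrix and fail at the first forward pass.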