from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-cased" checkpoint. On first
# use the tokenizer files are downloaded and stored in the local Hugging Face cache.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

d:\Software\Miniconda3\envs\llm\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<?, ?B/s]
d:\Software\Miniconda3\envs\llm\lib\site-packages\huggingface_hub\file_download.py:147: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\dd\.cache\huggingface\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  warnings.warn(message)
config.json: 100%|██████████| 570/570 [00:00<00:00, 571kB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 479kB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.40MB/s]

# Calling the tokenizer on a single string returns a mapping with three entries:
# "input_ids", "token_type_ids" and "attention_mask" (see the printed result).
encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)

{'input_ids': [101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Turn the ids back into text. The decoded string is wrapped in the [CLS] and
# [SEP] markers, i.e. the tokenizer added them while encoding.
tokenizer.decode(encoded_input["input_ids"])

'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

# Encode several sentences in one call by passing a list of strings.
# No padding is requested, so each sentence keeps its own length and the
# resulting id lists are ragged (8, 15 and 7 ids in the printed result).
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], [101, 1327, 1164, 5450, 23434, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

# Same batch as before, now encoded into rectangular PyTorch tensors.
#   padding=True        -> shorter sentences are filled with id 0 up to the
#                          longest sentence in the batch; the padded positions
#                          get 0 in "attention_mask".
#   truncation=True     -> enables truncation of over-long inputs (none of these
#                          sentences is long enough to be cut).
#   return_tensors="pt" -> values are returned as torch tensors instead of lists.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,  6462,
           117, 21902,  1643,   119,   102],
        [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

def get_vocab(self):
    """Return a dict mapping every token to its integer id.

    Ids ``0 .. self.vocab_size - 1`` are converted through
    ``self.convert_ids_to_tokens`` to obtain the keys. The entries of
    ``self.added_tokens_encoder`` are merged in afterwards, so they take
    precedence when a key appears in both.
    """
    token_to_id = {}
    for token_id in range(self.vocab_size):
        token = self.convert_ids_to_tokens(token_id)
        token_to_id[token] = token_id

    # Added tokens are applied last so they override base-vocabulary entries.
    token_to_id.update(self.added_tokens_encoder)
    return token_to_id

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Start from an untrained BPE model. The unknown-token string has to be spelled
# exactly like the "[UNK]" entry registered with the trainer below; otherwise the
# model refers to a token that never becomes part of the trained vocabulary.
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

from tokenizers.pre_tokenizers import Whitespace
# Pre-split the raw text with the Whitespace pre-tokenizer before the BPE model sees it.
tokenizer.pre_tokenizer = Whitespace()

from tokenizers.trainers import BpeTrainer
# Register the special tokens with the trainer so they are included in the vocabulary.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train on the three wikitext-2 split files (test, train, valid).
files = [f"dataset/tokenizers/wikitext-2/wiki.{split}.tokens" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)

# Persist the trained tokenizer to a single JSON file ...
tokenizer.save("dataset/tokenizers/tokenizer-wiki.json")

# ... and load it back from that file.
tokenizer = Tokenizer.from_file("dataset/tokenizers/tokenizer-wiki.json")

from transformers import AutoTokenizer

# Fetch the normalizer used by the "bert-base-uncased" tokenizer from its
# backend tokenizer object and apply it to a raw string on its own.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
normalizer = tokenizer.backend_tokenizer.normalizer

# Print the text before and after normalization; for this input the visible
# effect is lowercasing.
print("I love China")
print(normalizer.normalize_str("I love China"))

I love China
i love china

import tokenizers

# Build a standalone Lowercase normalizer directly from the tokenizers library,
# without loading any pretrained tokenizer.
normalizer = tokenizers.normalizers.Lowercase()

# Print the text before and after normalization.
print("I love China")
print(normalizer.normalize_str("I love China"))

I love China
i love china

from transformers import AutoTokenizer

# Fetch the pre-tokenizer used by the "gpt2-xl" tokenizer from its backend
# tokenizer object.
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer

# pre_tokenize_str returns a list of (piece, (start, end)) tuples. In the
# printed result a leading "Ġ" stands in for the space preceding the word.
print("I love China")
print(pre_tokenizer.pre_tokenize_str("I love China"))

config.json: 100%|██████████| 689/689 [00:00<00:00, 690kB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 969kB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.92MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.73MB/s]

I love China
[('I', (0, 1)), ('Ġlove', (1, 6)), ('ĠChina', (6, 12))]

import tokenizers

# Build a standalone ByteLevel pre-tokenizer with its default settings.
# NOTE(review): the first piece comes out as "ĠI" here, whereas the gpt2-xl
# pre-tokenizer gives "I"; presumably the default add_prefix_space=True inserts
# a leading space, so confirm against the ByteLevel documentation.
pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

# Print the text and its (piece, (start, end)) pre-tokenization.
print("I love China")
print(pre_tokenizer.pre_tokenize_str("I love China"))

I love China
[('ĠI', (0, 1)), ('Ġlove', (1, 6)), ('ĠChina', (6, 12))]

1. 基础使用

2. 从头训练一个 tokenizer

3. Tokenization Pipeline

3.1 normalization

3.2 pre-tokenization

3.3 model

3.4 post-processing

3.5 总结