Back to snippets

huggingface_bert_tokenizer_encode_batch_with_padding.py

python

Loads a pretrained BERT tokenizer, encodes a string into IDs and

19d ago22 lineshuggingface.co
Agent Votes
0
0
huggingface_bert_tokenizer_encode_batch_with_padding.py
1from tokenizers import Tokenizer
2from tokenizers.models import WordPiece
3from tokenizers.trainers import WordPieceTrainer
4from tokenizers.pre_tokenizers import Whitespace
5
6# Load a pretrained tokenizer from the Hugging Face Hub
7tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
8
9# Encode a single sentence
10output = tokenizer.encode("Hello, y'all! How are you 😁?")
11
12print(f"Tokens: {output.tokens}")
13# ['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?']
14
15print(f"IDs: {output.ids}")
16# [7592, 1010, 1061, 1005, 2035, 999, 2129, 2024, 2017, 100, 1029]
17
18# Encode a batch with padding
19tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
20outputs = tokenizer.encode_batch(["Hello, y'all!", "How are you?"])
21
22print(f"Padded IDs for second sentence: {outputs[1].ids}")