Skip to content

Commit 47266db

Browse files
committed
An example script showing usage of 🤗 tokenizers and transformers.
1 parent 98b6c4a commit 47266db

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

labs/10/example_transformers.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""Example of using the Hugging Face tokenizers and transformers libraries.

The script tokenizes a few Czech sentences, prints the mappings between
characters, subword tokens and words for the first sentence, and finally runs
a pretrained model on the whole padded batch, printing the shapes of its
outputs.
"""
import argparse

import numpy as np
import torch
import transformers

# Single source of truth for the checkpoint, so that the tokenizer and the
# model cannot accidentally be loaded from different checkpoints.
MODEL_NAME = "ufal/eleczech-lc-small"

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
# `output_hidden_states=True` is required for `result.hidden_states` below.
model = transformers.AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

dataset = [
    "Podmínkou koexistence jedince druhu Homo sapiens a společenství druhu Canis lupus je sjednocení akustické signální soustavy.",
    "U závodů na zpracování obilí, řízených mytologickými bytostmi je poměrně nízká produktivita práce vyvážena naprostou spolehlivostí.",
    "Vodomilní obratlovci nepatrných rozměrů nejsou ničím jiným, než vodomilnými obratlovci.",
]

print("---Textual tokenization---")
print(*[tokenizer.tokenize(sentence) for sentence in dataset], sep="\n")

print("---Char - subword - word mapping---")
# Encode a single sentence; the returned encoding exposes the alignment
# helpers (token/word/char mappings) used by the prints below.
encoded = tokenizer(dataset[0])
print("Token IDs:", encoded.input_ids)
print("Token 2 to chars: {}".format(encoded.token_to_chars(2)))
print("Word 1 to chars: {}".format(encoded.word_to_chars(1)))
print("Word 1 to tokens: {}".format(encoded.word_to_tokens(1)))
print("Char 12 to token: {}".format(encoded.char_to_token(12)))
print("Decoded text: {}".format(tokenizer.decode(encoded.input_ids)))

print("---Running the model---")
# Pad all sentences to the longest one in the batch and let the tokenizer
# return PyTorch tensors directly instead of converting Python lists by hand.
batch = tokenizer(dataset, padding="longest", return_tensors="pt")
# Only the outputs of the forward pass are inspected, so gradient tracking is
# disabled to avoid building an unused autograd graph.
with torch.no_grad():
    result = model(batch.input_ids, attention_mask=batch.attention_mask)
print("last_hidden_state: shape {}".format(result.last_hidden_state.shape))
print("hidden_state: shapes", *("{}".format(hidden_state.shape) for hidden_state in result.hidden_states))

0 commit comments

Comments
 (0)