
Commit 72efebd

add special token handling to bpe from scratch code (#616)
1 parent 92b308e commit 72efebd

1 file changed: +115 -46 lines changed

ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb

@@ -513,43 +513,71 @@
 " else:\n",
 " print(f\"Skipping pair {pair} as one token is not in the vocabulary.\")\n",
 "\n",
-" def encode(self, text):\n",
+" def encode(self, text, allowed_special=None):\n",
 " \"\"\"\n",
-" Encode the input text into a list of token IDs.\n",
-"\n",
+" Encode the input text into a list of token IDs, with tiktoken-style handling of special tokens.\n",
+" \n",
 " Args:\n",
-" text (str): The text to encode.\n",
-"\n",
+" text (str): The input text to encode.\n",
+" allowed_special (set or None): Special tokens to allow passthrough. If None, special handling is disabled.\n",
+" \n",
 " Returns:\n",
-" List[int]: The list of token IDs.\n",
+" List of token IDs.\n",
 " \"\"\"\n",
+" import re\n",
+" \n",
+" token_ids = []\n",
+" \n",
+" # If special token handling is enabled\n",
+" if allowed_special is not None and len(allowed_special) > 0:\n",
+" # Build regex to match allowed special tokens\n",
+" special_pattern = (\n",
+" \"(\" + \"|\".join(re.escape(tok) for tok in sorted(allowed_special, key=len, reverse=True)) + \")\"\n",
+" )\n",
+" \n",
+" last_index = 0\n",
+" for match in re.finditer(special_pattern, text):\n",
+" prefix = text[last_index:match.start()]\n",
+" token_ids.extend(self.encode(prefix, allowed_special=None)) # Encode prefix without special handling\n",
+" \n",
+" special_token = match.group(0)\n",
+" if special_token in self.inverse_vocab:\n",
+" token_ids.append(self.inverse_vocab[special_token])\n",
+" else:\n",
+" raise ValueError(f\"Special token {special_token} not found in vocabulary.\")\n",
+" last_index = match.end()\n",
+" \n",
+" text = text[last_index:] # Remaining part to process normally\n",
+" \n",
+" # Check if any disallowed special tokens are in the remainder\n",
+" disallowed = [\n",
+" tok for tok in self.inverse_vocab\n",
+" if tok.startswith(\"<|\") and tok.endswith(\"|>\") and tok in text and tok not in allowed_special\n",
+" ]\n",
+" if disallowed:\n",
+" raise ValueError(f\"Disallowed special tokens encountered in text: {disallowed}\")\n",
+" \n",
+" # If no special tokens, or remaining text after special token split:\n",
 " tokens = []\n",
-" # First split on newlines to preserve them\n",
 " lines = text.split(\"\\n\")\n",
 " for i, line in enumerate(lines):\n",
 " if i > 0:\n",
-" tokens.append(\"\\n\") # Add newline token separately\n",
+" tokens.append(\"\\n\")\n",
 " words = line.split()\n",
 " for j, word in enumerate(words):\n",
-" if j == 0:\n",
-" if i > 0: # Start of a new line but not the first line\n",
-" tokens.append(\"Ġ\" + word) # Ensure it's marked as a new segment\n",
-" else:\n",
-" tokens.append(word)\n",
+" if j == 0 and i > 0:\n",
+" tokens.append(\"Ġ\" + word)\n",
+" elif j == 0:\n",
+" tokens.append(word)\n",
 " else:\n",
-" # Prefix words in the middle of a line with \"Ġ\"\n",
 " tokens.append(\"Ġ\" + word)\n",
-"\n",
-" token_ids = []\n",
+" \n",
 " for token in tokens:\n",
 " if token in self.inverse_vocab:\n",
-" # token is contained in the vocabulary as is\n",
 " token_ids.append(self.inverse_vocab[token])\n",
 " else:\n",
-" # Attempt to handle subword tokenization via BPE\n",
-" sub_token_ids = self.tokenize_with_bpe(token)\n",
-" token_ids.extend(sub_token_ids)\n",
-"\n",
+" token_ids.extend(self.tokenize_with_bpe(token))\n",
+" \n",
 " return token_ids\n",
 "\n",
 " def tokenize_with_bpe(self, token):\n",
@@ -781,7 +809,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 25,
+"execution_count": 5,
 "id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
 "metadata": {},
 "outputs": [
@@ -940,15 +968,55 @@
 {
 "cell_type": "code",
 "execution_count": 10,
+"id": "78249752-38d7-47b9-b259-912bcc093dc4",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 256, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n"
+]
+}
+],
+"source": [
+"input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n",
+"token_ids = tokenizer.encode(input_text)\n",
+"print(token_ids)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 11,
+"id": "0331d37d-49a3-44f7-9aa9-9834e0938741",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 257]\n"
+]
+}
+],
+"source": [
+"input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n",
+"token_ids = tokenizer.encode(input_text, allowed_special={\"<|endoftext|>\"})\n",
+"print(token_ids)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 12,
 "id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Number of characters: 42\n",
-"Number of token IDs: 20\n"
+"Number of characters: 57\n",
+"Number of token IDs: 21\n"
 ]
 }
 ],
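
The two notebook cells added in the hunk above illustrate the new behavior: without `allowed_special`, the trailing `<|endoftext|>` string is split into ordinary subword IDs (the `... 60, 124, ... 124, 62` run in the first output), whereas with `allowed_special={"<|endoftext|>"}` it is emitted as the single ID 257. For comparison, tiktoken's GPT-2 encoding exposes the same switch; the snippet below is an illustrative aside rather than part of the commit (tiktoken raises on special tokens unless they are explicitly allowed, and maps `<|endoftext|>` to 50256 in the GPT-2 vocabulary).

import tiktoken

enc = tiktoken.get_encoding("gpt2")
text = "Jack embraced beauty through art and life. <|endoftext|> "

# Allowing the special token passes it through as its reserved ID (50256 for GPT-2)
ids = enc.encode(text, allowed_special={"<|endoftext|>"})
print(ids)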
@@ -975,15 +1043,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": 13,
 "id": "da0e1faf-1933-43d9-b681-916c282a8f86",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]\n"
+"[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 257]\n"
 ]
 }
 ],
@@ -993,15 +1061,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 14,
 "id": "8b690e83-5d6b-409a-804e-321c287c24a4",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Jack embraced beauty through art and life.\n"
+"Jack embraced beauty through art and life.<|endoftext|>\n"
 ]
 }
 ],
@@ -1019,7 +1087,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 13,
+"execution_count": 15,
 "id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f",
 "metadata": {},
 "outputs": [
@@ -1046,7 +1114,8 @@
 "256 -> \n",
 "326 -> li\n",
 "972 -> fe\n",
-"46 -> .\n"
+"46 -> .\n",
+"257 -> <|endoftext|>\n"
 ]
 }
 ],
@@ -1073,7 +1142,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 16,
 "id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240",
 "metadata": {},
 "outputs": [
@@ -1083,7 +1152,7 @@
 "'This is some text.'"
 ]
 },
-"execution_count": 14,
+"execution_count": 16,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1096,7 +1165,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 17,
 "id": "37bc6753-8f35-4ec7-b23e-df4a12103cb4",
 "metadata": {},
 "outputs": [
@@ -1106,7 +1175,7 @@
 "'This is some text with \\n newline characters.'"
 ]
 },
-"execution_count": 15,
+"execution_count": 17,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1135,7 +1204,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 18,
 "id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc",
 "metadata": {},
 "outputs": [],
@@ -1146,7 +1215,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 17,
+"execution_count": 19,
 "id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1",
 "metadata": {},
 "outputs": [],
@@ -1166,15 +1235,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 18,
+"execution_count": 20,
 "id": "00d9bf8f-756f-48bf-81b8-b890e2c2ef13",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Jack embraced beauty through art and life.\n"
+"Jack embraced beauty through art and life.<|endoftext|>\n"
 ]
 }
 ],
@@ -1184,7 +1253,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 19,
+"execution_count": 21,
 "id": "e7addb64-2892-4e1c-85dd-4f5152740099",
 "metadata": {},
 "outputs": [
@@ -1194,7 +1263,7 @@
 "'This is some text with \\n newline characters.'"
 ]
 },
-"execution_count": 19,
+"execution_count": 21,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1224,7 +1293,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 20,
+"execution_count": 22,
 "id": "b45b4366-2c2b-4309-9a14-febf3add8512",
 "metadata": {},
 "outputs": [
@@ -1264,7 +1333,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 21,
+"execution_count": 23,
 "id": "74306e6c-47d3-45a3-9e0f-93f7303ef601",
 "metadata": {},
 "outputs": [],
@@ -1285,7 +1354,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 22,
+"execution_count": 24,
 "id": "2bb722b4-dbf5-4a0c-9120-efda3293f132",
 "metadata": {},
 "outputs": [
@@ -1295,7 +1364,7 @@
 "50257"
 ]
 },
-"execution_count": 22,
+"execution_count": 24,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -1314,7 +1383,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 23,
+"execution_count": 25,
 "id": "e4866de7-fb32-4dd6-a878-469ec734641c",
 "metadata": {},
 "outputs": [
@@ -1334,7 +1403,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 24,
+"execution_count": 26,
 "id": "3da8d9b2-af55-4b09-95d7-fabd983e919e",
 "metadata": {},
 "outputs": [
