|
513 | 513 | " else:\n",
|
514 | 514 | " print(f\"Skipping pair {pair} as one token is not in the vocabulary.\")\n",
|
515 | 515 | "\n",
|
516 |
| - " def encode(self, text):\n", |
| 516 | + " def encode(self, text, allowed_special=None):\n", |
517 | 517 | " \"\"\"\n",
|
518 |
| - " Encode the input text into a list of token IDs.\n", |
519 |
| - "\n", |
| 518 | + " Encode the input text into a list of token IDs, with tiktoken-style handling of special tokens.\n", |
| 519 | + " \n", |
520 | 520 | " Args:\n",
|
521 |
| - " text (str): The text to encode.\n", |
522 |
| - "\n", |
| 521 | + " text (str): The input text to encode.\n", |
| 522 | + " allowed_special (set or None): Special tokens to allow passthrough. If None, special handling is disabled.\n", |
| 523 | + " \n", |
523 | 524 | " Returns:\n",
|
524 |
| - " List[int]: The list of token IDs.\n", |
| 525 | + " List of token IDs.\n", |
525 | 526 | " \"\"\"\n",
|
| 527 | + " import re\n", |
| 528 | + " \n", |
| 529 | + " token_ids = []\n", |
| 530 | + " \n", |
| 531 | + " # If special token handling is enabled\n", |
| 532 | + " if allowed_special is not None and len(allowed_special) > 0:\n", |
| 533 | + " # Build regex to match allowed special tokens\n", |
| 534 | + " special_pattern = (\n", |
| 535 | + " \"(\" + \"|\".join(re.escape(tok) for tok in sorted(allowed_special, key=len, reverse=True)) + \")\"\n", |
| 536 | + " )\n", |
| 537 | + " \n", |
| 538 | + " last_index = 0\n", |
| 539 | + " for match in re.finditer(special_pattern, text):\n", |
| 540 | + " prefix = text[last_index:match.start()]\n", |
| 541 | + " token_ids.extend(self.encode(prefix, allowed_special=None)) # Encode prefix without special handling\n", |
| 542 | + " \n", |
| 543 | + " special_token = match.group(0)\n", |
| 544 | + " if special_token in self.inverse_vocab:\n", |
| 545 | + " token_ids.append(self.inverse_vocab[special_token])\n", |
| 546 | + " else:\n", |
| 547 | + " raise ValueError(f\"Special token {special_token} not found in vocabulary.\")\n", |
| 548 | + " last_index = match.end()\n", |
| 549 | + " \n", |
| 550 | + " text = text[last_index:] # Remaining part to process normally\n", |
| 551 | + " \n", |
| 552 | + " # Check if any disallowed special tokens are in the remainder\n", |
| 553 | + " disallowed = [\n", |
| 554 | + " tok for tok in self.inverse_vocab\n", |
| 555 | + " if tok.startswith(\"<|\") and tok.endswith(\"|>\") and tok in text and tok not in allowed_special\n", |
| 556 | + " ]\n", |
| 557 | + " if disallowed:\n", |
| 558 | + " raise ValueError(f\"Disallowed special tokens encountered in text: {disallowed}\")\n", |
| 559 | + " \n", |
| 560 | + " # If no special tokens, or remaining text after special token split:\n", |
526 | 561 | " tokens = []\n",
|
527 |
| - " # First split on newlines to preserve them\n", |
528 | 562 | " lines = text.split(\"\\n\")\n",
|
529 | 563 | " for i, line in enumerate(lines):\n",
|
530 | 564 | " if i > 0:\n",
|
531 |
| - " tokens.append(\"\\n\") # Add newline token separately\n", |
| 565 | + " tokens.append(\"\\n\")\n", |
532 | 566 | " words = line.split()\n",
|
533 | 567 | " for j, word in enumerate(words):\n",
|
534 |
| - " if j == 0:\n", |
535 |
| - " if i > 0: # Start of a new line but not the first line\n", |
536 |
| - " tokens.append(\"Ġ\" + word) # Ensure it's marked as a new segment\n", |
537 |
| - " else:\n", |
538 |
| - " tokens.append(word)\n", |
| 568 | + " if j == 0 and i > 0:\n", |
| 569 | + " tokens.append(\"Ġ\" + word)\n", |
| 570 | + " elif j == 0:\n", |
| 571 | + " tokens.append(word)\n", |
539 | 572 | " else:\n",
|
540 |
| - " # Prefix words in the middle of a line with \"Ġ\"\n", |
541 | 573 | " tokens.append(\"Ġ\" + word)\n",
|
542 |
| - "\n", |
543 |
| - " token_ids = []\n", |
| 574 | + " \n", |
544 | 575 | " for token in tokens:\n",
|
545 | 576 | " if token in self.inverse_vocab:\n",
|
546 |
| - " # token is contained in the vocabulary as is\n", |
547 | 577 | " token_ids.append(self.inverse_vocab[token])\n",
|
548 | 578 | " else:\n",
|
549 |
| - " # Attempt to handle subword tokenization via BPE\n", |
550 |
| - " sub_token_ids = self.tokenize_with_bpe(token)\n", |
551 |
| - " token_ids.extend(sub_token_ids)\n", |
552 |
| - "\n", |
| 579 | + " token_ids.extend(self.tokenize_with_bpe(token))\n", |
| 580 | + " \n", |
553 | 581 | " return token_ids\n",
|
554 | 582 | "\n",
|
555 | 583 | " def tokenize_with_bpe(self, token):\n",
|
|
781 | 809 | },
|
782 | 810 | {
|
783 | 811 | "cell_type": "code",
|
784 |
| - "execution_count": 25, |
| 812 | + "execution_count": 5, |
785 | 813 | "id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
|
786 | 814 | "metadata": {},
|
787 | 815 | "outputs": [
|
|
940 | 968 | {
|
941 | 969 | "cell_type": "code",
|
942 | 970 | "execution_count": 10,
|
| 971 | + "id": "78249752-38d7-47b9-b259-912bcc093dc4", |
| 972 | + "metadata": {}, |
| 973 | + "outputs": [ |
| 974 | + { |
| 975 | + "name": "stdout", |
| 976 | + "output_type": "stream", |
| 977 | + "text": [ |
| 978 | + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 256, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n" |
| 979 | + ] |
| 980 | + } |
| 981 | + ], |
| 982 | + "source": [ |
| 983 | + "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n", |
| 984 | + "token_ids = tokenizer.encode(input_text)\n", |
| 985 | + "print(token_ids)" |
| 986 | + ] |
| 987 | + }, |
| 988 | + { |
| 989 | + "cell_type": "code", |
| 990 | + "execution_count": 11, |
| 991 | + "id": "0331d37d-49a3-44f7-9aa9-9834e0938741", |
| 992 | + "metadata": {}, |
| 993 | + "outputs": [ |
| 994 | + { |
| 995 | + "name": "stdout", |
| 996 | + "output_type": "stream", |
| 997 | + "text": [ |
| 998 | + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 257]\n" |
| 999 | + ] |
| 1000 | + } |
| 1001 | + ], |
| 1002 | + "source": [ |
| 1003 | + "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n", |
| 1004 | + "token_ids = tokenizer.encode(input_text, allowed_special={\"<|endoftext|>\"})\n", |
| 1005 | + "print(token_ids)" |
| 1006 | + ] |
| 1007 | + }, |
| 1008 | + { |
| 1009 | + "cell_type": "code", |
| 1010 | + "execution_count": 12, |
943 | 1011 | "id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8",
|
944 | 1012 | "metadata": {},
|
945 | 1013 | "outputs": [
|
946 | 1014 | {
|
947 | 1015 | "name": "stdout",
|
948 | 1016 | "output_type": "stream",
|
949 | 1017 | "text": [
|
950 |
| - "Number of characters: 42\n", |
951 |
| - "Number of token IDs: 20\n" |
| 1018 | + "Number of characters: 57\n", |
| 1019 | + "Number of token IDs: 21\n" |
952 | 1020 | ]
|
953 | 1021 | }
|
954 | 1022 | ],
|
|
975 | 1043 | },
|
976 | 1044 | {
|
977 | 1045 | "cell_type": "code",
|
978 |
| - "execution_count": 11, |
| 1046 | + "execution_count": 13, |
979 | 1047 | "id": "da0e1faf-1933-43d9-b681-916c282a8f86",
|
980 | 1048 | "metadata": {},
|
981 | 1049 | "outputs": [
|
982 | 1050 | {
|
983 | 1051 | "name": "stdout",
|
984 | 1052 | "output_type": "stream",
|
985 | 1053 | "text": [
|
986 |
| - "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]\n" |
| 1054 | + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 257]\n" |
987 | 1055 | ]
|
988 | 1056 | }
|
989 | 1057 | ],
|
|
993 | 1061 | },
|
994 | 1062 | {
|
995 | 1063 | "cell_type": "code",
|
996 |
| - "execution_count": 12, |
| 1064 | + "execution_count": 14, |
997 | 1065 | "id": "8b690e83-5d6b-409a-804e-321c287c24a4",
|
998 | 1066 | "metadata": {},
|
999 | 1067 | "outputs": [
|
1000 | 1068 | {
|
1001 | 1069 | "name": "stdout",
|
1002 | 1070 | "output_type": "stream",
|
1003 | 1071 | "text": [
|
1004 |
| - "Jack embraced beauty through art and life.\n" |
| 1072 | + "Jack embraced beauty through art and life.<|endoftext|>\n" |
1005 | 1073 | ]
|
1006 | 1074 | }
|
1007 | 1075 | ],
|
|
1019 | 1087 | },
|
1020 | 1088 | {
|
1021 | 1089 | "cell_type": "code",
|
1022 |
| - "execution_count": 13, |
| 1090 | + "execution_count": 15, |
1023 | 1091 | "id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f",
|
1024 | 1092 | "metadata": {},
|
1025 | 1093 | "outputs": [
|
|
1046 | 1114 | "256 -> \n",
|
1047 | 1115 | "326 -> li\n",
|
1048 | 1116 | "972 -> fe\n",
|
1049 |
| - "46 -> .\n" |
| 1117 | + "46 -> .\n", |
| 1118 | + "257 -> <|endoftext|>\n" |
1050 | 1119 | ]
|
1051 | 1120 | }
|
1052 | 1121 | ],
|
|
1073 | 1142 | },
|
1074 | 1143 | {
|
1075 | 1144 | "cell_type": "code",
|
1076 |
| - "execution_count": 14, |
| 1145 | + "execution_count": 16, |
1077 | 1146 | "id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240",
|
1078 | 1147 | "metadata": {},
|
1079 | 1148 | "outputs": [
|
|
1083 | 1152 | "'This is some text.'"
|
1084 | 1153 | ]
|
1085 | 1154 | },
|
1086 |
| - "execution_count": 14, |
| 1155 | + "execution_count": 16, |
1087 | 1156 | "metadata": {},
|
1088 | 1157 | "output_type": "execute_result"
|
1089 | 1158 | }
|
|
1096 | 1165 | },
|
1097 | 1166 | {
|
1098 | 1167 | "cell_type": "code",
|
1099 |
| - "execution_count": 15, |
| 1168 | + "execution_count": 17, |
1100 | 1169 | "id": "37bc6753-8f35-4ec7-b23e-df4a12103cb4",
|
1101 | 1170 | "metadata": {},
|
1102 | 1171 | "outputs": [
|
|
1106 | 1175 | "'This is some text with \\n newline characters.'"
|
1107 | 1176 | ]
|
1108 | 1177 | },
|
1109 |
| - "execution_count": 15, |
| 1178 | + "execution_count": 17, |
1110 | 1179 | "metadata": {},
|
1111 | 1180 | "output_type": "execute_result"
|
1112 | 1181 | }
|
|
1135 | 1204 | },
|
1136 | 1205 | {
|
1137 | 1206 | "cell_type": "code",
|
1138 |
| - "execution_count": 16, |
| 1207 | + "execution_count": 18, |
1139 | 1208 | "id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc",
|
1140 | 1209 | "metadata": {},
|
1141 | 1210 | "outputs": [],
|
|
1146 | 1215 | },
|
1147 | 1216 | {
|
1148 | 1217 | "cell_type": "code",
|
1149 |
| - "execution_count": 17, |
| 1218 | + "execution_count": 19, |
1150 | 1219 | "id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1",
|
1151 | 1220 | "metadata": {},
|
1152 | 1221 | "outputs": [],
|
|
1166 | 1235 | },
|
1167 | 1236 | {
|
1168 | 1237 | "cell_type": "code",
|
1169 |
| - "execution_count": 18, |
| 1238 | + "execution_count": 20, |
1170 | 1239 | "id": "00d9bf8f-756f-48bf-81b8-b890e2c2ef13",
|
1171 | 1240 | "metadata": {},
|
1172 | 1241 | "outputs": [
|
1173 | 1242 | {
|
1174 | 1243 | "name": "stdout",
|
1175 | 1244 | "output_type": "stream",
|
1176 | 1245 | "text": [
|
1177 |
| - "Jack embraced beauty through art and life.\n" |
| 1246 | + "Jack embraced beauty through art and life.<|endoftext|>\n" |
1178 | 1247 | ]
|
1179 | 1248 | }
|
1180 | 1249 | ],
|
|
1184 | 1253 | },
|
1185 | 1254 | {
|
1186 | 1255 | "cell_type": "code",
|
1187 |
| - "execution_count": 19, |
| 1256 | + "execution_count": 21, |
1188 | 1257 | "id": "e7addb64-2892-4e1c-85dd-4f5152740099",
|
1189 | 1258 | "metadata": {},
|
1190 | 1259 | "outputs": [
|
|
1194 | 1263 | "'This is some text with \\n newline characters.'"
|
1195 | 1264 | ]
|
1196 | 1265 | },
|
1197 |
| - "execution_count": 19, |
| 1266 | + "execution_count": 21, |
1198 | 1267 | "metadata": {},
|
1199 | 1268 | "output_type": "execute_result"
|
1200 | 1269 | }
|
|
1224 | 1293 | },
|
1225 | 1294 | {
|
1226 | 1295 | "cell_type": "code",
|
1227 |
| - "execution_count": 20, |
| 1296 | + "execution_count": 22, |
1228 | 1297 | "id": "b45b4366-2c2b-4309-9a14-febf3add8512",
|
1229 | 1298 | "metadata": {},
|
1230 | 1299 | "outputs": [
|
|
1264 | 1333 | },
|
1265 | 1334 | {
|
1266 | 1335 | "cell_type": "code",
|
1267 |
| - "execution_count": 21, |
| 1336 | + "execution_count": 23, |
1268 | 1337 | "id": "74306e6c-47d3-45a3-9e0f-93f7303ef601",
|
1269 | 1338 | "metadata": {},
|
1270 | 1339 | "outputs": [],
|
|
1285 | 1354 | },
|
1286 | 1355 | {
|
1287 | 1356 | "cell_type": "code",
|
1288 |
| - "execution_count": 22, |
| 1357 | + "execution_count": 24, |
1289 | 1358 | "id": "2bb722b4-dbf5-4a0c-9120-efda3293f132",
|
1290 | 1359 | "metadata": {},
|
1291 | 1360 | "outputs": [
|
|
1295 | 1364 | "50257"
|
1296 | 1365 | ]
|
1297 | 1366 | },
|
1298 |
| - "execution_count": 22, |
| 1367 | + "execution_count": 24, |
1299 | 1368 | "metadata": {},
|
1300 | 1369 | "output_type": "execute_result"
|
1301 | 1370 | }
|
|
1314 | 1383 | },
|
1315 | 1384 | {
|
1316 | 1385 | "cell_type": "code",
|
1317 |
| - "execution_count": 23, |
| 1386 | + "execution_count": 25, |
1318 | 1387 | "id": "e4866de7-fb32-4dd6-a878-469ec734641c",
|
1319 | 1388 | "metadata": {},
|
1320 | 1389 | "outputs": [
|
|
1334 | 1403 | },
|
1335 | 1404 | {
|
1336 | 1405 | "cell_type": "code",
|
1337 |
| - "execution_count": 24, |
| 1406 | + "execution_count": 26, |
1338 | 1407 | "id": "3da8d9b2-af55-4b09-95d7-fabd983e919e",
|
1339 | 1408 | "metadata": {},
|
1340 | 1409 | "outputs": [
|
|
0 commit comments