{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "01_Word_Encoding.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "su3bfBDiRK9L"
      },
      "source": [
        "# Word Encodings"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "V1vpla79AfJj"
      },
      "source": [
        "## Import libraries and APIs"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dVGySTYgyVgW"
      },
      "source": [
        "## import the tensorflow APIs\n",
        "\n",
        "import tensorflow as tf\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ogxDv27ZAiOc"
      },
      "source": [
        "## Define training sentences"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oC-bEci9Q-EI"
      },
      "source": [
        "##sentences to tokenize\n",
        "train_sentences = [\n",
        "  'It is a sunny day',\n",
        "  'It is also running',\n",
        "  'It is kinda snowy'\n",
        "]"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FPwnd4K1AjXM"
      },
      "source": [
        "## Set up the tokenizer"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GbEn11WiT5Sp"
      },
      "source": [
        "##instantiate the tokenizer\n",
        "tokenizer = Tokenizer(num_words = 100)\n",
        "\n",
        "##train the tokenizer on training sentences\n",
        "tokenizer.fit_on_texts(train_sentences)\n",
        "\n",
        "##store word index for the words in the sentence\n",
        "word_index = tokenizer.word_index\n"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0zZR31LAUM4p",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "f0d8a87f-f9e7-4c81-8e00-d31abc87a974"
      },
      "source": [
        "print(word_index)"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{'it': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'also': 6, 'running': 7, 'kinda': 8, 'snowy': 9}\n"
          ]
        }
      ]
    }
  ]
}