@@ -54,15 +54,15 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> No
         self._word_masking = ...

         # TODO: Create a `torch.nn.Embedding` layer for embedding the character ids
-        # from `train.forms.char_vocab` to dimensionality `args.cle_dim`.
+        # from `train.words.char_vocab` to dimensionality `args.cle_dim`.
         self._char_embedding = ...

         # TODO: Create a bidirectional `torch.nn.GRU` layer processing the character
         # embeddings, producing output of dimensionality `args.cle_dim`.
         self._char_rnn = ...

-        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the form ids
-        # from `train.forms.word_vocab` to dimensionality `args.we_dim`.
+        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the word ids
+        # from `train.words.word_vocab` to dimensionality `args.we_dim`.
         self._word_embedding = ...

         # TODO: Create an RNN layer, either `torch.nn.LSTM` or `torch.nn.GRU` depending
@@ -77,46 +77,46 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> No
         # producing logits for tag prediction; `train.tags.word_vocab` is the tag vocabulary.
         self._output_layer = ...

-    def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence, unique_forms: torch.nn.utils.rnn.PackedSequence,
-                form_indices: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
+    def forward(self, word_ids: torch.nn.utils.rnn.PackedSequence, unique_words: torch.nn.utils.rnn.PackedSequence,
+                word_indices: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
         # The input arguments are `PackedSequence`s. A `PackedSequence` allows us to:
-        # - get the flattened data using `form_ids.data`; these are the data without
+        # - get the flattened data using `word_ids.data`; these are the data without
         #   padding elements, i.e., a 1D vector of shape `[sum_of_sentence_lengths]`;
         # - replace the data while keeping the sizes of the original sequences
-        #   by calling `form_ids._replace(data=...)` and getting a new `PackedSequence`.
+        #   by calling `word_ids._replace(data=...)` and getting a new `PackedSequence`.
         # Therefore, depending on the context, we need to use either the flattened
         # data or the `PackedSequence` object.

-        # TODO: Mask the input `form_ids` using the `self._word_masking` layer.
+        # TODO: Mask the input `word_ids` using the `self._word_masking` layer.
         hidden = ...

-        # TODO: Embed the masked form IDs in `hidden` using the word embedding layer.
+        # TODO: Embed the masked word IDs in `hidden` using the word embedding layer.
         hidden = ...

-        # TODO: Embed the `unique_forms` using the character embedding layer.
+        # TODO: Embed the `unique_words` using the character embedding layer.
         cle = ...

         # TODO: Pass the character embeddings through the character-level RNN.
         # The input to the RNN should be a `PackedSequence` with the same structure
-        # as `unique_forms`. Note that this time we are interested only in the
+        # as `unique_words`. Note that this time we are interested only in the
         # second output of the GRU call (the last hidden state of the RNN).

         # TODO: Concatenate the states of the forward and backward directions (in this order).
         cle = ...

-        # TODO: With `cle` being the character-level embeddings of the unique forms
-        # of shape `[num_unique_forms, 2 * cle_dim]`, create the representation of the
-        # (not necessary unique) sentence forms by indexing the character-level
-        # embeddings with the `form_indices`. The result should have an analogous structure
+        # TODO: With `cle` being the character-level embeddings of the unique words
+        # of shape `[num_unique_words, 2 * cle_dim]`, create the representation of the
+        # (not necessarily unique) sentence words by indexing the character-level
+        # embeddings with the `word_indices`. The result should have an analogous structure
         # to word embeddings in `hidden`, just with a different dimensionality of the
         # embedding. You can use for example the `torch.nn.functional.embedding` function.
         cle = ...

         # TODO: Concatenate the word embeddings with the character-level embeddings (in this order).
         hidden = ...

-        # TODO(tagger_we.packed): Process the embedded forms through the RNN layer, utilizing
-        # the `PackedSequence` structure of `form_ids` (i.e., the same sentence lengths).
+        # TODO(tagger_we.packed): Process the embedded words through the RNN layer, utilizing
+        # the `PackedSequence` structure of `word_ids` (i.e., the same sentence lengths).
         hidden = ...

         # TODO(tagger_we.packed): Sum the outputs of forward and backward directions.
@@ -126,7 +126,7 @@ def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence, unique_forms: tor
         hidden = ...

         # TODO(tagger_we.packed): Finally, produce output predictions as a `PackedSequence`
-        # with the same `PackedSequence` structure as `form_ids` (same sentence lengths).
+        # with the same `PackedSequence` structure as `word_ids` (same sentence lengths).
         hidden = ...

         return hidden
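
As a reference point for the `forward` TODOs above, here is a minimal, self-contained sketch of one possible completion. The `WordMasking` module, the chosen dimensionalities, and the toy ids in the smoke test are illustrative assumptions, not the assignment's reference solution.

```python
import torch


class WordMasking(torch.nn.Module):
    """During training, replace each word id by `unk_id` with probability `prob`."""
    def __init__(self, prob: float, unk_id: int = 1) -> None:
        super().__init__()
        self._prob, self._unk_id = prob, unk_id

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        if not self.training or self._prob == 0:
            return ids
        mask = torch.rand_like(ids, dtype=torch.float32) < self._prob
        return torch.where(mask, torch.full_like(ids, self._unk_id), ids)


class SketchModel(torch.nn.Module):
    def __init__(self, num_words: int, num_chars: int, num_tags: int,
                 we_dim: int = 64, cle_dim: int = 32, rnn_dim: int = 64) -> None:
        super().__init__()
        self._word_masking = WordMasking(prob=0.1)
        self._char_embedding = torch.nn.Embedding(num_chars, cle_dim)
        self._char_rnn = torch.nn.GRU(cle_dim, cle_dim, bidirectional=True)
        self._word_embedding = torch.nn.Embedding(num_words, we_dim)
        self._word_rnn = torch.nn.LSTM(we_dim + 2 * cle_dim, rnn_dim, bidirectional=True)
        self._output_layer = torch.nn.Linear(rnn_dim, num_tags)

    def forward(self, word_ids, unique_words, word_indices):
        # Mask and embed the flattened word ids, keeping the PackedSequence structure.
        hidden = word_ids._replace(data=self._word_masking(word_ids.data))
        hidden = hidden._replace(data=self._word_embedding(hidden.data))

        # Character-level embeddings: embed the characters of every unique word,
        # run the bidirectional GRU, and concatenate its two last hidden states.
        cle = unique_words._replace(data=self._char_embedding(unique_words.data))
        _, state = self._char_rnn(cle)                  # state: [2, num_unique_words, cle_dim]
        cle = torch.cat([state[0], state[1]], dim=-1)   # [num_unique_words, 2 * cle_dim]

        # Look up the character-level embedding of every word occurrence in the batch.
        cle = torch.nn.functional.embedding(word_indices.data, cle)

        # Concatenate word and character-level embeddings, run the word-level RNN,
        # and sum the forward and backward outputs.
        hidden = hidden._replace(data=torch.cat([hidden.data, cle], dim=-1))
        hidden, _ = self._word_rnn(hidden)
        forward_out, backward_out = hidden.data.chunk(2, dim=-1)
        hidden = hidden._replace(data=forward_out + backward_out)

        # Tag logits with the same PackedSequence structure as `word_ids`.
        return hidden._replace(data=self._output_layer(hidden.data))


# Smoke test with toy ids (not MorphoDataset data): two sentences, five unique words.
pack = lambda seqs: torch.nn.utils.rnn.pack_sequence(seqs, enforce_sorted=False)
model = SketchModel(num_words=10, num_chars=20, num_tags=5)
word_ids = pack([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])
unique_words = pack([torch.tensor([5, 6]), torch.tensor([7]), torch.tensor([8, 9, 10]),
                     torch.tensor([11]), torch.tensor([12, 13])])
word_indices = pack([torch.tensor([0, 1, 2]), torch.tensor([3, 4])])
logits = model(word_ids, unique_words, word_indices)  # PackedSequence of per-word tag logits
```

Summing the two RNN directions (rather than concatenating them) mirrors the `tagger_we.packed` instruction above; concatenation would simply double the input size of the output layer.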
@@ -145,31 +145,31 @@ def compute_metrics(self, y_pred, y_true, *xs):
 class TrainableDataset(npfl138.TransformedDataset):
     def transform(self, example):
         # TODO(tagger_we): Construct a single example, each consisting of the following pair:
-        # - a PyTorch tensor of integer ids of input forms as input,
+        # - a PyTorch tensor of integer ids of input words as input,
         # - a PyTorch tensor of integer tag ids as targets.
-        # To create the ids, use `word_vocab` of `self.dataset.forms` and `self.dataset.tags`.
-        form_ids = ...
+        # To create the ids, use `word_vocab` of `self.dataset.words` and `self.dataset.tags`.
+        word_ids = ...
         tag_ids = ...
         # Note that compared to `tagger_we`, we also return the original
-        # forms in order to be able to compute the character-level embeddings.
-        return form_ids, example["forms"], tag_ids
+        # words in order to be able to compute the character-level embeddings.
+        return word_ids, example["words"], tag_ids

     def collate(self, batch):
         # Construct a single batch, where `data` is a list of examples
         # generated by `transform`.
-        form_ids, forms, tag_ids = zip(*batch)
-        # TODO(tagger_we.packed): Combine `form_ids` into a `PackedSequence` by calling
+        word_ids, words, tag_ids = zip(*batch)
+        # TODO(tagger_we.packed): Combine `word_ids` into a `PackedSequence` by calling
         # `torch.nn.utils.rnn.pack_sequence` with `enforce_sorted=False`.
-        form_ids = ...
+        word_ids = ...
         # TODO: Create required inputs for the character-level embeddings using
-        # the provided `self.dataset.cle_batch_packed` function on `forms`. The function
+        # the provided `self.dataset.cle_batch_packed` function on `words`. The function
         # returns a pair of two PyTorch PackedSequences:
-        # - `unique_forms` containing each unique form as a sequence of character ids,
-        # - `forms_indices` containing for every form its index in `unique_forms`.
-        unique_forms, forms_indices = ...
-        # TODO(tagger_we): Process `tag_ids` analogously to `form_ids`.
+        # - `unique_words` containing each unique word as a sequence of character ids,
+        # - `words_indices` containing for every word its index in `unique_words`.
+        unique_words, words_indices = ...
+        # TODO(tagger_we): Process `tag_ids` analogously to `word_ids`.
         tag_ids = ...
-        return (form_ids, unique_forms, forms_indices), tag_ids
+        return (word_ids, unique_words, words_indices), tag_ids


 def main(args: argparse.Namespace) -> dict[str, float]:
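
For the data-pipeline hunk, a minimal sketch of how the `transform`/`collate` TODOs might be completed is shown below. The `word_vocab.indices(...)` call and the exact return value of `self.dataset.cle_batch_packed` are assumptions taken from the comments above, not verified API details.

```python
import torch
import npfl138


class TrainableDataset(npfl138.TransformedDataset):
    def transform(self, example):
        # Convert the words and tags of one sentence to integer ids; keep the
        # original word strings so that `collate` can build character-level inputs.
        # NOTE: `word_vocab.indices(...)` is an assumed vocabulary API.
        word_ids = torch.tensor(self.dataset.words.word_vocab.indices(example["words"]))
        tag_ids = torch.tensor(self.dataset.tags.word_vocab.indices(example["tags"]))
        return word_ids, example["words"], tag_ids

    def collate(self, batch):
        word_ids, words, tag_ids = zip(*batch)
        # Pack the variable-length sentences; they are not sorted by length.
        word_ids = torch.nn.utils.rnn.pack_sequence(word_ids, enforce_sorted=False)
        # Build the character-level inputs for every unique word in the batch.
        unique_words, words_indices = self.dataset.cle_batch_packed(words)
        tag_ids = torch.nn.utils.rnn.pack_sequence(tag_ids, enforce_sorted=False)
        return (word_ids, unique_words, words_indices), tag_ids
```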