
Commit 19a6346

Use "words" instead of "forms".
1 parent b13e7bd commit 19a6346

5 files changed, +97 -97 lines changed

labs/07/tagger_cle.packed.py

Lines changed: 32 additions & 32 deletions
@@ -54,15 +54,15 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         self._word_masking = ...

         # TODO: Create a `torch.nn.Embedding` layer for embedding the character ids
-        # from `train.forms.char_vocab` to dimensionality `args.cle_dim`.
+        # from `train.words.char_vocab` to dimensionality `args.cle_dim`.
         self._char_embedding = ...

         # TODO: Create a bidirectional `torch.nn.GRU` layer processing the character
         # embeddings, producing output of dimensionality `args.cle_dim`.
         self._char_rnn = ...

-        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the form ids
-        # from `train.forms.word_vocab` to dimensionality `args.we_dim`.
+        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the word ids
+        # from `train.words.word_vocab` to dimensionality `args.we_dim`.
         self._word_embedding = ...

         # TODO: Create an RNN layer, either `torch.nn.LSTM` or `torch.nn.GRU` depending
@@ -77,46 +77,46 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         # producing logits for tag prediction; `train.tags.word_vocab` is the tag vocabulary.
         self._output_layer = ...

-    def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence, unique_forms: torch.nn.utils.rnn.PackedSequence,
-                form_indices: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
+    def forward(self, word_ids: torch.nn.utils.rnn.PackedSequence, unique_words: torch.nn.utils.rnn.PackedSequence,
+                word_indices: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
         # The input arguments are `PackedSequence`s. A `PackedSequence` allows us to:
-        # - get the flattened data using `form_ids.data`; these are the data without
+        # - get the flattened data using `word_ids.data`; these are the data without
         # padding elements, i.e., a 1D vector of shape `[sum_of_sentence_lengths]`;
         # - replace the data while keeping the sizes of the original sequences
-        # by calling `form_ids._replace(data=...)` and getting a new `PackedSequence`.
+        # by calling `word_ids._replace(data=...)` and getting a new `PackedSequence`.
         # Therefore, depending on the context, we need to use either the flattened
         # data or the `PackedSequence` object.

-        # TODO: Mask the input `form_ids` using the `self._word_masking` layer.
+        # TODO: Mask the input `word_ids` using the `self._word_masking` layer.
         hidden = ...

-        # TODO: Embed the masked form IDs in `hidden` using the word embedding layer.
+        # TODO: Embed the masked word IDs in `hidden` using the word embedding layer.
         hidden = ...

-        # TODO: Embed the `unique_forms` using the character embedding layer.
+        # TODO: Embed the `unique_words` using the character embedding layer.
         cle = ...

         # TODO: Pass the character embeddings through the character-level RNN.
         # The input to the RNN should be a `PackedSequence` with the same structure
-        # as `unique_forms`. Note that this time we are interested only in the
+        # as `unique_words`. Note that this time we are interested only in the
         # second output of the GRU call (the last hidden state of the RNN).

         # TODO: Concatenate the states of the forward and backward directions (in this order).
         cle = ...

-        # TODO: With `cle` being the character-level embeddings of the unique forms
-        # of shape `[num_unique_forms, 2 * cle_dim]`, create the representation of the
-        # (not necessarily unique) sentence forms by indexing the character-level
-        # embeddings with the `form_indices`. The result should have an analogous structure
+        # TODO: With `cle` being the character-level embeddings of the unique words
+        # of shape `[num_unique_words, 2 * cle_dim]`, create the representation of the
+        # (not necessarily unique) sentence words by indexing the character-level
+        # embeddings with the `word_indices`. The result should have an analogous structure
         # to word embeddings in `hidden`, just with a different dimensionality of the
         # embedding. You can use for example the `torch.nn.functional.embedding` function.
         cle = ...

         # TODO: Concatenate the word embeddings with the character-level embeddings (in this order).
         hidden = ...

-        # TODO(tagger_we.packed): Process the embedded forms through the RNN layer, utilizing
-        # the `PackedSequence` structure of `form_ids` (i.e., the same sentence lengths).
+        # TODO(tagger_we.packed): Process the embedded words through the RNN layer, utilizing
+        # the `PackedSequence` structure of `word_ids` (i.e., the same sentence lengths).
         hidden = ...

         # TODO(tagger_we.packed): Sum the outputs of forward and backward directions.
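A note on the `PackedSequence` idiom described in the comments above (an illustrative standalone sketch with toy tensors and sizes; it is not part of this commit and omits the masking and CLE steps of the actual template):

    import torch

    # Two sentences of different lengths, packed without padding.
    packed = torch.nn.utils.rnn.pack_sequence(
        [torch.tensor([1, 2, 3]), torch.tensor([4, 5])], enforce_sorted=False)
    embedding = torch.nn.Embedding(10, 4)

    # Apply a layer to the flattened `.data` and wrap the result back into a
    # `PackedSequence` with the same sentence lengths via `_replace`.
    embedded = packed._replace(data=embedding(packed.data))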
@@ -126,7 +126,7 @@ def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence, unique_forms: torch.nn.utils.rnn.PackedSequence,
         hidden = ...

         # TODO(tagger_we.packed): Finally, produce output predictions as a `PackedSequence`
-        # with the same `PackedSequence` structure as `form_ids` (same sentence lengths).
+        # with the same `PackedSequence` structure as `word_ids` (same sentence lengths).
         hidden = ...

         return hidden
@@ -145,31 +145,31 @@ def compute_metrics(self, y_pred, y_true, *xs):
 class TrainableDataset(npfl138.TransformedDataset):
     def transform(self, example):
         # TODO(tagger_we): Construct a single example, each consisting of the following pair:
-        # - a PyTorch tensor of integer ids of input forms as input,
+        # - a PyTorch tensor of integer ids of input words as input,
         # - a PyTorch tensor of integer tag ids as targets.
-        # To create the ids, use `word_vocab` of `self.dataset.forms` and `self.dataset.tags`.
-        form_ids = ...
+        # To create the ids, use `word_vocab` of `self.dataset.words` and `self.dataset.tags`.
+        word_ids = ...
         tag_ids = ...
         # Note that compared to `tagger_we`, we also return the original
-        # forms in order to be able to compute the character-level embeddings.
-        return form_ids, example["forms"], tag_ids
+        # words in order to be able to compute the character-level embeddings.
+        return word_ids, example["words"], tag_ids

     def collate(self, batch):
         # Construct a single batch, where `data` is a list of examples
         # generated by `transform`.
-        form_ids, forms, tag_ids = zip(*batch)
-        # TODO(tagger_we.packed): Combine `form_ids` into a `PackedSequence` by calling
+        word_ids, words, tag_ids = zip(*batch)
+        # TODO(tagger_we.packed): Combine `word_ids` into a `PackedSequence` by calling
         # `torch.nn.utils.rnn.pack_sequence` with `enforce_sorted=False`.
-        form_ids = ...
+        word_ids = ...
         # TODO: Create required inputs for the character-level embeddings using
-        # the provided `self.dataset.cle_batch_packed` function on `forms`. The function
+        # the provided `self.dataset.cle_batch_packed` function on `words`. The function
         # returns a pair of two PyTorch PackedSequences:
-        # - `unique_forms` containing each unique form as a sequence of character ids,
-        # - `forms_indices` containing for every form its index in `unique_forms`.
-        unique_forms, forms_indices = ...
-        # TODO(tagger_we): Process `tag_ids` analogously to `form_ids`.
+        # - `unique_words` containing each unique word as a sequence of character ids,
+        # - `words_indices` containing for every word its index in `unique_words`.
+        unique_words, words_indices = ...
+        # TODO(tagger_we): Process `tag_ids` analogously to `word_ids`.
         tag_ids = ...
-        return (form_ids, unique_forms, forms_indices), tag_ids
+        return (word_ids, unique_words, words_indices), tag_ids


 def main(args: argparse.Namespace) -> dict[str, float]:
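As a usage note for the `pack_sequence` call requested in the collate TODO above, a minimal standalone sketch with made-up ids (not the assignment's actual collate):

    import torch

    word_ids = [torch.tensor([5, 2, 7]), torch.tensor([3, 1])]      # two sentences
    packed = torch.nn.utils.rnn.pack_sequence(word_ids, enforce_sorted=False)
    print(packed.data.shape)    # torch.Size([5]) -- all tokens, no padding
    print(packed.batch_sizes)   # tensor([2, 2, 1])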

labs/07/tagger_cle.py

Lines changed: 27 additions & 27 deletions
@@ -54,16 +54,16 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         self._word_masking = ...

         # TODO: Create a `torch.nn.Embedding` layer for embedding the character ids
-        # from `train.forms.char_vocab` to dimensionality `args.cle_dim`.
+        # from `train.words.char_vocab` to dimensionality `args.cle_dim`.
         self._char_embedding = ...

         # TODO: Create a bidirectional `torch.nn.GRU` layer processing the character
         # embeddings, producing output of dimensionality `args.cle_dim`; again, also pass
         # `batch_first=True` to the constructor.
         self._char_rnn = ...

-        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the form ids
-        # from `train.forms.word_vocab` to dimensionality `args.we_dim`.
+        # TODO(tagger_we): Create a `torch.nn.Embedding` layer, embedding the word ids
+        # from `train.words.word_vocab` to dimensionality `args.we_dim`.
         self._word_embedding = ...

         # TODO: Create an RNN layer, either `torch.nn.LSTM` or `torch.nn.GRU` depending
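Both layer constructors mentioned in these TODOs are standard PyTorch calls; a hedged standalone sketch with stand-in sizes (the real code would use `len(train.words.char_vocab)` and `args.cle_dim`, assuming the vocabulary supports `len()`):

    import torch

    num_chars, cle_dim = 100, 32    # stand-ins for the vocabulary size and `args.cle_dim`
    char_embedding = torch.nn.Embedding(num_chars, cle_dim)
    char_rnn = torch.nn.GRU(cle_dim, cle_dim, bidirectional=True, batch_first=True)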
@@ -78,14 +78,14 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         # producing logits for tag prediction; `train.tags.word_vocab` is the tag vocabulary.
         self._output_layer = ...

-    def forward(self, form_ids: torch.Tensor, unique_forms: torch.Tensor, form_indices: torch.Tensor) -> torch.Tensor:
-        # TODO: Mask the input `form_ids` using the `self._word_masking` layer.
+    def forward(self, word_ids: torch.Tensor, unique_words: torch.Tensor, word_indices: torch.Tensor) -> torch.Tensor:
+        # TODO: Mask the input `word_ids` using the `self._word_masking` layer.
         hidden = ...

-        # TODO: Embed the masked form IDs in `hidden` using the word embedding layer.
+        # TODO: Embed the masked word IDs in `hidden` using the word embedding layer.
         hidden = ...

-        # TODO: Embed the `unique_forms` using the character embedding layer.
+        # TODO: Embed the `unique_words` using the character embedding layer.
         cle = ...

         # TODO: Pass the character embeddings through the character-level RNN.
@@ -99,10 +99,10 @@ def forward(self, form_ids: torch.Tensor, unique_forms: torch.Tensor, form_indices: torch.Tensor) -> torch.Tensor:
         # TODO: Concatenate the states of the forward and backward directions (in this order).
         cle = ...

-        # TODO: With `cle` being the character-level embeddings of the unique forms
-        # of shape `[num_unique_forms, 2 * cle_dim]`, create the representation of the
-        # (not necessarily unique) sentence forms by indexing the character-level
-        # embeddings with the `form_indices`. The result should have a shape
+        # TODO: With `cle` being the character-level embeddings of the unique words
+        # of shape `[num_unique_words, 2 * cle_dim]`, create the representation of the
+        # (not necessarily unique) sentence words by indexing the character-level
+        # embeddings with the `word_indices`. The result should have a shape
         # `[batch_size, max_sentence_length, 2 * cle_dim]`. You can use for example
         # the `torch.nn.functional.embedding` function.
         cle = ...
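The `torch.nn.functional.embedding` trick mentioned above is simply an index-based gather of rows from a matrix; a toy sketch with illustrative shapes (not the assignment's real dimensions):

    import torch

    cle = torch.randn(6, 8)                      # [num_unique_words, 2 * cle_dim]
    word_indices = torch.tensor([[0, 3, 5, 0],   # [batch_size, max_sentence_length]
                                 [2, 1, 0, 0]])
    sentence_cle = torch.nn.functional.embedding(word_indices, cle)
    print(sentence_cle.shape)                    # torch.Size([2, 4, 8])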
@@ -137,34 +137,34 @@ def forward(self, form_ids: torch.Tensor, unique_forms: torch.Tensor, form_indices: torch.Tensor) -> torch.Tensor:
 class TrainableDataset(npfl138.TransformedDataset):
     def transform(self, example):
         # TODO(tagger_we): Construct a single example, each consisting of the following pair:
-        # - a PyTorch tensor of integer ids of input forms as input,
+        # - a PyTorch tensor of integer ids of input words as input,
         # - a PyTorch tensor of integer tag ids as targets.
-        # To create the ids, use `word_vocab` of `self.dataset.forms` and `self.dataset.tags`.
-        form_ids = ...
+        # To create the ids, use `word_vocab` of `self.dataset.words` and `self.dataset.tags`.
+        word_ids = ...
         tag_ids = ...
         # Note that compared to `tagger_we`, we also return the original
-        # forms in order to be able to compute the character-level embeddings.
-        return form_ids, example["forms"], tag_ids
+        # words in order to be able to compute the character-level embeddings.
+        return word_ids, example["words"], tag_ids

     def collate(self, batch):
         # Construct a single batch, where `data` is a list of examples
         # generated by `transform`.
-        form_ids, forms, tag_ids = zip(*batch)
-        # TODO(tagger_we): Combine `form_ids` into a single tensor, padding shorter
+        word_ids, words, tag_ids = zip(*batch)
+        # TODO(tagger_we): Combine `word_ids` into a single tensor, padding shorter
         # sequences to length of the longest sequence in the batch with zeros
         # using `torch.nn.utils.rnn.pad_sequence` with `batch_first=True` argument.
-        form_ids = ...
+        word_ids = ...
         # TODO: Create required inputs for the character-level embeddings using
-        # the provided `self.dataset.cle_batch` function on `forms`. The function
+        # the provided `self.dataset.cle_batch` function on `words`. The function
         # returns a pair of two PyTorch tensors:
-        # - `unique_forms` with shape `[num_unique_forms, max_form_length]` containing
-        # each unique form as a sequence of character ids,
-        # - `forms_indices` with shape `[num_sentences, max_sentence_length]`
-        # containing for every form its index in `unique_forms`.
-        unique_forms, forms_indices = ...
-        # TODO(tagger_we): Process `tag_ids` analogously to `form_ids`.
+        # - `unique_words` with shape `[num_unique_words, max_word_length]` containing
+        # each unique word as a sequence of character ids,
+        # - `words_indices` with shape `[num_sentences, max_sentence_length]`
+        # containing for every word its index in `unique_words`.
+        unique_words, words_indices = ...
+        # TODO(tagger_we): Process `tag_ids` analogously to `word_ids`.
         tag_ids = ...
-        return (form_ids, unique_forms, forms_indices), tag_ids
+        return (word_ids, unique_words, words_indices), tag_ids


 def main(args: argparse.Namespace) -> dict[str, float]:
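Similarly, the `pad_sequence` call requested in the collate TODO above pads shorter sentences with zeros up to the longest sentence in the batch; a small standalone sketch with toy data:

    import torch

    word_ids = [torch.tensor([5, 2, 7]), torch.tensor([3, 1])]
    padded = torch.nn.utils.rnn.pad_sequence(word_ids, batch_first=True)
    print(padded)    # tensor([[5, 2, 7],
                     #         [3, 1, 0]])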

labs/07/tagger_competition.py

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,7 @@
 import torchmetrics

 import npfl138
-npfl138.require_version("2425.7")
+npfl138.require_version("2425.7.1")
 from npfl138.datasets.morpho_dataset import MorphoDataset
 from npfl138.datasets.morpho_analyzer import MorphoAnalyzer

@@ -50,8 +50,8 @@ def main(args: argparse.Namespace) -> None:
         # the prediction for each sentence is a vector of shape `[exactly_sentence_len, num_tags]`.)
         predictions = model.predict(test, data_with_labels=True)

-        for predicted_tags, forms in zip(predictions, morpho.test.forms.strings):
-            for predicted_tag in predicted_tags[:, :len(forms)].argmax(axis=0):
+        for predicted_tags, words in zip(predictions, morpho.test.words.strings):
+            for predicted_tag in predicted_tags[:, :len(words)].argmax(axis=0):
                 print(morpho.train.tags.word_vocab.string(predicted_tag), file=predictions_file)
             print(file=predictions_file)

labs/07/tagger_we.packed.py

Lines changed: 20 additions & 20 deletions
@@ -8,7 +8,7 @@
 import torchmetrics

 import npfl138
-npfl138.require_version("2425.7")
+npfl138.require_version("2425.7.1")
 from npfl138.datasets.morpho_dataset import MorphoDataset

 parser = argparse.ArgumentParser()
@@ -30,8 +30,8 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         super().__init__()

         # Create all needed layers.
-        # TODO: Create a `torch.nn.Embedding` layer, embedding the form ids
-        # from `train.forms.word_vocab` to dimensionality `args.we_dim`.
+        # TODO: Create a `torch.nn.Embedding` layer, embedding the word ids
+        # from `train.words.word_vocab` to dimensionality `args.we_dim`.
         self._word_embedding = ...

         # TODO: Create an RNN layer, either `torch.nn.LSTM` or `torch.nn.GRU` depending
@@ -45,20 +45,20 @@ def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
         # producing logits for tag prediction; `train.tags.word_vocab` is the tag vocabulary.
         self._output_layer = ...

-    def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
-        # The input `form_ids` is a `PackedSequence` object. It allows us to:
-        # - get the flattened data using `form_ids.data`; these are the data without
+    def forward(self, word_ids: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
+        # The input `word_ids` is a `PackedSequence` object. It allows us to:
+        # - get the flattened data using `word_ids.data`; these are the data without
         # padding elements, i.e., a 1D vector of shape `[sum_of_sentence_lengths]`;
         # - replace the data while keeping the sizes of the original sequences
-        # by calling `form_ids._replace(data=...)` and getting a new `PackedSequence`.
+        # by calling `word_ids._replace(data=...)` and getting a new `PackedSequence`.
         # Therefore, depending on the context, we need to use either the flattened
         # data or the `PackedSequence` object.

-        # TODO: Start by embedding the `form_ids` using the word embedding layer.
+        # TODO: Start by embedding the `word_ids` using the word embedding layer.
         hidden = ...

-        # TODO: Process the embedded forms through the RNN layer, utilizing
-        # the `PackedSequence` structure of `form_ids` (i.e., the same sentence lengths).
+        # TODO: Process the embedded words through the RNN layer, utilizing
+        # the `PackedSequence` structure of `word_ids` (i.e., the same sentence lengths).
         hidden = ...

         # TODO: Sum the outputs of forward and backward directions.
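Regarding the last two TODOs above: `torch.nn.GRU` accepts a `PackedSequence` directly and returns a packed output whose flattened `.data` concatenates the forward and backward features along the last dimension; a rough sketch of the idiom with toy sizes (an illustration under those assumptions, not the assignment's solution):

    import torch

    rnn = torch.nn.GRU(4, 4, bidirectional=True)
    packed = torch.nn.utils.rnn.pack_sequence(
        [torch.randn(3, 4), torch.randn(2, 4)], enforce_sorted=False)
    outputs, _ = rnn(packed)     # outputs.data has shape [total_words, 2 * 4]
    # Sum the forward and backward halves and keep the packed structure.
    summed = outputs._replace(data=outputs.data[:, :4] + outputs.data[:, 4:])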
@@ -68,7 +68,7 @@ def forward(self, form_ids: torch.nn.utils.rnn.PackedSequence) -> torch.nn.utils.rnn.PackedSequence:
         hidden = ...

         # TODO: Finally, produce output predictions as a `PackedSequence` with the same
-        # `PackedSequence` structure as `form_ids` (same sentence lengths).
+        # `PackedSequence` structure as `word_ids` (same sentence lengths).
         hidden = ...

         return hidden
@@ -87,23 +87,23 @@ def compute_metrics(self, y_pred, y_true, *xs):
 class TrainableDataset(npfl138.TransformedDataset):
     def transform(self, example):
         # TODO: Construct a single example, each consisting of the following pair:
-        # - a PyTorch tensor of integer ids of input forms as input,
+        # - a PyTorch tensor of integer ids of input words as input,
         # - a PyTorch tensor of integer tag ids as targets.
-        # To create the ids, use `word_vocab` of `self.dataset.forms` and `self.dataset.tags`.
-        form_ids = ...
+        # To create the ids, use `word_vocab` of `self.dataset.words` and `self.dataset.tags`.
+        word_ids = ...
         tag_ids = ...
-        return form_ids, tag_ids
+        return word_ids, tag_ids

     def collate(self, batch):
         # Construct a single batch, where `data` is a list of examples
         # generated by `transform`.
-        form_ids, tag_ids = zip(*batch)
-        # TODO: Combine `form_ids` into a `PackedSequence` by calling
+        word_ids, tag_ids = zip(*batch)
+        # TODO: Combine `word_ids` into a `PackedSequence` by calling
         # `torch.nn.utils.rnn.pack_sequence` with `enforce_sorted=False`.
-        form_ids = ...
-        # TODO: Process `tag_ids` analogously to `form_ids`.
+        word_ids = ...
+        # TODO: Process `tag_ids` analogously to `word_ids`.
         tag_ids = ...
-        return form_ids, tag_ids
+        return word_ids, tag_ids


 def main(args: argparse.Namespace) -> dict[str, float]:
