
Commit a2fe3ca

Author: Flax Authors
Merge pull request #4745 from vfdev-5:fix-lm1b-nnx-example

PiperOrigin-RevId: 764936098
2 parents e4ab883 + 7def227

8 files changed: +23 −23 lines

examples/lm1b_nnx/README.md (5 additions, 6 deletions)

````diff
@@ -52,7 +52,7 @@ Then install Flax + the example dependencies:
 git clone --depth=1 --branch=main https://github.com/google/flax
 cd flax
 pip install -e .
-cd examples/lm1b
+cd examples/lm1b_nnx
 pip install -r requirements.txt
 ```
 
@@ -75,9 +75,9 @@ tensorboard --logdir=$HOME/logs
 You should expect to get numbers similar to these:
 
 
-Hardware | config | Training time | Loss | TensorBoard.dev | Workdir
--------- | ------- | ------------- | -------------- | ------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------
-TPU v3-8 | default | 13h18m | 3.127 | [2021-08-08](https://tensorboard.dev/experiment/n30WkNOZTJq3RHWD7wNslg/) | [gs://flax_public/examples/lm1b/default](https://console.cloud.google.com/storage/browser/flax_public/examples/lm1b/default)
+Hardware | config | Training time | Loss | Workdir
+-------- | ------- | ------------- | -------------- | --------------------------------------------------------------------------------------------------------------------------
+TPU v3-8 | default | 13h18m | 3.127 | [gs://flax_public/examples/lm1b/default](https://console.cloud.google.com/storage/browser/flax_public/examples/lm1b/default)
 
 ### Downloading the LM1B Datasets
 
@@ -87,6 +87,5 @@ data on a storage bucket, from where it can be loaded directly. Set the
 `TFDS_DATA_DIR` to your storage bucket path (`gs://<bucket name>`).
 
 You can download and prepare LM1B datasets using TFDS directly:
-`python -m tensorflow_datasets.scripts.download_and_prepare
---datasets=lm1b`
+`python -m tensorflow_datasets.scripts.download_and_prepare --datasets=lm1b`
 
````
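The repaired one-liner uses the TFDS download script; an equivalent Python sketch, assuming `tensorflow-datasets` is installed and with the bucket path left as a placeholder to fill in:

```python
# Sketch: prepare LM1B from Python instead of the CLI script shown above.
# Replace the data_dir placeholder with a local path or your bucket;
# this mirrors pointing TFDS_DATA_DIR at gs://<bucket name>.
import tensorflow_datasets as tfds

builder = tfds.builder('lm1b', data_dir='gs://<bucket name>')
builder.download_and_prepare()  # downloads and writes the prepared shards
```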

examples/lm1b_nnx/input_pipeline_test.py (2 additions, 2 deletions)

```diff
@@ -48,9 +48,9 @@ def _get_datasets(self):
     vocab_path = os.path.join(tempfile.mkdtemp(), 'sentencepiece_model')
 
     # Go two directories up to the root of the flax directory.
-    flax_root_dir = pathlib.Path(__file__).parents[4]
+    # "/path/to/flax/examples/lm1b_nnx/models_test.py" -> "/path/to/flax"
+    flax_root_dir = pathlib.Path(__file__).absolute().parents[2]
     data_dir = str(flax_root_dir) + '/.tfds/metadata'  # pylint: disable=unused-variable
-
     with tfds.testing.mock_data(num_examples=128, data_dir=data_dir):
       train_ds, eval_ds, predict_ds, _ = input_pipeline.get_datasets(
         n_devices=2, config=config, vocab_path=vocab_path
```
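The `parents[4]` → `parents[2]` fix works because `pathlib.Path.parents` counts ancestors starting from the immediate parent; a quick standalone check, using the illustrative path from the diff's comment:

```python
# Sketch: Path.parents indexing, with the illustrative path from the diff.
from pathlib import Path

p = Path('/path/to/flax/examples/lm1b_nnx/input_pipeline_test.py')
print(p.parents[0])  # /path/to/flax/examples/lm1b_nnx
print(p.parents[1])  # /path/to/flax/examples
print(p.parents[2])  # /path/to/flax  (the repository root)
```

Calling `.absolute()` first also matters: if `__file__` is a relative path, the chain of parents can run out before reaching the repository root.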

examples/lm1b_nnx/main.py (1 addition, 1 deletion)

```diff
@@ -34,7 +34,7 @@
     'File path to the training hyperparameter configuration.',
     lock_config=True,
 )
-flags.mark_flags_as_required(['config', 'workdir'])
+flags.mark_flags_as_required(['workdir'])
 
 
 def main(argv):
```
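Only `workdir` remains in `mark_flags_as_required`, consistent with the `config` flag being defined with a default via `config_flags.DEFINE_config_file`. A minimal sketch of the pattern; the default path below is an assumption for illustration, not taken from this commit:

```python
# Sketch of the absl/ml_collections flag pattern; the default config
# path below is assumed for illustration, not read from the commit.
from absl import flags
from ml_collections import config_flags

flags.DEFINE_string('workdir', None, 'Directory to store model data.')
config_flags.DEFINE_config_file(
    'config',
    'configs/default.py',  # a default makes --config optional
    'File path to the training hyperparameter configuration.',
    lock_config=True,
)
flags.mark_flags_as_required(['workdir'])  # only --workdir must be passed
```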

examples/lm1b_nnx/models.py (1 addition, 0 deletions)

```diff
@@ -292,6 +292,7 @@ def __init__(self, config: TransformerConfig, *, rngs: nnx.Rngs):
       broadcast_dropout=False,
       dropout_rate=config.attention_dropout_rate,
       rngs=rngs,
+      keep_rngs=False,
     )
     self.mlp = MlpBlock(config=config, rngs=rngs)
     self.dropout = nnx.Dropout(rate=config.dropout_rate)
```
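`keep_rngs=False` asks the attention layer not to retain the `Rngs` object as module state after construction, presumably so that the later `nnx.split(model, nnx.Param)` in `utils.py` sees parameters only. A construction sketch, assuming the `nnx` API at this commit; the feature sizes are hypothetical:

```python
# Sketch: building the attention block as in this diff.
# num_heads/in_features are hypothetical stand-ins for config values;
# keep_rngs=False (added by this commit) avoids storing RNG state
# on the module after initialization.
from flax import nnx

attention = nnx.MultiHeadAttention(
    num_heads=8,             # hypothetical
    in_features=512,         # hypothetical
    broadcast_dropout=False,
    dropout_rate=0.1,        # stands in for config.attention_dropout_rate
    rngs=nnx.Rngs(0),
    keep_rngs=False,
)
```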

examples/lm1b_nnx/models_test.py (2 additions, 1 deletion)

```diff
@@ -34,7 +34,8 @@
 jax.config.update('jax_disable_most_optimizations', True)
 
 # add project_root to import lm1b Linen model
-project_root = str(Path(__file__).absolute().parents[4])
+# "/path/to/flax/examples/lm1b_nnx/models_test.py" -> "/path/to/flax"
+project_root = str(Path(__file__).absolute().parents[2])
 sys.path.append(project_root)
 from examples.lm1b.models import TransformerLM as TransformerLinen  # type: ignore[import-error]
 
```

examples/lm1b_nnx/train.py (7 additions, 11 deletions)

```diff
@@ -20,7 +20,6 @@
 # pytype: disable=wrong-arg-count
 # pytype: disable=attribute-error
 
-import collections
 import dataclasses
 import os
 
@@ -41,7 +40,6 @@
 from jax.sharding import PartitionSpec as P
 from utils import HasCache, TrainState
 
-from flax import linen as nn
 from flax import nnx
 from flax.training import checkpoints, common_utils
 
@@ -115,7 +113,7 @@ def compute_weighted_cross_entropy(
     targets, vocab_size, on_value=confidence, off_value=low_confidence
   )
 
-  loss = -jnp.sum(soft_targets * nn.log_softmax(logits), axis=-1)
+  loss = -jnp.sum(soft_targets * nnx.log_softmax(logits), axis=-1)
   loss = loss - normalizing_constant
 
   normalizing_factor = np.prod(targets.shape)
@@ -389,6 +387,7 @@ def train_and_evaluate(config: default.Config, workdir: str):
     workdir: Working directory for checkpoints and TF summaries. If this
       contains checkpoint training will be resumed from the latest checkpoint.
   """
+  workdir = os.path.abspath(workdir)
   tf.io.gfile.makedirs(workdir)
 
   vocab_path = config.vocab_path
@@ -440,18 +439,15 @@ def encode_strings(strs, max_len):
     max_len=max(config.max_target_length, config.max_eval_target_length),
     dropout_rate=config.dropout_rate,
     attention_dropout_rate=config.attention_dropout_rate,
-    kernel_init=nn.initializers.xavier_uniform(),
-    bias_init=nn.initializers.normal(stddev=1e-6),
+    kernel_init=nnx.initializers.xavier_uniform(),
+    bias_init=nnx.initializers.normal(stddev=1e-6),
     axis_rules=config.axis_rules,
   )
 
   # Mesh definition
   devices_array = utils.create_device_mesh(config)
   mesh = Mesh(devices_array, config.mesh_axes)
 
-  # print(mesh.shape)
-  # exit()
-
   start_step = 0
   rng = jax.random.PRNGKey(config.seed)
   rng, init_rng = jax.random.split(rng)
@@ -498,7 +494,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
       None,
     ),  # type: ignore
     out_shardings=(state_sharding, None),  # type: ignore
-    static_argnums=(2, 3),
+    static_argnames=("learning_rate_fn", "label_smoothing"),
     donate_argnums=0,
   )
 
@@ -509,7 +505,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
       data_sharding,
     ),  # type: ignore
     out_shardings=None,  # type: ignore
-    static_argnums=(2, 3),
+    static_argnames=("graphdef", "label_smoothing"),
   )
 
   # Since the inputs and rngkey args for predict_step will be batched,
@@ -575,7 +571,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
       h(step)
 
     # Periodic metric handling.
-    if step % config.eval_every_steps == 0 or is_last_step:
+    if (step > 0 and step % config.eval_every_steps == 0) or is_last_step:
       with report_progress.timed('training_metrics'):
         logging.info('Gathering training metrics.')
         train_metrics = common_utils.stack_forest(train_metrics)
```
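Replacing `static_argnums=(2, 3)` with `static_argnames` pins the static arguments by name rather than position, so the jit wrapper keeps working if the signature is reordered or the arguments are passed as keywords. A self-contained sketch; the function and argument names here are illustrative, not the example's own:

```python
# Sketch: static_argnames marks arguments as compile-time constants by
# name; each new static value triggers a fresh specialization.
import jax
import jax.numpy as jnp

def scale(x, factor, label_smoothing=0.0):
  return x * factor + label_smoothing

scale_jit = jax.jit(scale, static_argnames=('factor', 'label_smoothing'))

x = jnp.ones(3)
print(scale_jit(x, factor=2.0, label_smoothing=0.1))  # [2.1 2.1 2.1]
```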

examples/lm1b_nnx/utils.py (4 additions, 1 deletion)

```diff
@@ -159,7 +159,10 @@ def setup_initial_state(
   model = constructor(config, rng)
   graphdef, params = nnx.split(model, nnx.Param)
   state = TrainState.create(
-    apply_fn=graphdef.apply, params=params, tx=tx, graphdef=graphdef
+    apply_fn=graphdef.apply,
+    params=params,
+    tx=tx,
+    graphdef=graphdef,
   )
   state = jax.tree.map(_to_array, state)
   state_spec = nnx.get_partition_spec(state)
```
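The reflowed `TrainState.create` call sits right after `nnx.split`, which is what lets the parameters travel through jit and sharding machinery as a plain pytree. A minimal round-trip sketch; `nnx.Linear` stands in for the transformer built by `constructor(...)`:

```python
# Sketch: nnx.split / nnx.merge round trip, as used in setup_initial_state.
# nnx.Linear stands in for the transformer built by constructor(...).
from flax import nnx

model = nnx.Linear(2, 3, rngs=nnx.Rngs(0))
graphdef, params = nnx.split(model, nnx.Param)  # static structure + pytree
rebuilt = nnx.merge(graphdef, params)           # reassemble the module
```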

tests/download_dataset_metadata.sh (1 addition, 1 deletion)

```diff
@@ -8,7 +8,7 @@
 
 set -e
 
-# Download TFDS metadata to flax/.tdfs/metadata directory.
+# Download TFDS metadata to flax/.tfds/metadata directory.
 # This allows the tests to specify the `data_dir` when using tfds.testing.mock_data().
 cd "$( dirname "$0" )"
 
```
