
Commit 34086ac

fix(moe): Fix OOM and HF requirements for CUDA path
Parent: 392fc9f

File tree

4 files changed (+10, -37 lines):

mixture_of_experts_pretraining/clm_datasets.py
mixture_of_experts_pretraining/docker/gpu/Dockerfile
mixture_of_experts_pretraining/model_utils_gpu.py
mixture_of_experts_pretraining/run_clm.py

mixture_of_experts_pretraining/clm_datasets.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -251,7 +251,7 @@ def mask_pad(examples):
 # need to run in cpu with single process
 # to walk around undefined `OmegaConf.register_new_resolver` need to overwrite `run_dir` `global_train_batch_size` `global_eval_batch_size`
 # python clm_datasets.py model.name_or_path=mistralai/Mixtral-8x22B-v0.1 run_dir=/tmp global_train_batch_size=1 global_eval_batch_size=1 max_length=32768
-@hydra.main(config_path="config", config_name="config")
+@hydra.main(version_base=None, config_path="config", config_name="config")
 def main(config: DictConfig):
     tokenizer = AutoTokenizer.from_pretrained(
         config.model.name_or_path,
```
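
The `version_base=None` argument matters on newer Hydra releases (>= 1.2): it opts into the legacy pre-1.1 defaults and suppresses the deprecation warning raised when the argument is omitted, so the standalone invocation shown in the comment keeps working. A minimal sketch of the decorator change (the config keys are whatever `config/config.yaml` defines, not part of this diff):

```python
# sketch only: standalone Hydra entry point with version_base pinned (Hydra >= 1.2)
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(version_base=None, config_path="config", config_name="config")
def main(config: DictConfig) -> None:
    # version_base=None keeps Hydra's legacy (pre-1.1) behaviour and silences
    # the "missing version_base" deprecation warning
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()
```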

mixture_of_experts_pretraining/docker/gpu/Dockerfile

Lines changed: 1 addition & 0 deletions
```diff
@@ -41,6 +41,7 @@ ENV PYTHONPATH "${PYTHONPATH}:/app/Megatron-LM"
 RUN pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger
 RUN pip install datasets==2.20.0 hydra-core sentencepiece
 RUN pip install "git+https://github.com/mlperf/logging.git"
+RUN pip install git+https://github.com/NVIDIA/NeMo-Run.git
 
 WORKDIR /app/training
 ADD . /app/training
```
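
The new dependency is pulled straight from GitHub at image build time. A quick sanity-check sketch, assuming the NeMo-Run distribution installs under the `nemo_run` module name (the import name is not stated in this diff):

```python
# sanity check sketch: fail early if NeMo-Run is missing from the image
import importlib.util

# "nemo_run" as the import name is an assumption about the NeMo-Run package
if importlib.util.find_spec("nemo_run") is None:
    raise SystemExit("NeMo-Run does not appear to be installed in this image")
```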

mixture_of_experts_pretraining/model_utils_gpu.py

Lines changed: 1 addition & 29 deletions
```diff
@@ -14,7 +14,6 @@
 limitations under the License.
 """
 
-import os
 
 import torch
 from megatron.core.optimizer import OptimizerConfig
@@ -26,37 +25,9 @@
 
 def setup_distributed(config):
     """Initialize torch.distributed."""
-    # Get rank and world size.
-    local_rank = int(os.getenv("LOCAL_RANK", 0))
-    rank = int(os.getenv("RANK", "0"))
-    world_size = int(os.getenv("WORLD_SIZE", "1"))
-
-    logging.info(
-        f"Initializing torch.distributed with local_rank: {local_rank}, rank: {rank}, world_size: {world_size}"
-    )
-
-    # Set the device id.
-    device = rank % torch.cuda.device_count()
-    if local_rank is not None:
-        device = local_rank
-    torch.cuda.set_device(device)
-
-    # Call the init process.
-    init_method = "tcp://"
-    master_ip = os.getenv("MASTER_ADDR", "localhost")
-    master_port = os.getenv("MASTER_PORT", "6000")
-    import datetime
-
-    DEFAULT_TIMEOUT = datetime.timedelta(minutes=60)
-    init_method += master_ip + ":" + master_port
     torch.distributed.init_process_group(
         backend="nccl",
-        timeout=DEFAULT_TIMEOUT,
-        world_size=world_size,
-        rank=rank,
-        init_method=init_method,
     )
-    return local_rank, rank, world_size
 
 
 def setup_model_and_trainer(
```
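
With the hand-rolled TCP rendezvous removed, `init_process_group(backend="nccl")` falls back to the default `env://` initialization and reads MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE from the environment, which torchrun already exports. A minimal sketch of the equivalent behaviour (the explicit device pinning here is an assumption, not part of this commit):

```python
# sketch of env:// initialization under torchrun; device pinning is assumed
import os

import torch


def setup_distributed() -> int:
    # With no init_method/rank/world_size arguments, init_process_group uses
    # the env:// rendezvous and reads the launcher-provided variables.
    torch.distributed.init_process_group(backend="nccl")
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)  # each rank pins its own GPU
    return local_rank
```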
```diff
@@ -124,6 +95,7 @@ def setup_model_and_trainer(
         fp16=False,
         params_dtype=torch.bfloat16,
         clip_grad=max_grad_norm,
+        use_distributed_optimizer=True,
     )
 
     if scheduler.name == "CosineAnnealing":
```
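
`use_distributed_optimizer=True` is presumably the OOM fix from the commit title: Megatron's distributed optimizer shards the fp32 master weights and Adam moments across the data-parallel group instead of replicating them on every rank. A back-of-the-envelope sketch of the per-rank saving (byte counts and the parallel layout are simplifying assumptions; tensor/pipeline/expert parallelism already splits the parameters themselves):

```python
# back-of-the-envelope only: typical Adam + bf16 mixed-precision byte counts
def optimizer_state_gib(num_params: float, dp_size: int, distributed: bool) -> float:
    per_param_bytes = 4 + 8  # fp32 master copy + two fp32 Adam moments
    if distributed:
        per_param_bytes /= dp_size  # sharded across the data-parallel group
    return num_params * per_param_bytes / 2**30


params = 141e9  # roughly Mixtral-8x22B scale, illustrative only
print(optimizer_state_gib(params, dp_size=8, distributed=False))  # ~1576 GiB per rank
print(optimizer_state_gib(params, dp_size=8, distributed=True))   # ~197 GiB per rank
```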

mixture_of_experts_pretraining/run_clm.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -85,16 +85,16 @@ def main(config: DictConfig):
     )
     logger.info(f"{config.eval_frequency=}")
 
-    tokenizer = AutoTokenizer.from_pretrained(
-        config.model.name_or_path,
-        add_eos_token=False,
-        add_bos_token=False,
-        use_fast=False,
-    )
-
     clmlogger = ClmLogger(config, filename="output.txt")
 
     if not USE_CUDA:
+        tokenizer = AutoTokenizer.from_pretrained(
+            config.model.name_or_path,
+            add_eos_token=False,
+            add_bos_token=False,
+            use_fast=False,
+        )
+
         config_path = os.path.join(config.run_dir, "config.yaml")
         with get_file(config_path, "w") as f:
             OmegaConf.save(config, f)
```
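
Moving the tokenizer construction under `if not USE_CUDA:` is presumably the "HF requirements" part of the fix: the gated mistralai/Mixtral-8x22B-v0.1 repository (and a Hugging Face token) is now only touched on the non-CUDA branch. A hypothetical helper expressing the same guard, not part of the commit:

```python
# hypothetical helper: only the non-CUDA path needs the gated HF tokenizer download
from transformers import AutoTokenizer


def maybe_load_tokenizer(use_cuda: bool, name_or_path: str):
    if use_cuda:
        return None  # CUDA/NeMo path: no HF download, no HF token required
    return AutoTokenizer.from_pretrained(
        name_or_path,
        add_eos_token=False,
        add_bos_token=False,
        use_fast=False,
    )
```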
