diff --git a/README.md b/README.md index 9b0c6ea26..f6025226e 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,21 @@ Each benchmark will run until the target quality is reached and then stop, print Some these benchmarks are rather slow or take a long time to run on the reference hardware. We expect to see significant performance improvements with more hardware and optimized implementations. +# MLPerf Training v5.0 (Submission Deadline May 2, 2025) + +* Framework here is given for the reference implementation. Submitters are free to use their own frameworks to run the benchmark. +* Model parameter count is not the same as the number of active parameters being trained in the benchmark. + +| Model | reference implementation | framework | dataset | model parameter count +| ---- | ---- | ---- | ---- | ---- +| retinanet | [single_stage_detector](https://github.com/mlcommons/training/tree/master/single_stage_detector) | pytorch | OpenImages | 37M +| stable_diffusion | [stable_diffusion](https://github.com/mlcommons/training/tree/master/stable_diffusion) | pytorch | LAION-400M-filtered | 865M +| bert | [language_model](https://github.com/mlcommons/training/tree/master/language_model/tensorflow/bert) | tensorflow | Wikipedia 2020/01/01 | 340M +| llama3.1_405b | [large_language_model_pretraining](https://github.com/mlcommons/training/tree/master/large_language_model_pretraining) | NeMo | C4 | 405B +| llama2_70b_lora | [llama2_70b_lora](https://github.com/mlcommons/training/tree/master/llama2_70b_lora) | pytorch | SCROLLS GovReport | 70B +| dlrm_dcnv2 | [recommendation_v2](https://github.com/mlcommons/training/tree/master/recommendation_v2/torchrec_dlrm) | torchrec | Criteo 3.5TB multi-hot | 167M +| rgat | [graph_neural_network](https://github.com/mlcommons/training/tree/master/graph_neural_network) | GLT | IGBFull | 25M + # MLPerf Training v4.1 (Submission Deadline Oct 11, 2024) *Framework here is given for the reference implementation. 
Submitters are free to use their own frameworks to run the benchmark. diff --git a/large_language_model/megatron-lm/.gitignore b/retired_benchmarks/gpt3/megatron-lm/.gitignore similarity index 100% rename from large_language_model/megatron-lm/.gitignore rename to retired_benchmarks/gpt3/megatron-lm/.gitignore diff --git a/large_language_model/megatron-lm/Dockerfile b/retired_benchmarks/gpt3/megatron-lm/Dockerfile similarity index 100% rename from large_language_model/megatron-lm/Dockerfile rename to retired_benchmarks/gpt3/megatron-lm/Dockerfile diff --git a/large_language_model/megatron-lm/LICENSE b/retired_benchmarks/gpt3/megatron-lm/LICENSE similarity index 100% rename from large_language_model/megatron-lm/LICENSE rename to retired_benchmarks/gpt3/megatron-lm/LICENSE diff --git a/large_language_model/megatron-lm/README.md b/retired_benchmarks/gpt3/megatron-lm/README.md similarity index 100% rename from large_language_model/megatron-lm/README.md rename to retired_benchmarks/gpt3/megatron-lm/README.md diff --git a/large_language_model/megatron-lm/checksums/additional_checkpoint_files/common.pt b/retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/common.pt similarity index 100% rename from large_language_model/megatron-lm/checksums/additional_checkpoint_files/common.pt rename to retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/common.pt diff --git a/large_language_model/megatron-lm/checksums/additional_checkpoint_files/metadata.json b/retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/metadata.json similarity index 100% rename from large_language_model/megatron-lm/checksums/additional_checkpoint_files/metadata.json rename to retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/metadata.json diff --git a/large_language_model/megatron-lm/checksums/dataset_checksum.log b/retired_benchmarks/gpt3/megatron-lm/checksums/dataset_checksum.log similarity index 100% rename from 
large_language_model/megatron-lm/checksums/dataset_checksum.log rename to retired_benchmarks/gpt3/megatron-lm/checksums/dataset_checksum.log diff --git a/large_language_model/megatron-lm/checksums/fp32_checkpoint_checksum.log b/retired_benchmarks/gpt3/megatron-lm/checksums/fp32_checkpoint_checksum.log similarity index 100% rename from large_language_model/megatron-lm/checksums/fp32_checkpoint_checksum.log rename to retired_benchmarks/gpt3/megatron-lm/checksums/fp32_checkpoint_checksum.log diff --git a/large_language_model/megatron-lm/megatron/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/__init__.py diff --git a/large_language_model/megatron-lm/megatron/arguments.py b/retired_benchmarks/gpt3/megatron-lm/megatron/arguments.py similarity index 100% rename from large_language_model/megatron-lm/megatron/arguments.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/arguments.py diff --git a/large_language_model/megatron-lm/megatron/checkpointing.py b/retired_benchmarks/gpt3/megatron-lm/megatron/checkpointing.py similarity index 100% rename from large_language_model/megatron-lm/megatron/checkpointing.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/checkpointing.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/__init__.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/core.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/core.py similarity index 100% rename from 
large_language_model/megatron-lm/megatron/core/dist_checkpointing/core.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/core.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/mapping.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/mapping.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/mapping.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/mapping.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/optimizer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/optimizer.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/optimizer.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/optimizer.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/serialization.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/serialization.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/serialization.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/serialization.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/__init__.py similarity index 100% rename from 
large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/__init__.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/base.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/base.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/base.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/base.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/zarr.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/zarr.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/strategies/zarr.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/zarr.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/common.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/common.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/common.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/common.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/test_correctness.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/test_correctness.py 
similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/test_correctness.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/test_correctness.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/test_load_check.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/test_load_check.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/tests/test_load_check.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/test_load_check.py diff --git a/large_language_model/megatron-lm/megatron/core/dist_checkpointing/utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/core/dist_checkpointing/utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/utils.py diff --git a/large_language_model/megatron-lm/megatron/data/Makefile b/retired_benchmarks/gpt3/megatron-lm/megatron/data/Makefile similarity index 100% rename from large_language_model/megatron-lm/megatron/data/Makefile rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/Makefile diff --git a/large_language_model/megatron-lm/megatron/data/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/__init__.py diff --git a/large_language_model/megatron-lm/megatron/data/autoaugment.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/autoaugment.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/autoaugment.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/autoaugment.py diff --git 
a/large_language_model/megatron-lm/megatron/data/biencoder_dataset_utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/biencoder_dataset_utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/biencoder_dataset_utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/biencoder_dataset_utils.py diff --git a/large_language_model/megatron-lm/megatron/data/blendable_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/blendable_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/blendable_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/blendable_dataset.py diff --git a/large_language_model/megatron-lm/megatron/data/data_samplers.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/data_samplers.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/data_samplers.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/data_samplers.py diff --git a/large_language_model/megatron-lm/megatron/data/dataset_utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/dataset_utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/dataset_utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/dataset_utils.py diff --git a/large_language_model/megatron-lm/megatron/data/gpt_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/gpt_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/gpt_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/gpt_dataset.py diff --git a/large_language_model/megatron-lm/megatron/data/helpers.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/data/helpers.cpp similarity index 100% rename from large_language_model/megatron-lm/megatron/data/helpers.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/helpers.cpp diff --git 
a/large_language_model/megatron-lm/megatron/data/ict_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/ict_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/ict_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/ict_dataset.py diff --git a/large_language_model/megatron-lm/megatron/data/indexed_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/indexed_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/indexed_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/indexed_dataset.py diff --git a/large_language_model/megatron-lm/megatron/data/orqa_wiki_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/orqa_wiki_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/orqa_wiki_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/orqa_wiki_dataset.py diff --git a/large_language_model/megatron-lm/megatron/data/realm_dataset_utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/realm_dataset_utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/realm_dataset_utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/realm_dataset_utils.py diff --git a/large_language_model/megatron-lm/megatron/data/realm_index.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/realm_index.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/realm_index.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/realm_index.py diff --git a/large_language_model/megatron-lm/megatron/data/test/test_indexed_dataset.py b/retired_benchmarks/gpt3/megatron-lm/megatron/data/test/test_indexed_dataset.py similarity index 100% rename from large_language_model/megatron-lm/megatron/data/test/test_indexed_dataset.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/test/test_indexed_dataset.py diff 
--git a/large_language_model/megatron-lm/megatron/data/test/test_preprocess_data.sh b/retired_benchmarks/gpt3/megatron-lm/megatron/data/test/test_preprocess_data.sh similarity index 100% rename from large_language_model/megatron-lm/megatron/data/test/test_preprocess_data.sh rename to retired_benchmarks/gpt3/megatron-lm/megatron/data/test/test_preprocess_data.sh diff --git a/large_language_model/megatron-lm/megatron/dist_signal_handler.py b/retired_benchmarks/gpt3/megatron-lm/megatron/dist_signal_handler.py similarity index 100% rename from large_language_model/megatron-lm/megatron/dist_signal_handler.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/dist_signal_handler.py diff --git a/large_language_model/megatron-lm/megatron/fp16_deprecated/loss_scaler.py b/retired_benchmarks/gpt3/megatron-lm/megatron/fp16_deprecated/loss_scaler.py similarity index 100% rename from large_language_model/megatron-lm/megatron/fp16_deprecated/loss_scaler.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/fp16_deprecated/loss_scaler.py diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/__init__.py diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/compat.h b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/compat.h similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/compat.h rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/compat.h diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cpp similarity index 100% rename from 
large_language_model/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cpp diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cu b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cu similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cu rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/fused_weight_gradient_dense.cu diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/layer_norm_cuda.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/layer_norm_cuda.cpp similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/layer_norm_cuda.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/layer_norm_cuda.cpp diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/layer_norm_cuda_kernel.cu similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/layer_norm_cuda_kernel.cu rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/layer_norm_cuda_kernel.cu diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.cpp similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.cpp diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.h b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.h similarity index 100% rename from 
large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.h rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax.h diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax_cuda.cu similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_masked_softmax_cuda.cu rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_masked_softmax_cuda.cu diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_softmax.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_softmax.cpp similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_softmax.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_softmax.cpp diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_softmax_cuda.cu b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_softmax_cuda.cu similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_softmax_cuda.cu rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_softmax_cuda.cu diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h similarity index 100% 
rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/tests/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/tests/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/__init__.py diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/tests/test_fused_kernels.py b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/test_fused_kernels.py similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/tests/test_fused_kernels.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/test_fused_kernels.py diff --git a/large_language_model/megatron-lm/megatron/fused_kernels/type_shim.h b/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/type_shim.h similarity index 100% rename from large_language_model/megatron-lm/megatron/fused_kernels/type_shim.h rename to retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/type_shim.h diff --git a/large_language_model/megatron-lm/megatron/global_vars.py b/retired_benchmarks/gpt3/megatron-lm/megatron/global_vars.py similarity index 100% rename from large_language_model/megatron-lm/megatron/global_vars.py 
rename to retired_benchmarks/gpt3/megatron-lm/megatron/global_vars.py diff --git a/large_language_model/megatron-lm/megatron/indexer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/indexer.py similarity index 100% rename from large_language_model/megatron-lm/megatron/indexer.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/indexer.py diff --git a/large_language_model/megatron-lm/megatron/initialize.py b/retired_benchmarks/gpt3/megatron-lm/megatron/initialize.py similarity index 100% rename from large_language_model/megatron-lm/megatron/initialize.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/initialize.py diff --git a/large_language_model/megatron-lm/megatron/learning_rates.py b/retired_benchmarks/gpt3/megatron-lm/megatron/learning_rates.py similarity index 100% rename from large_language_model/megatron-lm/megatron/learning_rates.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/learning_rates.py diff --git a/large_language_model/megatron-lm/megatron/memory.py b/retired_benchmarks/gpt3/megatron-lm/megatron/memory.py similarity index 100% rename from large_language_model/megatron-lm/megatron/memory.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/memory.py diff --git a/large_language_model/megatron-lm/megatron/microbatches.py b/retired_benchmarks/gpt3/megatron-lm/megatron/microbatches.py similarity index 100% rename from large_language_model/megatron-lm/megatron/microbatches.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/microbatches.py diff --git a/large_language_model/megatron-lm/megatron/model/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/__init__.py diff --git a/large_language_model/megatron-lm/megatron/model/biencoder_model.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/biencoder_model.py similarity index 100% rename 
from large_language_model/megatron-lm/megatron/model/biencoder_model.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/biencoder_model.py diff --git a/large_language_model/megatron-lm/megatron/model/classification.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/classification.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/classification.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/classification.py diff --git a/large_language_model/megatron-lm/megatron/model/distributed.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/distributed.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/distributed.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/distributed.py diff --git a/large_language_model/megatron-lm/megatron/model/enums.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/enums.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/enums.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/enums.py diff --git a/large_language_model/megatron-lm/megatron/model/fused_bias_gelu.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_bias_gelu.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/fused_bias_gelu.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_bias_gelu.py diff --git a/large_language_model/megatron-lm/megatron/model/fused_layer_norm.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_layer_norm.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/fused_layer_norm.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_layer_norm.py diff --git a/large_language_model/megatron-lm/megatron/model/fused_softmax.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_softmax.py similarity index 100% rename from 
large_language_model/megatron-lm/megatron/model/fused_softmax.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/fused_softmax.py diff --git a/large_language_model/megatron-lm/megatron/model/gpt_model.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/gpt_model.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/gpt_model.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/gpt_model.py diff --git a/large_language_model/megatron-lm/megatron/model/language_model.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/language_model.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/language_model.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/language_model.py diff --git a/large_language_model/megatron-lm/megatron/model/module.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/module.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/module.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/module.py diff --git a/large_language_model/megatron-lm/megatron/model/multiple_choice.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/multiple_choice.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/multiple_choice.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/multiple_choice.py diff --git a/large_language_model/megatron-lm/megatron/model/realm_model.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/realm_model.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/realm_model.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/realm_model.py diff --git a/large_language_model/megatron-lm/megatron/model/transformer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/transformer.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/transformer.py rename to 
retired_benchmarks/gpt3/megatron-lm/megatron/model/transformer.py diff --git a/large_language_model/megatron-lm/megatron/model/utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/model/utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/model/utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/model/utils.py diff --git a/large_language_model/megatron-lm/megatron/mpu/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/__init__.py diff --git a/large_language_model/megatron-lm/megatron/mpu/cross_entropy.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/cross_entropy.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/cross_entropy.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/cross_entropy.py diff --git a/large_language_model/megatron-lm/megatron/mpu/data.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/data.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/data.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/data.py diff --git a/large_language_model/megatron-lm/megatron/mpu/initialize.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/initialize.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/initialize.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/initialize.py diff --git a/large_language_model/megatron-lm/megatron/mpu/layers.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/layers.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/layers.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/layers.py diff --git a/large_language_model/megatron-lm/megatron/mpu/mappings.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/mappings.py similarity index 
100% rename from large_language_model/megatron-lm/megatron/mpu/mappings.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/mappings.py diff --git a/large_language_model/megatron-lm/megatron/mpu/random.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/random.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/random.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/random.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/__init__.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/commons.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/commons.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/commons.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/commons.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/test_cross_entropy.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_cross_entropy.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/test_cross_entropy.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_cross_entropy.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/test_data.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_data.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/test_data.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_data.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/test_initialize.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_initialize.py similarity index 100% rename from 
large_language_model/megatron-lm/megatron/mpu/tests/test_initialize.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_initialize.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/test_layers.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_layers.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/test_layers.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_layers.py diff --git a/large_language_model/megatron-lm/megatron/mpu/tests/test_random.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_random.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/tests/test_random.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/test_random.py diff --git a/large_language_model/megatron-lm/megatron/mpu/utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/mpu/utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/mpu/utils.py diff --git a/large_language_model/megatron-lm/megatron/optimizer/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/optimizer/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/__init__.py diff --git a/large_language_model/megatron-lm/megatron/optimizer/clip_grads.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/clip_grads.py similarity index 100% rename from large_language_model/megatron-lm/megatron/optimizer/clip_grads.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/clip_grads.py diff --git a/large_language_model/megatron-lm/megatron/optimizer/distrib_optimizer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/distrib_optimizer.py similarity index 100% rename from 
large_language_model/megatron-lm/megatron/optimizer/distrib_optimizer.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/distrib_optimizer.py diff --git a/large_language_model/megatron-lm/megatron/optimizer/grad_scaler.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/grad_scaler.py similarity index 100% rename from large_language_model/megatron-lm/megatron/optimizer/grad_scaler.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/grad_scaler.py diff --git a/large_language_model/megatron-lm/megatron/optimizer/optimizer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/optimizer.py similarity index 100% rename from large_language_model/megatron-lm/megatron/optimizer/optimizer.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer/optimizer.py diff --git a/large_language_model/megatron-lm/megatron/optimizer_param_scheduler.py b/retired_benchmarks/gpt3/megatron-lm/megatron/optimizer_param_scheduler.py similarity index 100% rename from large_language_model/megatron-lm/megatron/optimizer_param_scheduler.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/optimizer_param_scheduler.py diff --git a/large_language_model/megatron-lm/megatron/p2p_communication.py b/retired_benchmarks/gpt3/megatron-lm/megatron/p2p_communication.py similarity index 100% rename from large_language_model/megatron-lm/megatron/p2p_communication.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/p2p_communication.py diff --git a/large_language_model/megatron-lm/megatron/schedules.py b/retired_benchmarks/gpt3/megatron-lm/megatron/schedules.py similarity index 100% rename from large_language_model/megatron-lm/megatron/schedules.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/schedules.py diff --git a/large_language_model/megatron-lm/megatron/static/index.html b/retired_benchmarks/gpt3/megatron-lm/megatron/static/index.html similarity index 100% rename from large_language_model/megatron-lm/megatron/static/index.html 
rename to retired_benchmarks/gpt3/megatron-lm/megatron/static/index.html diff --git a/large_language_model/megatron-lm/megatron/text_generation/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/__init__.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/api.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/api.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/api.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/api.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/beam_utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/beam_utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/beam_utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/beam_utils.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/communication.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/communication.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/communication.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/communication.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/forward_step.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/forward_step.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/forward_step.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/forward_step.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/generation.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/generation.py similarity 
index 100% rename from large_language_model/megatron-lm/megatron/text_generation/generation.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/generation.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/sampling.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/sampling.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/sampling.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/sampling.py diff --git a/large_language_model/megatron-lm/megatron/text_generation/tokenization.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/tokenization.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation/tokenization.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/tokenization.py diff --git a/large_language_model/megatron-lm/megatron/text_generation_server.py b/retired_benchmarks/gpt3/megatron-lm/megatron/text_generation_server.py similarity index 100% rename from large_language_model/megatron-lm/megatron/text_generation_server.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/text_generation_server.py diff --git a/large_language_model/megatron-lm/megatron/timers.py b/retired_benchmarks/gpt3/megatron-lm/megatron/timers.py similarity index 100% rename from large_language_model/megatron-lm/megatron/timers.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/timers.py diff --git a/large_language_model/megatron-lm/megatron/tokenizer/__init__.py b/retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/__init__.py similarity index 100% rename from large_language_model/megatron-lm/megatron/tokenizer/__init__.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/__init__.py diff --git a/large_language_model/megatron-lm/megatron/tokenizer/bert_tokenization.py b/retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/bert_tokenization.py similarity index 
100% rename from large_language_model/megatron-lm/megatron/tokenizer/bert_tokenization.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/bert_tokenization.py diff --git a/large_language_model/megatron-lm/megatron/tokenizer/gpt2_tokenization.py b/retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/gpt2_tokenization.py similarity index 100% rename from large_language_model/megatron-lm/megatron/tokenizer/gpt2_tokenization.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/gpt2_tokenization.py diff --git a/large_language_model/megatron-lm/megatron/tokenizer/tokenizer.py b/retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/tokenizer.py similarity index 100% rename from large_language_model/megatron-lm/megatron/tokenizer/tokenizer.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/tokenizer.py diff --git a/large_language_model/megatron-lm/megatron/training.py b/retired_benchmarks/gpt3/megatron-lm/megatron/training.py similarity index 100% rename from large_language_model/megatron-lm/megatron/training.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/training.py diff --git a/large_language_model/megatron-lm/megatron/utils.py b/retired_benchmarks/gpt3/megatron-lm/megatron/utils.py similarity index 100% rename from large_language_model/megatron-lm/megatron/utils.py rename to retired_benchmarks/gpt3/megatron-lm/megatron/utils.py diff --git a/large_language_model/megatron-lm/pretrain_gpt.py b/retired_benchmarks/gpt3/megatron-lm/pretrain_gpt.py similarity index 100% rename from large_language_model/megatron-lm/pretrain_gpt.py rename to retired_benchmarks/gpt3/megatron-lm/pretrain_gpt.py diff --git a/large_language_model/megatron-lm/requirements.txt b/retired_benchmarks/gpt3/megatron-lm/requirements.txt similarity index 100% rename from large_language_model/megatron-lm/requirements.txt rename to retired_benchmarks/gpt3/megatron-lm/requirements.txt diff --git a/large_language_model/megatron-lm/run_gpt3.sh 
b/retired_benchmarks/gpt3/megatron-lm/run_gpt3.sh similarity index 100% rename from large_language_model/megatron-lm/run_gpt3.sh rename to retired_benchmarks/gpt3/megatron-lm/run_gpt3.sh diff --git a/large_language_model/megatron-lm/scripts/common_bf16.json b/retired_benchmarks/gpt3/megatron-lm/scripts/common_bf16.json similarity index 100% rename from large_language_model/megatron-lm/scripts/common_bf16.json rename to retired_benchmarks/gpt3/megatron-lm/scripts/common_bf16.json diff --git a/large_language_model/megatron-lm/scripts/common_fp32.json b/retired_benchmarks/gpt3/megatron-lm/scripts/common_fp32.json similarity index 100% rename from large_language_model/megatron-lm/scripts/common_fp32.json rename to retired_benchmarks/gpt3/megatron-lm/scripts/common_fp32.json diff --git a/large_language_model/megatron-lm/scripts/convert_paxml_to_megatron_distributed.py b/retired_benchmarks/gpt3/megatron-lm/scripts/convert_paxml_to_megatron_distributed.py similarity index 100% rename from large_language_model/megatron-lm/scripts/convert_paxml_to_megatron_distributed.py rename to retired_benchmarks/gpt3/megatron-lm/scripts/convert_paxml_to_megatron_distributed.py diff --git a/large_language_model/megatron-lm/scripts/json_to_torch.py b/retired_benchmarks/gpt3/megatron-lm/scripts/json_to_torch.py similarity index 100% rename from large_language_model/megatron-lm/scripts/json_to_torch.py rename to retired_benchmarks/gpt3/megatron-lm/scripts/json_to_torch.py diff --git a/large_language_model/megatron-lm/scripts/load_checkpoint.md b/retired_benchmarks/gpt3/megatron-lm/scripts/load_checkpoint.md similarity index 100% rename from large_language_model/megatron-lm/scripts/load_checkpoint.md rename to retired_benchmarks/gpt3/megatron-lm/scripts/load_checkpoint.md diff --git a/large_language_model/megatron-lm/scripts/load_checkpoint.py b/retired_benchmarks/gpt3/megatron-lm/scripts/load_checkpoint.py similarity index 100% rename from 
large_language_model/megatron-lm/scripts/load_checkpoint.py rename to retired_benchmarks/gpt3/megatron-lm/scripts/load_checkpoint.py diff --git a/large_language_model/megatron-lm/scripts/preprocess.sh b/retired_benchmarks/gpt3/megatron-lm/scripts/preprocess.sh similarity index 100% rename from large_language_model/megatron-lm/scripts/preprocess.sh rename to retired_benchmarks/gpt3/megatron-lm/scripts/preprocess.sh diff --git a/large_language_model/megatron-lm/scripts/preprocess_val.sh b/retired_benchmarks/gpt3/megatron-lm/scripts/preprocess_val.sh similarity index 100% rename from large_language_model/megatron-lm/scripts/preprocess_val.sh rename to retired_benchmarks/gpt3/megatron-lm/scripts/preprocess_val.sh diff --git a/large_language_model/megatron-lm/tasks/data_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/data_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/data_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/data_utils.py diff --git a/large_language_model/megatron-lm/tasks/ensemble_classifier.py b/retired_benchmarks/gpt3/megatron-lm/tasks/ensemble_classifier.py similarity index 100% rename from large_language_model/megatron-lm/tasks/ensemble_classifier.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/ensemble_classifier.py diff --git a/large_language_model/megatron-lm/tasks/eval_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/eval_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/eval_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/eval_utils.py diff --git a/large_language_model/megatron-lm/tasks/finetune_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/finetune_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/finetune_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/finetune_utils.py diff --git a/large_language_model/megatron-lm/tasks/glue/data.py 
b/retired_benchmarks/gpt3/megatron-lm/tasks/glue/data.py similarity index 100% rename from large_language_model/megatron-lm/tasks/glue/data.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/glue/data.py diff --git a/large_language_model/megatron-lm/tasks/glue/finetune.py b/retired_benchmarks/gpt3/megatron-lm/tasks/glue/finetune.py similarity index 100% rename from large_language_model/megatron-lm/tasks/glue/finetune.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/glue/finetune.py diff --git a/large_language_model/megatron-lm/tasks/glue/mnli.py b/retired_benchmarks/gpt3/megatron-lm/tasks/glue/mnli.py similarity index 100% rename from large_language_model/megatron-lm/tasks/glue/mnli.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/glue/mnli.py diff --git a/large_language_model/megatron-lm/tasks/glue/qqp.py b/retired_benchmarks/gpt3/megatron-lm/tasks/glue/qqp.py similarity index 100% rename from large_language_model/megatron-lm/tasks/glue/qqp.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/glue/qqp.py diff --git a/large_language_model/megatron-lm/tasks/main.py b/retired_benchmarks/gpt3/megatron-lm/tasks/main.py similarity index 100% rename from large_language_model/megatron-lm/tasks/main.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/main.py diff --git a/large_language_model/megatron-lm/tasks/orqa/README.md b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/README.md similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/README.md rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/README.md diff --git a/large_language_model/megatron-lm/tasks/orqa/evaluate_orqa.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/evaluate_orqa.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/evaluate_orqa.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/evaluate_orqa.py diff --git a/large_language_model/megatron-lm/tasks/orqa/evaluate_utils.py 
b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/evaluate_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/evaluate_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/evaluate_utils.py diff --git a/large_language_model/megatron-lm/tasks/orqa/supervised/data.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/data.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/supervised/data.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/data.py diff --git a/large_language_model/megatron-lm/tasks/orqa/supervised/eval_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/eval_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/supervised/eval_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/eval_utils.py diff --git a/large_language_model/megatron-lm/tasks/orqa/supervised/finetune.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/finetune.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/supervised/finetune.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/supervised/finetune.py diff --git a/large_language_model/megatron-lm/tasks/orqa/unsupervised/nq.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/nq.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/unsupervised/nq.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/nq.py diff --git a/large_language_model/megatron-lm/tasks/orqa/unsupervised/qa_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/qa_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/unsupervised/qa_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/qa_utils.py diff --git a/large_language_model/megatron-lm/tasks/orqa/unsupervised/tokenizers.py 
b/retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/tokenizers.py similarity index 100% rename from large_language_model/megatron-lm/tasks/orqa/unsupervised/tokenizers.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/orqa/unsupervised/tokenizers.py diff --git a/large_language_model/megatron-lm/tasks/race/data.py b/retired_benchmarks/gpt3/megatron-lm/tasks/race/data.py similarity index 100% rename from large_language_model/megatron-lm/tasks/race/data.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/race/data.py diff --git a/large_language_model/megatron-lm/tasks/race/finetune.py b/retired_benchmarks/gpt3/megatron-lm/tasks/race/finetune.py similarity index 100% rename from large_language_model/megatron-lm/tasks/race/finetune.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/race/finetune.py diff --git a/large_language_model/megatron-lm/tasks/vision/classification.py b/retired_benchmarks/gpt3/megatron-lm/tasks/vision/classification.py similarity index 100% rename from large_language_model/megatron-lm/tasks/vision/classification.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/vision/classification.py diff --git a/large_language_model/megatron-lm/tasks/vision/eval_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/vision/eval_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/vision/eval_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/vision/eval_utils.py diff --git a/large_language_model/megatron-lm/tasks/vision/finetune_utils.py b/retired_benchmarks/gpt3/megatron-lm/tasks/vision/finetune_utils.py similarity index 100% rename from large_language_model/megatron-lm/tasks/vision/finetune_utils.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/vision/finetune_utils.py diff --git a/large_language_model/megatron-lm/tasks/vision/main.py b/retired_benchmarks/gpt3/megatron-lm/tasks/vision/main.py similarity index 100% rename from large_language_model/megatron-lm/tasks/vision/main.py rename to 
retired_benchmarks/gpt3/megatron-lm/tasks/vision/main.py diff --git a/large_language_model/megatron-lm/tasks/zeroshot_gpt/datasets.py b/retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/datasets.py similarity index 100% rename from large_language_model/megatron-lm/tasks/zeroshot_gpt/datasets.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/datasets.py diff --git a/large_language_model/megatron-lm/tasks/zeroshot_gpt/detokenizer.py b/retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/detokenizer.py similarity index 100% rename from large_language_model/megatron-lm/tasks/zeroshot_gpt/detokenizer.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/detokenizer.py diff --git a/large_language_model/megatron-lm/tasks/zeroshot_gpt/evaluate.py b/retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/evaluate.py similarity index 100% rename from large_language_model/megatron-lm/tasks/zeroshot_gpt/evaluate.py rename to retired_benchmarks/gpt3/megatron-lm/tasks/zeroshot_gpt/evaluate.py diff --git a/large_language_model/megatron-lm/tools/preprocess_data.py b/retired_benchmarks/gpt3/megatron-lm/tools/preprocess_data.py similarity index 100% rename from large_language_model/megatron-lm/tools/preprocess_data.py rename to retired_benchmarks/gpt3/megatron-lm/tools/preprocess_data.py diff --git a/large_language_model/paxml/README.md b/retired_benchmarks/gpt3/paxml/README.md similarity index 100% rename from large_language_model/paxml/README.md rename to retired_benchmarks/gpt3/paxml/README.md diff --git a/large_language_model/paxml/c4.py b/retired_benchmarks/gpt3/paxml/c4.py similarity index 100% rename from large_language_model/paxml/c4.py rename to retired_benchmarks/gpt3/paxml/c4.py diff --git a/large_language_model/paxml/c4_mllog.py b/retired_benchmarks/gpt3/paxml/c4_mllog.py similarity index 100% rename from large_language_model/paxml/c4_mllog.py rename to retired_benchmarks/gpt3/paxml/c4_mllog.py diff --git 
a/large_language_model/paxml/lm_cloud.py b/retired_benchmarks/gpt3/paxml/lm_cloud.py similarity index 100% rename from large_language_model/paxml/lm_cloud.py rename to retired_benchmarks/gpt3/paxml/lm_cloud.py diff --git a/large_language_model/paxml/model_params.py b/retired_benchmarks/gpt3/paxml/model_params.py similarity index 100% rename from large_language_model/paxml/model_params.py rename to retired_benchmarks/gpt3/paxml/model_params.py diff --git a/large_language_model/paxml/utils/generate_spm.md b/retired_benchmarks/gpt3/paxml/utils/generate_spm.md similarity index 100% rename from large_language_model/paxml/utils/generate_spm.md rename to retired_benchmarks/gpt3/paxml/utils/generate_spm.md diff --git a/large_language_model/paxml/utils/load_ts_ckpt.md b/retired_benchmarks/gpt3/paxml/utils/load_ts_ckpt.md similarity index 100% rename from large_language_model/paxml/utils/load_ts_ckpt.md rename to retired_benchmarks/gpt3/paxml/utils/load_ts_ckpt.md diff --git a/large_language_model/paxml/utils/load_ts_ckpt.py b/retired_benchmarks/gpt3/paxml/utils/load_ts_ckpt.py similarity index 100% rename from large_language_model/paxml/utils/load_ts_ckpt.py rename to retired_benchmarks/gpt3/paxml/utils/load_ts_ckpt.py diff --git a/large_language_model/paxml/utils/select_example.md b/retired_benchmarks/gpt3/paxml/utils/select_example.md similarity index 100% rename from large_language_model/paxml/utils/select_example.md rename to retired_benchmarks/gpt3/paxml/utils/select_example.md diff --git a/large_language_model/paxml/utils/select_example.py b/retired_benchmarks/gpt3/paxml/utils/select_example.py similarity index 100% rename from large_language_model/paxml/utils/select_example.py rename to retired_benchmarks/gpt3/paxml/utils/select_example.py diff --git a/large_language_model/paxml/utils/select_text.py b/retired_benchmarks/gpt3/paxml/utils/select_text.py similarity index 100% rename from large_language_model/paxml/utils/select_text.py rename to 
retired_benchmarks/gpt3/paxml/utils/select_text.py diff --git a/mixture_of_experts_pretraining/README.md b/retired_benchmarks/mixtral8x22b/README.md similarity index 100% rename from mixture_of_experts_pretraining/README.md rename to retired_benchmarks/mixtral8x22b/README.md diff --git a/mixture_of_experts_pretraining/clm_datasets.py b/retired_benchmarks/mixtral8x22b/clm_datasets.py similarity index 100% rename from mixture_of_experts_pretraining/clm_datasets.py rename to retired_benchmarks/mixtral8x22b/clm_datasets.py diff --git a/mixture_of_experts_pretraining/config/config.yaml b/retired_benchmarks/mixtral8x22b/config/config.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/config.yaml rename to retired_benchmarks/mixtral8x22b/config/config.yaml diff --git a/mixture_of_experts_pretraining/config/dataset/c4_mlperf.yaml b/retired_benchmarks/mixtral8x22b/config/dataset/c4_mlperf.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/dataset/c4_mlperf.yaml rename to retired_benchmarks/mixtral8x22b/config/dataset/c4_mlperf.yaml diff --git a/mixture_of_experts_pretraining/config/dataset/wikitext.yaml b/retired_benchmarks/mixtral8x22b/config/dataset/wikitext.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/dataset/wikitext.yaml rename to retired_benchmarks/mixtral8x22b/config/dataset/wikitext.yaml diff --git a/mixture_of_experts_pretraining/config/experiment/convergence_template.yaml b/retired_benchmarks/mixtral8x22b/config/experiment/convergence_template.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/experiment/convergence_template.yaml rename to retired_benchmarks/mixtral8x22b/config/experiment/convergence_template.yaml diff --git a/mixture_of_experts_pretraining/config/experiment/gbs256_tpu.yaml b/retired_benchmarks/mixtral8x22b/config/experiment/gbs256_tpu.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/experiment/gbs256_tpu.yaml 
rename to retired_benchmarks/mixtral8x22b/config/experiment/gbs256_tpu.yaml diff --git a/mixture_of_experts_pretraining/config/model/blank_model.yaml b/retired_benchmarks/mixtral8x22b/config/model/blank_model.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/model/blank_model.yaml rename to retired_benchmarks/mixtral8x22b/config/model/blank_model.yaml diff --git a/mixture_of_experts_pretraining/config/sched/CosineAnnealing.yaml b/retired_benchmarks/mixtral8x22b/config/sched/CosineAnnealing.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/sched/CosineAnnealing.yaml rename to retired_benchmarks/mixtral8x22b/config/sched/CosineAnnealing.yaml diff --git a/mixture_of_experts_pretraining/config/sched/WarmupHoldPolicy.yaml b/retired_benchmarks/mixtral8x22b/config/sched/WarmupHoldPolicy.yaml similarity index 100% rename from mixture_of_experts_pretraining/config/sched/WarmupHoldPolicy.yaml rename to retired_benchmarks/mixtral8x22b/config/sched/WarmupHoldPolicy.yaml diff --git a/mixture_of_experts_pretraining/docker/gpu/Dockerfile b/retired_benchmarks/mixtral8x22b/docker/gpu/Dockerfile similarity index 100% rename from mixture_of_experts_pretraining/docker/gpu/Dockerfile rename to retired_benchmarks/mixtral8x22b/docker/gpu/Dockerfile diff --git a/mixture_of_experts_pretraining/docker/gpu/Dockerfile.GCP b/retired_benchmarks/mixtral8x22b/docker/gpu/Dockerfile.GCP similarity index 100% rename from mixture_of_experts_pretraining/docker/gpu/Dockerfile.GCP rename to retired_benchmarks/mixtral8x22b/docker/gpu/Dockerfile.GCP diff --git a/mixture_of_experts_pretraining/docker/gpu/build_and_push_image.sh b/retired_benchmarks/mixtral8x22b/docker/gpu/build_and_push_image.sh similarity index 100% rename from mixture_of_experts_pretraining/docker/gpu/build_and_push_image.sh rename to retired_benchmarks/mixtral8x22b/docker/gpu/build_and_push_image.sh diff --git a/mixture_of_experts_pretraining/docker/gpu/megatron_core.patch 
b/retired_benchmarks/mixtral8x22b/docker/gpu/megatron_core.patch similarity index 100% rename from mixture_of_experts_pretraining/docker/gpu/megatron_core.patch rename to retired_benchmarks/mixtral8x22b/docker/gpu/megatron_core.patch diff --git a/mixture_of_experts_pretraining/docker/tpu/Dockerfile b/retired_benchmarks/mixtral8x22b/docker/tpu/Dockerfile similarity index 100% rename from mixture_of_experts_pretraining/docker/tpu/Dockerfile rename to retired_benchmarks/mixtral8x22b/docker/tpu/Dockerfile diff --git a/mixture_of_experts_pretraining/docker/tpu/build_and_push_image.sh b/retired_benchmarks/mixtral8x22b/docker/tpu/build_and_push_image.sh similarity index 100% rename from mixture_of_experts_pretraining/docker/tpu/build_and_push_image.sh rename to retired_benchmarks/mixtral8x22b/docker/tpu/build_and_push_image.sh diff --git a/mixture_of_experts_pretraining/download_dataset.py b/retired_benchmarks/mixtral8x22b/download_dataset.py similarity index 100% rename from mixture_of_experts_pretraining/download_dataset.py rename to retired_benchmarks/mixtral8x22b/download_dataset.py diff --git a/mixture_of_experts_pretraining/file_utils.py b/retired_benchmarks/mixtral8x22b/file_utils.py similarity index 100% rename from mixture_of_experts_pretraining/file_utils.py rename to retired_benchmarks/mixtral8x22b/file_utils.py diff --git a/mixture_of_experts_pretraining/helm_context/Chart.yaml b/retired_benchmarks/mixtral8x22b/helm_context/Chart.yaml similarity index 100% rename from mixture_of_experts_pretraining/helm_context/Chart.yaml rename to retired_benchmarks/mixtral8x22b/helm_context/Chart.yaml diff --git a/mixture_of_experts_pretraining/helm_context/selected-configuration.yaml b/retired_benchmarks/mixtral8x22b/helm_context/selected-configuration.yaml similarity index 100% rename from mixture_of_experts_pretraining/helm_context/selected-configuration.yaml rename to retired_benchmarks/mixtral8x22b/helm_context/selected-configuration.yaml diff --git 
a/mixture_of_experts_pretraining/helm_context/templates/nemo-example.yaml b/retired_benchmarks/mixtral8x22b/helm_context/templates/nemo-example.yaml similarity index 100% rename from mixture_of_experts_pretraining/helm_context/templates/nemo-example.yaml rename to retired_benchmarks/mixtral8x22b/helm_context/templates/nemo-example.yaml diff --git a/mixture_of_experts_pretraining/helm_context/values.yaml b/retired_benchmarks/mixtral8x22b/helm_context/values.yaml similarity index 100% rename from mixture_of_experts_pretraining/helm_context/values.yaml rename to retired_benchmarks/mixtral8x22b/helm_context/values.yaml diff --git a/mixture_of_experts_pretraining/mixtral80.json b/retired_benchmarks/mixtral8x22b/mixtral80.json similarity index 100% rename from mixture_of_experts_pretraining/mixtral80.json rename to retired_benchmarks/mixtral8x22b/mixtral80.json diff --git a/mixture_of_experts_pretraining/mixtral822-instruct.json b/retired_benchmarks/mixtral8x22b/mixtral822-instruct.json similarity index 100% rename from mixture_of_experts_pretraining/mixtral822-instruct.json rename to retired_benchmarks/mixtral8x22b/mixtral822-instruct.json diff --git a/mixture_of_experts_pretraining/mixtral822.json b/retired_benchmarks/mixtral8x22b/mixtral822.json similarity index 100% rename from mixture_of_experts_pretraining/mixtral822.json rename to retired_benchmarks/mixtral8x22b/mixtral822.json diff --git a/mixture_of_experts_pretraining/mixtral87.json b/retired_benchmarks/mixtral8x22b/mixtral87.json similarity index 100% rename from mixture_of_experts_pretraining/mixtral87.json rename to retired_benchmarks/mixtral8x22b/mixtral87.json diff --git a/mixture_of_experts_pretraining/mlperf_logging_utils.py b/retired_benchmarks/mixtral8x22b/mlperf_logging_utils.py similarity index 100% rename from mixture_of_experts_pretraining/mlperf_logging_utils.py rename to retired_benchmarks/mixtral8x22b/mlperf_logging_utils.py diff --git a/mixture_of_experts_pretraining/model_utils_gpu.py 
b/retired_benchmarks/mixtral8x22b/model_utils_gpu.py similarity index 100% rename from mixture_of_experts_pretraining/model_utils_gpu.py rename to retired_benchmarks/mixtral8x22b/model_utils_gpu.py diff --git a/mixture_of_experts_pretraining/model_utils_tpu.py b/retired_benchmarks/mixtral8x22b/model_utils_tpu.py similarity index 100% rename from mixture_of_experts_pretraining/model_utils_tpu.py rename to retired_benchmarks/mixtral8x22b/model_utils_tpu.py diff --git a/mixture_of_experts_pretraining/run_clm.py b/retired_benchmarks/mixtral8x22b/run_clm.py similarity index 100% rename from mixture_of_experts_pretraining/run_clm.py rename to retired_benchmarks/mixtral8x22b/run_clm.py diff --git a/mixture_of_experts_pretraining/scripts/gpu/checkpoint_download.py b/retired_benchmarks/mixtral8x22b/scripts/gpu/checkpoint_download.py similarity index 100% rename from mixture_of_experts_pretraining/scripts/gpu/checkpoint_download.py rename to retired_benchmarks/mixtral8x22b/scripts/gpu/checkpoint_download.py diff --git a/mixture_of_experts_pretraining/scripts/gpu/dataset_preprocessing.py b/retired_benchmarks/mixtral8x22b/scripts/gpu/dataset_preprocessing.py similarity index 100% rename from mixture_of_experts_pretraining/scripts/gpu/dataset_preprocessing.py rename to retired_benchmarks/mixtral8x22b/scripts/gpu/dataset_preprocessing.py diff --git a/mixture_of_experts_pretraining/scripts/gpu/run.sub b/retired_benchmarks/mixtral8x22b/scripts/gpu/run.sub similarity index 100% rename from mixture_of_experts_pretraining/scripts/gpu/run.sub rename to retired_benchmarks/mixtral8x22b/scripts/gpu/run.sub diff --git a/mixture_of_experts_pretraining/scripts/tpu/distributed_checkpoint_saving.py b/retired_benchmarks/mixtral8x22b/scripts/tpu/distributed_checkpoint_saving.py similarity index 100% rename from mixture_of_experts_pretraining/scripts/tpu/distributed_checkpoint_saving.py rename to retired_benchmarks/mixtral8x22b/scripts/tpu/distributed_checkpoint_saving.py diff --git 
a/mixture_of_experts_pretraining/trainer_utils_tpu.py b/retired_benchmarks/mixtral8x22b/trainer_utils_tpu.py similarity index 100% rename from mixture_of_experts_pretraining/trainer_utils_tpu.py rename to retired_benchmarks/mixtral8x22b/trainer_utils_tpu.py