Migrate optimizer userbenchmarks to linux aws a100 runners (#2557)

janeyx99 · facebook-github-bot · commit 6f191e9e8d7d · 2024-12-13T12:23:36.000-08:00
Summary: The migration is larger than I thought -- I will test this time to ensure it's correct.
Tested here: https://github.com/pytorch/benchmark/actions/runs/12321260782/job/34392279551
Pull Request resolved: #2557
Reviewed By: kit1980
Differential Revision: D67211361
Pulled By: janeyx99
fbshipit-source-id: 686e07aab132c18c1fe1a0ffd444ec66f29802ef
diff --git a/.github/workflows/userbenchmark-regression-detector.yml b/.github/workflows/userbenchmark-regression-detector.yml
@@ -15,44 +15,41 @@ jobs:
     timeout-minutes: 1440 # 24 hours
     environment: docker-s3-upload
     env:
-      BASE_CONDA_ENV: "torchbench"
       CONDA_ENV: "optim"
       PLATFORM_NAME: "gcp_a100"
       TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-      SETUP_SCRIPT: "/workspace/setup_instance.sh"
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
     steps:
       - name: Checkout TorchBench
         uses: actions/checkout@v3
         with:
           path: benchmark
-      - name: Tune Nvidia GPU
+      - name: Install Conda
         run: |
-          sudo nvidia-smi -pm 1
-          sudo nvidia-smi -ac 1215,1410
-          nvidia-smi
-      - name: Clone and setup Conda env
-        run: |
-          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
-          conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
+          set -x
+          pushd benchmark
+          bash ./.ci/torchbench/install-conda.sh
       - name: Install TorchBench
         run: |
           set -x
-          . "${SETUP_SCRIPT}"
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
+          conda activate "${CONDA_ENV}"
           pushd benchmark
           # only install the subset of models currently running.
           python install.py BERT_pytorch DALLE2_pytorch hf_GPT2_large hf_T5_large resnet50 timm_vision_transformer_large yolov3
       - name: Print torch.version.git_version
         run: |
           set -x
-          . "${SETUP_SCRIPT}"
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
+          conda activate "${CONDA_ENV}"
           python -c "import torch; print(torch.version.git_version)"
       - name: Run optim user benchmark
         run: |
           set -x
-          . "${SETUP_SCRIPT}"
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
+          conda activate "${CONDA_ENV}"
           # remove old results
           if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
           pushd benchmark
@@ -65,7 +62,9 @@ jobs:
       - name: Detect potential regressions
         continue-on-error: true
         run: |
-          . "${SETUP_SCRIPT}"
+          set -x
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
+          conda activate "${CONDA_ENV}"
           pushd benchmark
           RESULTS=($(find ${PWD}/../benchmark-output -name "metrics-*.json" -maxdepth 2 | sort -r))
           # TODO: the following assumes only one metrics-*.json is found. It will keep
@@ -86,7 +85,8 @@ jobs:
             torchbench-perf-report
       - name: Upload result jsons to Scribe and S3
         run: |
-          . "${SETUP_SCRIPT}"
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
+          conda activate "${CONDA_ENV}"
           pushd benchmark
           RESULTS=($(find ${PWD}/../benchmark-output -name "metrics-*.json" -maxdepth 2 | sort -r))
           echo "Uploading result jsons: ${RESULTS}"
@@ -102,13 +102,13 @@ jobs:
       - name: Finally, error if errors.txt exists
         if: always()
         run: |
+          set -x
           # Do not error earlier as we want all artifacts and regressions to be reported first
           # TODO: potentially move errors.txt to benchmark-output so it gets uploaded to S3
           pushd benchmark
           if [ -e errors.txt ]; then cat errors.txt && exit 1; fi
       - name: Remove conda environment
         if: always()
         run: |
-          . "${SETUP_SCRIPT}"
-          conda deactivate && conda deactivate
+          . "${HOME}"/miniconda3/etc/profile.d/conda.sh
           conda remove -n "${CONDA_ENV}" --all