Skip to content

Commit 4078588

Browse files
committed
Improve the ci runners
1 parent d910b8a commit 4078588

File tree

7 files changed

+90
-140
lines changed

7 files changed

+90
-140
lines changed

scripts/torchbench_install.sh renamed to .ci/torchbench/install.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
. ${HOME}/miniconda3/etc/profile.d/conda.sh
22

3+
if [ -z "${BASE_CONDA_ENV}" ]; then
4+
echo "ERROR: BASE_CONDA_ENV is not set"
5+
exit 1
6+
fi
7+
38
if [ -z "${CONDA_ENV}" ]; then
49
echo "ERROR: CONDA_ENV is not set"
510
exit 1
@@ -10,10 +15,11 @@ if [ -z "${SETUP_SCRIPT}" ]; then
1015
exit 1
1116
fi
1217

13-
. "${SETUP_SCRIPT}"
18+
CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
19+
conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
1420
conda activate "${CONDA_ENV}"
1521

16-
parent_dir=$(dirname "$(readlink -f "$0")")/..
22+
parent_dir=$(dirname "$(readlink -f "$0")")/../..
1723
cd ${parent_dir}
1824

1925
python -c "import torch; print(torch.__version__); print(torch.version.git_version)"

.ci/torchbench/test.sh

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Run the TorchBench tests selected by TEST_CONFIG inside a conda environment.
#
# Required environment variables:
#   CONDA_ENV     name of the conda environment to activate
#   TEST_CONFIG   selector passed to `test.py -k`; the value 'cpu' additionally
#                 runs the subprocess worker component tests
#   SETUP_SCRIPT  path of a shell script that is sourced before activation
#
# Exits with status 1 if a required variable is unset or empty, and with a
# non-zero status as soon as any test command fails.

. "${HOME}/miniconda3/etc/profile.d/conda.sh"

# Validate the required variables in a fixed order so the first missing one
# is the one that gets reported.
for required_var in CONDA_ENV TEST_CONFIG SETUP_SCRIPT; do
    if [ -z "${!required_var}" ]; then
        echo "ERROR: ${required_var} is not set"
        exit 1
    fi
done

. "${SETUP_SCRIPT}"
conda activate "${CONDA_ENV}"

# Fail fast from here on: without this, a failing component test below would
# be masked because the script's exit status is that of the last command.
# Enabled only after the sourced setup/activation scripts, whose internals
# are not under this script's control.
set -e

# The script lives in <repo>/.ci/torchbench/, so the repository root is two
# levels up from its own directory.
parent_dir=$(dirname "$(readlink -f "$0")")/../..
cd "${parent_dir}"

# Test subprocess worker
if [[ "$TEST_CONFIG" == 'cpu' ]]; then
    python -m torchbenchmark._components.test.test_subprocess
    python -m torchbenchmark._components.test.test_worker
fi

# Test models
python test.py -v -k "$TEST_CONFIG"

.github/workflows/pr-a10g.yml renamed to .github/workflows/_linux-test-cpu.yml

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,19 @@
1-
name: TorchBench PR Test on A10G
2-
on:
3-
pull_request:
4-
workflow_dispatch:
5-
push:
6-
branches:
7-
- main
8-
9-
env:
10-
CONDA_ENV: "torchbench"
11-
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
12-
SETUP_SCRIPT: "/workspace/setup_instance.sh"
13-
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
14-
1+
name: linux-test
152

163
jobs:
17-
pr-test:
18-
# AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu
19-
# OS version: Amazon Linux 2
20-
runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu]
21-
timeout-minutes: 1440 # 24 hours
4+
pr-test-cpu:
5+
# Don't run on forked repos
6+
if: github.repository_owner == 'pytorch'
7+
runs-on: [linux.24xlarge]
8+
timeout-minutes: ${{ inputs.timeout-minutes }}
229
environment: docker-s3-upload
10+
env:
11+
BASE_CONDA_ENV: "torchbench"
12+
CONDA_ENV: "pr-test-cpu"
13+
DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest"
14+
SETUP_SCRIPT: "/workspace/setup_instance.sh"
15+
TEST_CONFIG: "cpu"
16+
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
2317
steps:
2418
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
2519
uses: pytorch/test-infra/.github/actions/setup-ssh@main
@@ -33,28 +27,26 @@ jobs:
3327
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
3428
with:
3529
docker-image: ${{ env.DOCKER_IMAGE }}
36-
- name: Install NVIDIA Driver, docker runtime, set GPU_FLAG
37-
id: install-nvidia-driver
38-
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
3930
- name: Install and Test TorchBench
4031
run: |
4132
container_name=$(docker run \
33+
-e BASE_CONDA_ENV="${BASE_CONDA_ENV}" \
4234
-e CONDA_ENV="${CONDA_ENV}" \
4335
-e SETUP_SCRIPT="${SETUP_SCRIPT}" \
4436
-e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \
37+
-e TEST_CONFIG="${TEST_CONFIG}" \
4538
--tty \
4639
--detach \
4740
--shm-size=32gb \
4841
-v "${PWD}/benchmark:/benchmark" \
49-
--gpus all \
5042
-w / \
5143
"${{ env.DOCKER_IMAGE }}" \
5244
tail -f /dev/null
5345
)
5446
echo "Container name: ${container_name}"
5547
docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark"
56-
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh
57-
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh
48+
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/install.sh
49+
docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/.ci/torchbench/test.sh
5850
- name: Teardown Linux
5951
uses: pytorch/test-infra/.github/actions/teardown-linux@main
6052
if: always()

.github/workflows/pr-gha-runner.yml renamed to .github/workflows/_linux-test-cuda.yml

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,18 @@
1-
name: TorchBench PR Test
2-
on:
3-
pull_request:
4-
workflow_dispatch:
5-
push:
6-
branches:
7-
- main
1+
name: linux-test
82

93
jobs:
10-
pr-test:
4+
pr-test-cuda:
5+
# Don't run on forked repos
6+
if: github.repository_owner == 'pytorch'
7+
runs-on: [a100-runner]
8+
timeout-minutes: ${{ inputs.timeout-minutes }}
9+
environment: docker-s3-upload
1110
env:
1211
BASE_CONDA_ENV: "torchbench"
13-
CONDA_ENV: "pr-ci-a100"
12+
CONDA_ENV: "pr-test-cuda"
1413
SETUP_SCRIPT: "/workspace/setup_instance.sh"
14+
TEST_CONFIG: "cuda"
1515
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
16-
runs-on: [a100-runner]
17-
timeout-minutes: 1440 # 24 hours
18-
environment: docker-s3-upload
1916
steps:
2017
- name: Checkout TorchBench
2118
uses: actions/checkout@v3
@@ -31,10 +28,10 @@ jobs:
3128
conda create --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
3229
- name: Install TorchBench
3330
run: |
34-
bash ./scripts/torchbench_install.sh
35-
- name: Validate benchmark components
31+
bash ./.ci/torchbench/install.sh
32+
- name: Test TorchBench
3633
run: |
37-
bash ./scripts/torchbench_test.sh
34+
bash ./.ci/torchbench/test.sh
3835
- name: Clean up Conda env
3936
if: always()
4037
run: |

.github/workflows/pr-gpu-stability-ci.yml

Lines changed: 0 additions & 74 deletions
This file was deleted.

.github/workflows/pr-test.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
# Entry-point workflow for PR and main-branch testing. It only dispatches to
# the reusable CPU and CUDA test workflows stored next to it.
name: TorchBench PR Test
on:
  pull_request:
  workflow_dispatch:
  push:
    branches:
      - main

jobs:
  # A job that calls a reusable workflow may not set `timeout-minutes` itself,
  # so the limit is forwarded as an input; the called workflows apply it via
  # `inputs.timeout-minutes`.
  # NOTE(review): the called workflows read secrets.HUGGING_FACE_HUB_TOKEN;
  # if that is a repository secret rather than an environment secret, these
  # jobs also need `secrets: inherit` - confirm.
  cpu-test:
    # Reusable workflows must be referenced under `.github/workflows/`.
    uses: ./.github/workflows/_linux-test-cpu.yml
    with:
      timeout-minutes: 120  # 2 hours
  cuda-test:
    uses: ./.github/workflows/_linux-test-cuda.yml
    with:
      timeout-minutes: 120  # 2 hours

# One active run per PR (or per commit for pushes); manual dispatches get
# their own group so they do not cancel, or get cancelled by, automatic runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

scripts/torchbench_test.sh

Lines changed: 0 additions & 24 deletions
This file was deleted.

0 commit comments

Comments
 (0)