Skip to content

Commit 68f8f38

Browse files
authored
Change dataset download scripts to use Cloudflare buckets directly instead of going through Nextcloud (#712)
1 parent e237206 commit 68f8f38

4 files changed

+18
-6
lines changed

stable_diffusion/scripts/datasets/coco2014-validation-download-prompts.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,8 @@ while [ "$1" != "" ]; do
1212
done
1313

1414
# Create the destination directory together with any missing parents.
# The expansion is quoted so a path containing spaces or glob characters
# stays a single argument, and the script stops if creation fails.
mkdir -p -- "${OUTPUT_DIR}" || exit 1
15-
wget -O ${OUTPUT_DIR}/val2014_30k.tsv -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k.tsv"
15+
16+
# Every download below relies on rclone; fail early with a clear message
# instead of a bare "command not found" if it is not installed.
command -v rclone >/dev/null 2>&1 || { echo "error: rclone is required but was not found in PATH" >&2; exit 1; }
# Define the "mlc-training" S3 remote (Cloudflare provider) in the user's
# rclone configuration; the copy commands refer to it by that name.
# NOTE(review): the access key pair is embedded in the script; presumably
# read-only credentials for the public bucket - confirm.
rclone config create mlc-training s3 \
  provider=Cloudflare \
  access_key_id=76ea42eadb867e854061a1806220ee1e \
  secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 \
  endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com || exit 1
17+
18+
# Copy val2014_30k.tsv from the bucket into the output directory (-P shows
# transfer progress). The destination is quoted so a path with spaces is
# passed to rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k.tsv "${OUTPUT_DIR}" -P
19+

stable_diffusion/scripts/datasets/coco2014-validation-download-stats.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,8 @@ while [ "$1" != "" ]; do
1212
done
1313

1414
# Create the destination directory together with any missing parents.
# The expansion is quoted so a path containing spaces or glob characters
# stays a single argument, and the script stops if creation fails.
mkdir -p -- "${OUTPUT_DIR}" || exit 1
15-
wget -O ${OUTPUT_DIR}/val2014_30k_stats.npz -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k_stats.npz"
15+
16+
# Every download below relies on rclone; fail early with a clear message
# instead of a bare "command not found" if it is not installed.
command -v rclone >/dev/null 2>&1 || { echo "error: rclone is required but was not found in PATH" >&2; exit 1; }
# Define the "mlc-training" S3 remote (Cloudflare provider) in the user's
# rclone configuration; the copy commands refer to it by that name.
# NOTE(review): the access key pair is embedded in the script; presumably
# read-only credentials for the public bucket - confirm.
rclone config create mlc-training s3 \
  provider=Cloudflare \
  access_key_id=76ea42eadb867e854061a1806220ee1e \
  secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 \
  endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com || exit 1
17+
18+
# Copy val2014_30k_stats.npz from the bucket into the output directory (-P
# shows transfer progress). The destination is quoted so a path with spaces
# is passed to rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k_stats.npz "${OUTPUT_DIR}" -P
19+

stable_diffusion/scripts/datasets/laion400m-filtered-download-images.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ mkdir -p ${OUTPUT_DIR}
1515
# Work inside the output directory: the sha512sum check further down reads
# sha512sums.txt from the current directory. Abort if the directory cannot
# be entered, then pin OUTPUT_DIR to the absolute path. The later commands
# still pass ${OUTPUT_DIR} as a destination, and a relative value would
# otherwise be resolved a second time from inside the directory itself.
cd "${OUTPUT_DIR}" || exit 1
OUTPUT_DIR=${PWD}
1616

1717

18-
for i in {00000..00831}; do wget -O ${OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/images-webdataset-filtered&files=${i}.tar"; done
18+
# Every download below relies on rclone; fail early with a clear message
# instead of a bare "command not found" if it is not installed.
command -v rclone >/dev/null 2>&1 || { echo "error: rclone is required but was not found in PATH" >&2; exit 1; }
# Define the "mlc-training" S3 remote (Cloudflare provider) in the user's
# rclone configuration; the copy commands refer to it by that name.
# NOTE(review): the access key pair is embedded in the script; presumably
# read-only credentials for the public bucket - confirm.
rclone config create mlc-training s3 \
  provider=Cloudflare \
  access_key_id=76ea42eadb867e854061a1806220ee1e \
  secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 \
  endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com || exit 1
1919

20-
wget -O ${OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/images-webdataset-filtered&files=sha512sums.txt"
20+
# Copy the *.tar files under images-webdataset-filtered/ into the output
# directory (-P shows transfer progress). The destination is quoted so a
# path with spaces is passed to rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/images-webdataset-filtered/ "${OUTPUT_DIR}" --include="*.tar" -P
21+
22+
# Copy the checksum list that the sha512sum step consumes into the output
# directory. The destination is quoted so a path with spaces is passed to
# rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/images-webdataset-filtered/sha512sums.txt "${OUTPUT_DIR}" -P
2123

2224
# Check the files in the current directory against sha512sums.txt;
# --quiet suppresses the per-file OK lines so only problems are printed.
sha512sum --check --quiet sha512sums.txt

stable_diffusion/scripts/datasets/laion400m-filtered-download-moments.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ mkdir -p ${OUTPUT_DIR}
1515
# Work inside the output directory: the sha512sum check further down reads
# sha512sums.txt from the current directory. Abort if the directory cannot
# be entered, then pin OUTPUT_DIR to the absolute path. The later commands
# still pass ${OUTPUT_DIR} as a destination, and a relative value would
# otherwise be resolved a second time from inside the directory itself.
cd "${OUTPUT_DIR}" || exit 1
OUTPUT_DIR=${PWD}
1616

1717

18-
for i in {00000..00831}; do wget -O ${OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=${i}.tar"; done
18+
# Every download below relies on rclone; fail early with a clear message
# instead of a bare "command not found" if it is not installed.
command -v rclone >/dev/null 2>&1 || { echo "error: rclone is required but was not found in PATH" >&2; exit 1; }
# Define the "mlc-training" S3 remote (Cloudflare provider) in the user's
# rclone configuration; the copy commands refer to it by that name.
# NOTE(review): the access key pair is embedded in the script; presumably
# read-only credentials for the public bucket - confirm.
rclone config create mlc-training s3 \
  provider=Cloudflare \
  access_key_id=76ea42eadb867e854061a1806220ee1e \
  secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 \
  endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com || exit 1
1919

20-
wget -O ${OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=sha512sums.txt"
20+
# Copy the *.tar files under moments-webdataset-filtered/ into the output
# directory (-P shows transfer progress). The destination is quoted so a
# path with spaces is passed to rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/ "${OUTPUT_DIR}" --include="*.tar" -P
21+
22+
# Copy the checksum list that the sha512sum step consumes into the output
# directory. The destination is quoted so a path with spaces is passed to
# rclone as one argument.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/sha512sums.txt "${OUTPUT_DIR}" -P
2123

2224
# Check the files in the current directory against sha512sums.txt;
# --quiet suppresses the per-file OK lines so only problems are printed.
sha512sum --check --quiet sha512sums.txt

0 commit comments

Comments
 (0)