@@ -115,7 +115,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
115
115
116
116
self .check_config ()
117
117
118
- if self .config [ 'slurm' ]. get ( 'ParallelClusterConfig' , {}). get ( 'Enable' , False ) :
118
+ if self .use_parallel_cluster :
119
119
self .create_vpc ()
120
120
121
121
self .check_regions_config ()
@@ -286,6 +286,9 @@ def check_config(self):
286
286
Check config, set defaults, and sanity check the configuration
287
287
'''
288
288
config_errors = 0
289
+
290
+ self .use_parallel_cluster = self .config ['slurm' ].get ('ParallelClusterConfig' , {}).get ('Enable' , False )
291
+
289
292
if self .stack_name :
290
293
if 'StackName' not in self .config :
291
294
logger .info (f"config/StackName set from command line: { self .stack_name } " )
@@ -309,10 +312,13 @@ def check_config(self):
309
312
logger .info (f"Domain defaulted to { self .config ['Domain' ]} " )
310
313
311
314
if 'ClusterName' not in self .config ['slurm' ]:
312
- self .config ['slurm' ]['ClusterName' ] = self .stack_name
315
+ if self .use_parallel_cluster :
316
+ self .config ['slurm' ]['ClusterName' ] = f"{ self .stack_name } -cl"
317
+ else :
318
+ self .config ['slurm' ]['ClusterName' ] = self .stack_name
313
319
logger .info (f"slurm/ClusterName defaulted to { self .config ['StackName' ]} " )
314
320
315
- if not self .config [ 'slurm' ]. get ( 'ParallelClusterConfig' , {}). get ( 'Enable' , False ) :
321
+ if not self .use_parallel_cluster :
316
322
if 'mount_path' not in self .config ['slurm' ]['storage' ]:
317
323
self .config ['slurm' ]['storage' ]['mount_path' ] = f"/opt/slurm/{ self .config ['slurm' ]['ClusterName' ]} "
318
324
if 'provider' not in self .config ['slurm' ]['storage' ]:
@@ -442,12 +448,12 @@ def check_config(self):
442
448
logger .error (f"Must specify existing ElasticSearch domain in slurm/JobCompLoc when slurm/JobCompType == jobcomp/elasticsearch and slurm/ElasticSearch is not set." )
443
449
config_errors += 1
444
450
445
- if self .config [ 'slurm' ]. get ( 'ParallelClusterConfig' , {}). get ( 'Enable' , False ) :
451
+ if self .use_parallel_cluster :
446
452
self .PARALLEL_CLUSTER_VERSION = parse_version (self .config ['slurm' ]['ParallelClusterConfig' ]['Version' ])
447
453
448
- if self .PARALLEL_CLUSTER_VERSION < parse_version ('3.7.0b1 ' ):
454
+ if self .PARALLEL_CLUSTER_VERSION < parse_version ('3.7.0 ' ):
449
455
if 'LoginNodes' in self .config ['slurm' ]['ParallelClusterConfig' ]:
450
- logger .error (f"slurm/ParallelClusterConfig/LoginNodes not supported before version 3.7.0b1 " )
456
+ logger .error (f"slurm/ParallelClusterConfig/LoginNodes not supported before version 3.7.0 " )
451
457
config_errors += 1
452
458
453
459
# Check for unsupported legacy config file options
@@ -513,6 +519,10 @@ def check_config(self):
513
519
logger .error (f"Config 'slurm/storage/{ key } ={ self .config ['slurm' ]['storage' ][key ]} ' is not supported with /slurm/ParallelClusterConfig/Enable." )
514
520
config_errors += 1
515
521
522
+ if self .config ['slurm' ]['ParallelClusterConfig' ]['Image' ]['Os' ] == 'centos7' and self .config ['slurm' ]['ParallelClusterConfig' ]['Architecture' ] != 'x86_64' :
523
+ logger .error (f'centos7 only supports x86_64 architecture. Update slurm/ParallelClusterConfig/Architecture.' )
524
+ config_errors += 1
525
+
516
526
# Make sure that slurm ports are the same as ParallelCluster
517
527
if self .config ['slurm' ]['SlurmCtl' ]['SlurmctldPortMin' ] != 6820 :
518
528
logger .warning (f"SlurmctldPortMin overridden to 6820 from { self .config ['slurm' ]['SlurmCtl' ]['SlurmctldPortMin' ]} to match ParallelCluster." )
@@ -608,7 +618,7 @@ def check_config(self):
608
618
logger .error (f"Must specify slurm/ParallelClusterConfig/Database/{ database_key } when slurm/ParallelClusterConfig/Database/[Database,EdaSlurmCluster]StackName not set" )
609
619
config_errors += 1
610
620
611
- for extra_mount_dict in self .config ['slurm' ][ 'storage' ][ 'ExtraMounts' ] :
621
+ for extra_mount_dict in self .config ['slurm' ]. get ( 'storage' , {}). get ( 'ExtraMounts' , {}) :
612
622
mount_dir = extra_mount_dict ['dest' ]
613
623
if 'StorageType' not in extra_mount_dict :
614
624
logger .error (f"ParallelCluster requires StorageType for { mount_dir } in slurm/storage/ExtraMounts" )
@@ -905,16 +915,22 @@ def check_regions_config(self):
905
915
self .instance_types .append (instance_type )
906
916
self .instance_types = sorted (self .instance_types )
907
917
908
- if self .config ['slurm' ].get ('ParallelClusterConfig' , {}).get ('Enable' , False ):
909
- # Filter the instance types by architecture due to PC limitation to x86
910
- x86_instance_types = []
918
+ if self .use_parallel_cluster :
919
+ # Filter the instance types by architecture due to PC limitation to 1 architecture
920
+ cluster_architecture = self .config ['slurm' ]['ParallelClusterConfig' ]['Architecture' ]
921
+ logger .info (f"ParallelCluster Architecture: { cluster_architecture } " )
922
+ filtered_instance_types = []
911
923
for instance_type in self .instance_types :
912
- architecture = self .plugin .get_architecture (self .config ['Region' ], instance_type )
913
- if architecture != self .config ['slurm' ]['ParallelClusterConfig' ]['Architecture' ]:
924
+ instance_architecture = self .plugin .get_architecture (self .config ['Region' ], instance_type )
925
+ if instance_architecture != cluster_architecture :
926
+ logger .warning (f"Excluding { instance_type } because architecture ({ instance_architecture } ) != { cluster_architecture } " )
914
927
continue
915
- x86_instance_types .append (instance_type )
916
- self .instance_types = x86_instance_types
928
+ filtered_instance_types .append (instance_type )
929
+ self .instance_types = filtered_instance_types
917
930
logger .info (f"ParallelCluster configured to use { len (self .instance_types )} instance types :\n { pp .pformat (self .instance_types )} " )
931
+ if len (self .instance_types ) == 0 :
932
+ logger .error (f"No instance type configured. Update slurm/InstanceConfig with { cluster_architecture } instance types." )
933
+ sys .exit (1 )
918
934
919
935
# Validate updated config against schema
920
936
from config_schema import check_schema
@@ -1143,7 +1159,7 @@ def create_security_groups(self):
1143
1159
1144
1160
# These are the security groups that have client access to mount the extra file systems
1145
1161
self .extra_mount_security_groups = {}
1146
- for fs_type in self .config ['slurm' ][ 'storage' ][ 'ExtraMountSecurityGroups' ] .keys ():
1162
+ for fs_type in self .config ['slurm' ]. get ( 'storage' , {}). get ( 'ExtraMountSecurityGroups' , {}) .keys ():
1147
1163
self .extra_mount_security_groups [fs_type ] = {}
1148
1164
for extra_mount_sg_name , extra_mount_sg_id in self .config ['slurm' ]['storage' ]['ExtraMountSecurityGroups' ][fs_type ].items ():
1149
1165
(allow_all_outbound , allow_all_ipv6_outbound ) = self .allow_all_outbound (extra_mount_sg_id )
@@ -1334,7 +1350,7 @@ def create_security_groups(self):
1334
1350
self .suppress_cfn_nag (self .slurmnode_sg , 'W27' , 'Correct, restricted range for lustre: 1021-1023' )
1335
1351
self .suppress_cfn_nag (self .slurmnode_sg , 'W29' , 'Correct, restricted range for lustre: 1021-1023' )
1336
1352
1337
- for fs_type in self .config ['slurm' ][ 'storage' ][ 'ExtraMountCidrs' ] .keys ():
1353
+ for fs_type in self .config ['slurm' ]. get ( 'storage' , {}). get ( 'ExtraMountCidrs' , {}) .keys ():
1338
1354
for extra_mount_cidr_name , extra_mount_cidr in self .config ['slurm' ]['storage' ]['ExtraMountCidrs' ][fs_type ].items ():
1339
1355
extra_mount_cidr = ec2 .Peer .ipv4 (extra_mount_cidr )
1340
1356
if fs_type in ['nfs' , 'zfs' ]:
@@ -2491,7 +2507,7 @@ def get_instance_template_vars(self, instance_role):
2491
2507
"ClusterName" : self .config ['slurm' ]['ClusterName' ],
2492
2508
"Domain" : self .config ['Domain' ],
2493
2509
"ERROR_SNS_TOPIC_ARN" : self .config ['ErrorSnsTopicArn' ],
2494
- "ExtraMounts" : self .config ['slurm' ][ 'storage' ][ 'ExtraMounts' ] ,
2510
+ "ExtraMounts" : self .config ['slurm' ]. get ( 'storage' , {}). get ( 'ExtraMounts' , {}) ,
2495
2511
"FileSystemDns" : self .file_system_dns ,
2496
2512
"FileSystemMountPath" : self .config ['slurm' ]['storage' ]['mount_path' ],
2497
2513
"FileSystemMountSrc" : self .file_system_mount_source ,
@@ -3561,7 +3577,7 @@ def suppress_cfn_nag(self, resource, msg_id, reason):
3561
3577
def create_parallel_cluster_config (self ):
3562
3578
MAX_NUMBER_OF_QUEUES = 50
3563
3579
MAX_NUMBER_OF_COMPUTE_RESOURCES = 50
3564
- if self .PARALLEL_CLUSTER_VERSION < parse_version ('3.7.0b1 ' ):
3580
+ if self .PARALLEL_CLUSTER_VERSION < parse_version ('3.7.0 ' ):
3565
3581
# ParallelCluster has a restriction where a queue can have only 1 instance type with memory based scheduling
3566
3582
# So, for now creating a queue for each instance type and purchase option
3567
3583
PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE = False
@@ -3870,7 +3886,7 @@ def create_parallel_cluster_config(self):
3870
3886
)
3871
3887
average_price = total_price / len (instance_types )
3872
3888
compute_resource ['Efa' ]['Enabled' ] = efa_supported
3873
- if self .PARALLEL_CLUSTER_VERSION >= parse_version ('3.7.0b1 ' ):
3889
+ if self .PARALLEL_CLUSTER_VERSION >= parse_version ('3.7.0 ' ):
3874
3890
compute_resource ['StaticNodePriority' ] = int (average_price * 1000 )
3875
3891
compute_resource ['DynamicNodePriority' ] = int (average_price * 10000 )
3876
3892
compute_resource ['Networking' ] = {
@@ -3974,7 +3990,7 @@ def create_parallel_cluster_config(self):
3974
3990
'InstanceType' : instance_type
3975
3991
}
3976
3992
)
3977
- if self .PARALLEL_CLUSTER_VERSION >= parse_version ('3.7.0b1 ' ):
3993
+ if self .PARALLEL_CLUSTER_VERSION >= parse_version ('3.7.0 ' ):
3978
3994
compute_resource ['StaticNodePriority' ] = int (price * 1000 )
3979
3995
compute_resource ['DynamicNodePriority' ] = int (price * 10000 )
3980
3996
parallel_cluster_queue ['ComputeResources' ].append (compute_resource )
@@ -4094,7 +4110,7 @@ def create_parallel_cluster_config(self):
4094
4110
self .parallel_cluster_config ['Scheduling' ]['SlurmSettings' ]['CustomSlurmSettings' ].append (slurm_settings_dict )
4095
4111
4096
4112
self .parallel_cluster_config ['SharedStorage' ] = []
4097
- for extra_mount_dict in self .config ['slurm' ][ 'storage' ][ 'ExtraMounts' ] :
4113
+ for extra_mount_dict in self .config ['slurm' ]. get ( 'storage' , {}). get ( 'ExtraMounts' , {}) :
4098
4114
mount_dir = extra_mount_dict ['dest' ]
4099
4115
storage_type = extra_mount_dict ['StorageType' ]
4100
4116
if storage_type == 'Efs' :
0 commit comments