
Commit 1ab863a

Check that ParallelCluster architecture is consistent with OS (#145)
CentOS 7 only supports x86_64, but the config was defaulting to arm64, so unless the architecture was explicitly set the cluster would fail to deploy. Set the default Architecture to be consistent with the default OS and added a check that the configured OS and architecture are compatible. Also tightened other config checks. Resolves #143
1 parent 22e766b commit 1ab863a
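A minimal sketch of the new consistency check (hypothetical standalone helper; the real code lives in check_config() below, reads self.config, and reports via logger.error and config_errors):

def check_os_architecture(os_name: str, architecture: str) -> list:
    '''Return config errors for OS/architecture pairs that ParallelCluster cannot build.'''
    errors = []
    # centos7 ParallelCluster images are x86_64 only, so arm64 must be rejected.
    if os_name == 'centos7' and architecture != 'x86_64':
        errors.append('centos7 only supports x86_64 architecture. '
                      'Update slurm/ParallelClusterConfig/Architecture.')
    return errors

assert check_os_architecture('centos7', 'arm64')       # rejected pair
assert not check_os_architecture('centos7', 'x86_64')  # valid pair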

2 files changed: +45 −23 lines

source/cdk/cdk_slurm_stack.py (+37 −21)

@@ -115,7 +115,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
 
         self.check_config()
 
-        if self.config['slurm'].get('ParallelClusterConfig', {}).get('Enable', False):
+        if self.use_parallel_cluster:
             self.create_vpc()
 
         self.check_regions_config()
@@ -286,6 +286,9 @@ def check_config(self):
         Check config, set defaults, and sanity check the configuration
         '''
         config_errors = 0
+
+        self.use_parallel_cluster = self.config['slurm'].get('ParallelClusterConfig', {}).get('Enable', False)
+
         if self.stack_name:
             if 'StackName' not in self.config:
                 logger.info(f"config/StackName set from command line: {self.stack_name}")
@@ -309,10 +312,13 @@ def check_config(self):
             logger.info(f"Domain defaulted to {self.config['Domain']}")
 
         if 'ClusterName' not in self.config['slurm']:
-            self.config['slurm']['ClusterName'] = self.stack_name
+            if self.use_parallel_cluster:
+                self.config['slurm']['ClusterName'] = f"{self.stack_name}-cl"
+            else:
+                self.config['slurm']['ClusterName'] = self.stack_name
             logger.info(f"slurm/ClusterName defaulted to {self.config['StackName']}")
 
-        if not self.config['slurm'].get('ParallelClusterConfig', {}).get('Enable', False):
+        if not self.use_parallel_cluster:
             if 'mount_path' not in self.config['slurm']['storage']:
                 self.config['slurm']['storage']['mount_path'] = f"/opt/slurm/{self.config['slurm']['ClusterName']}"
             if 'provider' not in self.config['slurm']['storage']:
@@ -442,12 +448,12 @@ def check_config(self):
                 logger.error(f"Must specify existing ElasticSearch domain in slurm/JobCompLoc when slurm/JobCompType == jobcomp/elasticsearch and slurm/ElasticSearch is not set.")
                 config_errors += 1
 
-        if self.config['slurm'].get('ParallelClusterConfig', {}).get('Enable', False):
+        if self.use_parallel_cluster:
            self.PARALLEL_CLUSTER_VERSION = parse_version(self.config['slurm']['ParallelClusterConfig']['Version'])
 
-            if self.PARALLEL_CLUSTER_VERSION < parse_version('3.7.0b1'):
+            if self.PARALLEL_CLUSTER_VERSION < parse_version('3.7.0'):
                 if 'LoginNodes' in self.config['slurm']['ParallelClusterConfig']:
-                    logger.error(f"slurm/ParallelClusterConfig/LoginNodes not supported before version 3.7.0b1")
+                    logger.error(f"slurm/ParallelClusterConfig/LoginNodes not supported before version 3.7.0")
                     config_errors += 1
 
             # Check for unsupported legacy config file options
@@ -513,6 +519,10 @@ def check_config(self):
                     logger.error(f"Config 'slurm/storage/{key}={self.config['slurm']['storage'][key]}' is not supported with /slurm/ParallelClusterConfig/Enable.")
                     config_errors += 1
 
+            if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] == 'centos7' and self.config['slurm']['ParallelClusterConfig']['Architecture'] != 'x86_64':
+                logger.error(f'centos7 only supports x86_64 architecture. Update slurm/ParallelClusterConfig/Architecture.')
+                config_errors += 1
+
             # Make sure that slurm ports are the same as ParallelCluster
             if self.config['slurm']['SlurmCtl']['SlurmctldPortMin'] != 6820:
                 logger.warning(f"SlurmctldPortMin overridden to 6820 from {self.config['slurm']['SlurmCtl']['SlurmctldPortMin']} to match ParallelCluster.")
@@ -608,7 +618,7 @@ def check_config(self):
                         logger.error(f"Must specify slurm/ParallelClusterConfig/Database/{database_key} when slurm/ParallelClusterConfig/Database/[Database,EdaSlurmCluster]StackName not set")
                         config_errors += 1
 
-            for extra_mount_dict in self.config['slurm']['storage']['ExtraMounts']:
+            for extra_mount_dict in self.config['slurm'].get('storage', {}).get('ExtraMounts', {}):
                 mount_dir = extra_mount_dict['dest']
                 if 'StorageType' not in extra_mount_dict:
                     logger.error(f"ParallelCluster requires StorageType for {mount_dir} in slurm/storage/ExtraMounts")
@@ -905,16 +915,22 @@ def check_regions_config(self):
                 self.instance_types.append(instance_type)
         self.instance_types = sorted(self.instance_types)
 
-        if self.config['slurm'].get('ParallelClusterConfig', {}).get('Enable', False):
-            # Filter the instance types by architecture due to PC limitation to x86
-            x86_instance_types = []
+        if self.use_parallel_cluster:
+            # Filter the instance types by architecture due to PC limitation to 1 architecture
+            cluster_architecture = self.config['slurm']['ParallelClusterConfig']['Architecture']
+            logger.info(f"ParallelCluster Architecture: {cluster_architecture}")
+            filtered_instance_types = []
             for instance_type in self.instance_types:
-                architecture = self.plugin.get_architecture(self.config['Region'], instance_type)
-                if architecture != self.config['slurm']['ParallelClusterConfig']['Architecture']:
+                instance_architecture = self.plugin.get_architecture(self.config['Region'], instance_type)
+                if instance_architecture != cluster_architecture:
+                    logger.warning(f"Excluding {instance_type} because architecture ({instance_architecture}) != {cluster_architecture}")
                     continue
-                x86_instance_types.append(instance_type)
-            self.instance_types = x86_instance_types
+                filtered_instance_types.append(instance_type)
+            self.instance_types = filtered_instance_types
             logger.info(f"ParallelCluster configured to use {len(self.instance_types)} instance types :\n{pp.pformat(self.instance_types)}")
+            if len(self.instance_types) == 0:
+                logger.error(f"No instance type configured. Update slurm/InstanceConfig with {cluster_architecture} instance types.")
+                sys.exit(1)
 
         # Validate updated config against schema
         from config_schema import check_schema
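The filtering above amounts to a small pure function; a sketch under assumed names (get_architecture stands in for self.plugin.get_architecture):

def filter_by_architecture(instance_types, cluster_architecture, get_architecture):
    '''Keep only instance types whose CPU architecture matches the cluster's.'''
    filtered_instance_types = []
    for instance_type in instance_types:
        if get_architecture(instance_type) != cluster_architecture:
            continue  # excluded: wrong architecture for this cluster
        filtered_instance_types.append(instance_type)
    return filtered_instance_types

# Example: an arm64 cluster keeps only the Graviton instance type.
arch = {'c5.large': 'x86_64', 'c6g.large': 'arm64'}.get
assert filter_by_architecture(['c5.large', 'c6g.large'], 'arm64', arch) == ['c6g.large']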
@@ -1143,7 +1159,7 @@ def create_security_groups(self):
 
         # These are the security groups that have client access to mount the extra file systems
         self.extra_mount_security_groups = {}
-        for fs_type in self.config['slurm']['storage']['ExtraMountSecurityGroups'].keys():
+        for fs_type in self.config['slurm'].get('storage', {}).get('ExtraMountSecurityGroups', {}).keys():
             self.extra_mount_security_groups[fs_type] = {}
             for extra_mount_sg_name, extra_mount_sg_id in self.config['slurm']['storage']['ExtraMountSecurityGroups'][fs_type].items():
                 (allow_all_outbound, allow_all_ipv6_outbound) = self.allow_all_outbound(extra_mount_sg_id)
@@ -1334,7 +1350,7 @@ def create_security_groups(self):
         self.suppress_cfn_nag(self.slurmnode_sg, 'W27', 'Correct, restricted range for lustre: 1021-1023')
         self.suppress_cfn_nag(self.slurmnode_sg, 'W29', 'Correct, restricted range for lustre: 1021-1023')
 
-        for fs_type in self.config['slurm']['storage']['ExtraMountCidrs'].keys():
+        for fs_type in self.config['slurm'].get('storage', {}).get('ExtraMountCidrs', {}).keys():
             for extra_mount_cidr_name, extra_mount_cidr in self.config['slurm']['storage']['ExtraMountCidrs'][fs_type].items():
                 extra_mount_cidr = ec2.Peer.ipv4(extra_mount_cidr)
                 if fs_type in ['nfs', 'zfs']:
@@ -2491,7 +2507,7 @@ def get_instance_template_vars(self, instance_role):
             "ClusterName": self.config['slurm']['ClusterName'],
             "Domain": self.config['Domain'],
             "ERROR_SNS_TOPIC_ARN": self.config['ErrorSnsTopicArn'],
-            "ExtraMounts": self.config['slurm']['storage']['ExtraMounts'],
+            "ExtraMounts": self.config['slurm'].get('storage', {}).get('ExtraMounts', {}),
             "FileSystemDns": self.file_system_dns,
             "FileSystemMountPath": self.config['slurm']['storage']['mount_path'],
             "FileSystemMountSrc": self.file_system_mount_source,
@@ -3561,7 +3577,7 @@ def suppress_cfn_nag(self, resource, msg_id, reason):
     def create_parallel_cluster_config(self):
         MAX_NUMBER_OF_QUEUES = 50
         MAX_NUMBER_OF_COMPUTE_RESOURCES = 50
-        if self.PARALLEL_CLUSTER_VERSION < parse_version('3.7.0b1'):
+        if self.PARALLEL_CLUSTER_VERSION < parse_version('3.7.0'):
             # ParallelCluster has a restriction where a queue can have only 1 instance type with memory based scheduling
             # So, for now creating a queue for each instance type and purchase option
             PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE = False
@@ -3870,7 +3886,7 @@ def create_parallel_cluster_config(self):
                 )
                 average_price = total_price / len(instance_types)
                 compute_resource['Efa']['Enabled'] = efa_supported
-                if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0b1'):
+                if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0'):
                     compute_resource['StaticNodePriority'] = int(average_price * 1000)
                     compute_resource['DynamicNodePriority'] = int(average_price * 10000)
                 compute_resource['Networking'] = {
@@ -3974,7 +3990,7 @@ def create_parallel_cluster_config(self):
                         'InstanceType': instance_type
                     }
                 )
-                if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0b1'):
+                if self.PARALLEL_CLUSTER_VERSION >= parse_version('3.7.0'):
                     compute_resource['StaticNodePriority'] = int(price * 1000)
                     compute_resource['DynamicNodePriority'] = int(price * 10000)
                 parallel_cluster_queue['ComputeResources'].append(compute_resource)
@@ -4094,7 +4110,7 @@ def create_parallel_cluster_config(self):
             self.parallel_cluster_config['Scheduling']['SlurmSettings']['CustomSlurmSettings'].append(slurm_settings_dict)
 
         self.parallel_cluster_config['SharedStorage'] = []
-        for extra_mount_dict in self.config['slurm']['storage']['ExtraMounts']:
+        for extra_mount_dict in self.config['slurm'].get('storage', {}).get('ExtraMounts', {}):
             mount_dir = extra_mount_dict['dest']
             storage_type = extra_mount_dict['StorageType']
             if storage_type == 'Efs':

source/cdk/config_schema.py (+8 −2)

@@ -81,7 +81,13 @@ def get_DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION(config):
     python_version = DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSIONS.get(parallel_cluster_version, str(DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION))
     return python_version
 
-PARALLEL_CLUSTER_ALLOWED_OSES = ['alinux2', 'centos7', 'rhel8', 'ubuntu2004', 'ubuntu2204']
+PARALLEL_CLUSTER_ALLOWED_OSES = [
+    'alinux2',
+    'centos7',
+    'rhel8',
+    'ubuntu2004',
+    'ubuntu2204'
+]
 
 DEFAULT_SLURM_VERSION = '23.02.1'
 def get_DEFAULT_SLURM_VERSION(config):
@@ -255,7 +261,7 @@ def get_config_schema(config):
                 Optional('Image', default={'Os': 'centos7'}): {
                     Optional('Os', default='centos7'): And(str, lambda s: s in PARALLEL_CLUSTER_ALLOWED_OSES)
                 },
-                Optional('Architecture', default='arm64'): And(str, lambda s: s in ['arm64', 'x86_64']),
+                Optional('Architecture', default='x86_64'): And(str, lambda s: s in ['arm64', 'x86_64']),
                 Optional('ComputeNodeAmi'): And(str, lambda s: s.startswith('ami-')),
                 Optional('DisableSimultaneousMultithreading', default=True): bool,
                 # Recommend to not use EFA unless necessary to avoid insufficient capacity errors when starting new instances in group or when multiple instance types in the group
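With the schema default changed, a config that sets neither Os nor Architecture now resolves to a compatible pair (illustrative dict; values taken from the schema above):

parallel_cluster_image_defaults = {
    'Image': {'Os': 'centos7'},  # default OS, x86_64 only
    'Architecture': 'x86_64',    # previously defaulted to 'arm64'
}

Previously the defaults yielded centos7 with arm64, which only failed at deployment; the new check in check_config() now flags that combination before any stack is created.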
