
Commit 2c93c97

Add support for ParallelCluster 3.12.0 (#296)
Resolves #295: update the FSxZ security groups to remove the outbound security group rules. Resolves #253: update the installer to check the status of the ParallelCluster stack after the config stack update completes, making sure that the stack exists and was deployed correctly, and reporting an error otherwise.
1 parent b7bc3d5 commit 2c93c97
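
The FSxZ change hinges on a simple version gate. A minimal sketch (not code from this commit; the helper name is illustrative) of the comparison that decides whether the pre-3.12.0 workaround rules are still needed:

    from packaging.version import parse as parse_version

    def needs_legacy_fsxz_rules(pc_version: str) -> bool:
        # ParallelCluster releases before 3.12.0 still need the extra FSxZ
        # rules that work around the outbound-traffic bug tracked in #253.
        return parse_version(pc_version) < parse_version('3.12.0')

    print(needs_legacy_fsxz_rules('3.11.1'))  # True
    print(needs_legacy_fsxz_rules('3.12.0'))  # False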

5 files changed: +67 −8 lines

create-slurm-security-groups/create-slurm-security-groups.py

+4 lines

@@ -61,6 +61,7 @@ def main(self):
         parser.add_argument("--fsxo-security-group-id", type=str, help="Id of security group attached to FSx for NetApp Ontap file systems.")
         parser.add_argument("--fsxz-security-group-id", type=str, help="Id of security group attached to FSx for OpenZfs file systems.")
         parser.add_argument("--cdk-cmd", type=str, choices=["deploy", "create", "update", "diff", "ls", "list", "synth", "synthesize", "destroy", "bootstrap"], default="create")
+        parser.add_argument("--min-pc-version", type=str, default="3.12.0", help="Minimum version of ParallelCluster being used. Used to control security group rules required by PC.")
         parser.add_argument("--debug", action='store_const', const=True, default=False, help="Enable CDK debug mode")
         args = parser.parse_args()

@@ -73,6 +74,9 @@ def main(self):
         # Must be passed to the stack.
         self.stack_parameters['region'] = args.region

+        logger.debug(f"min pc version: {args.min_pc_version}")
+        self.stack_parameters['min_parallel_cluster_version'] = args.min_pc_version
+
         # Retrieve the AWS Account ID for CDK
         sts_client = boto3.client("sts", region_name=args.region)
         try:

create-slurm-security-groups/create_slurm_security_groups/create_slurm_security_groups_stack.py

+17 −7 lines

@@ -8,6 +8,7 @@
 )
 from constructs import Construct
 import logging
+from packaging.version import parse as parse_version

 logger = logging.getLogger(__file__)
 logger_formatter = logging.Formatter('%(levelname)s: %(message)s')

@@ -28,6 +29,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
         logger.info(f"VpcId: {self.config['VpcId']}")
         self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId'])

+        self.min_parallel_cluster_version = self.node.try_get_context('min_parallel_cluster_version')
+        if self.min_parallel_cluster_version:
+            self.min_parallel_cluster_version = parse_version(self.min_parallel_cluster_version)
+
         security_groups = {}
         fsx_client_security_groups = {}
         lustre_security_groups = {}

@@ -105,6 +110,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
             )
         security_groups['SlurmdbdSG'] = slurmdbd_sg

+        if not self.min_parallel_cluster_version:
+            logger.info("This is a bootstrap so exiting early.")
+            exit(0)
+
         # Rules for compute nodes
         # Allow mounting of /opt/slurm from the head node.
         # This is needed in XIO VMs. ParallelCluster compute nodes have a local copy on their root volume.

@@ -153,14 +162,15 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
         fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
         fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
         # There is a bug in PC 3.10.1 that requires outbound traffic to be enabled even though ZFS doesn't.
-        # Remove when bug in PC is fixed.
+        # This bug was resolved in PC 3.12.0.
         # Tracked by https://github.com/aws-samples/aws-eda-slurm-cluster/issues/253
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
-        fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
+        if self.min_parallel_cluster_version < parse_version('3.12.0'):
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
+            fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")

         for sg_name, sg in security_groups.items():
             CfnOutput(self, f"{sg_name}Id",
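
The stack now imports parse_version from the packaging library (added to requirements.txt below). A quick illustration of why parsed versions are compared rather than raw strings:

    from packaging.version import parse as parse_version

    # Plain string comparison orders versions lexically and gets this wrong:
    print('3.9.0' < '3.12.0')                                 # False
    # parse_version compares the numeric components correctly:
    print(parse_version('3.9.0') < parse_version('3.12.0'))   # True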

create-slurm-security-groups/requirements.txt

+1 line

@@ -2,3 +2,4 @@ aws-cdk-lib==2.111.0
 boto3
 colored
 constructs>=10.0.0,<11.0.0
+packaging

source/cdk/config_schema.py

+17 lines

@@ -97,6 +97,8 @@
 # * Disable Pyxis Spack plugin by default
 # * Upgrade Python runtime to 3.12
 # * Upgrade libjwt to version 1.17.0.
+# 3.12.0:
+# * OpenZFS security group requirements fixed.
 MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
 # Update source/resources/default_config.yml with latest version when this is updated.
 PARALLEL_CLUSTER_VERSIONS = [

@@ -114,16 +116,19 @@
     '3.10.1',
     '3.11.0',
     '3.11.1',
+    '3.12.0',
 ]
 PARALLEL_CLUSTER_ENROOT_VERSIONS = {
     # This can be found on the head node by running 'yum info enroot'
     '3.11.0': '3.4.1', # confirmed
     '3.11.1': '3.4.1', # confirmed
+    '3.12.0': '3.4.1', # confirmed
 }
 PARALLEL_CLUSTER_PYXIS_VERSIONS = {
     # This can be found on the head node at /opt/parallelcluster/sources
     '3.11.0': '0.20.0', # confirmed
     '3.11.1': '0.20.0', # confirmed
+    '3.12.0': '0.20.0', # confirmed
 }
 PARALLEL_CLUSTER_MUNGE_VERSIONS = {
     # This can be found on the head node at /opt/parallelcluster/sources

@@ -142,6 +147,7 @@
     '3.10.1': '0.5.16', # confirmed
     '3.11.0': '0.5.16', # confirmed
     '3.11.1': '0.5.16', # confirmed
+    '3.12.0': '0.5.16', # confirmed
 }
 PARALLEL_CLUSTER_PYTHON_VERSIONS = {
     # This can be found on the head node at /opt/parallelcluster/pyenv/versions

@@ -159,6 +165,7 @@
     '3.10.1': '3.9.19', # confirmed
     '3.11.0': '3.9.20', # confirmed
     '3.11.1': '3.9.20', # confirmed
+    '3.12.0': '3.9.20', # confirmed
 }
 PARALLEL_CLUSTER_SLURM_VERSIONS = {
     # This can be found on the head node at /etc/chef/local-mode-cache/cache/

@@ -176,6 +183,7 @@
     '3.10.1': '23.11.7', # confirmed
     '3.11.0': '23.11.10', # confirmed
     '3.11.1': '23.11.10', # confirmed
+    '3.12.0': '23.11.10', # confirmed
 }
 PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
     # This can be found on the head node at /etc/chef/local-mode-cache/cache/

@@ -193,6 +201,7 @@
     '3.10.1': '23-11-7-1', # confirmed
     '3.11.0': '23-11-10-1', # confirmed
     '3.11.1': '23-11-10-1', # confirmed
+    '3.12.0': '23-11-10-1', # confirmed
 }
 SLURM_REST_API_VERSIONS = {
     '23-02-2-1': '0.0.39',

@@ -329,6 +338,14 @@ def get_PARALLEL_CLUSTER_LAMBDA_RUNTIME(parallel_cluster_version):
     else:
         return aws_lambda.Runtime.PYTHON_3_12

+# Version 3.12.0
+
+def PARALLEL_CLUSTER_REQUIRES_FSXZ_OUTBOUND_SG_RULES(parallel_cluster_version):
+    if parallel_cluster_version < parse_version('3.12.0'):
+        return True
+    else:
+        return False
+
 # Determine all AWS regions available on the account.
 default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1")
 ec2_client = boto3.client("ec2", region_name=default_region)
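
A hedged usage example for the new helper; it takes an already-parsed version, and the import path shown is hypothetical (it depends on how config_schema is made importable by the caller):

    from packaging.version import parse as parse_version
    from config_schema import PARALLEL_CLUSTER_REQUIRES_FSXZ_OUTBOUND_SG_RULES  # hypothetical import path

    # The extra FSxZ rules are only required before ParallelCluster 3.12.0.
    assert PARALLEL_CLUSTER_REQUIRES_FSXZ_OUTBOUND_SG_RULES(parse_version('3.11.1')) is True
    assert PARALLEL_CLUSTER_REQUIRES_FSXZ_OUTBOUND_SG_RULES(parse_version('3.12.0')) is False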

source/slurm_installer/installer.py

+28 −1 lines

@@ -378,7 +378,8 @@ def main(self):
         launch_installer = os.system(cmd) # nosec
         if cdk_cmd == "deploy":
             if int(launch_installer) == 0:
-                logger.info(f"{fg('green')}SLURM was successfully deployed!{attr('reset')}")
+                logger.info(f"{fg('green')}SLURM config was successfully deployed!{attr('reset')}")
+                self.wait_for_slurm_stack()
         elif args.cdk_cmd == "destroy":
             # Destroy stack if known
             cmd_destroy = f"cdk destroy {self.install_parameters['stack_name']} -c {' -c '.join('{}={}'.format(key, val) for (key, val) in self.install_parameters.items() if val is not None)} --require-approval never"

@@ -458,6 +459,32 @@ def get_config(self, config_file):

         return validated_config

+    def wait_for_slurm_stack(self):
+        '''
+        Wait for the Slurm stack to be created or updated.
+        '''
+        stack_name = self.config['slurm']['ClusterName']
+        cfn_client = boto3.client("cloudformation", region_name=self.config['Region'])
+
+        valid_states = ['CREATE_COMPLETE', 'UPDATE_COMPLETE']
+        invalid_states = ['ROLLBACK_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE']
+        stack_status = None
+        while stack_status not in (valid_states + invalid_states):
+            try:
+                stack_info = cfn_client.describe_stacks(StackName=stack_name)['Stacks'][0]
+            except:
+                logger.error(f"ParallelCluster stack ({stack_name}) doesn't exist. Failed to create cluster.")
+                exit(1)
+            if stack_info:
+                stack_status = stack_info['StackStatus']
+                logger.info(f"ParallelCluster stack ({stack_name}) in {stack_status} state.")
+
+        if stack_status in invalid_states:
+            logger.error(f"ParallelCluster stack ({stack_name}) deployment failed. State: {stack_status}")
+            exit(1)
+
+        logger.info(f"ParallelCluster stack {stack_name} successfully deployed.")
+
 def upload_objects(install_directory, bucket, stack_name):
     # Upload required assets to customer S3 bucket
     logger.info(f"\n====== Uploading install files to {bucket}/{stack_name} ======\n")
