chore(e2e): create multiple VMs in one go through the CLI #2181

Merged
3 commits merged on May 30, 2025

Changes from all commits
126 changes: 44 additions & 82 deletions e2e/cluster/cmx/cluster.go
@@ -12,8 +12,6 @@ import (
 	"sync"
 	"testing"
 	"time"
-
-	"github.com/google/uuid"
 )
 
 type ClusterInput struct {
@@ -35,77 +33,41 @@ type Cluster struct {
 }
 
 type Node struct {
-	ID   string `json:"id"`
-	Name string `json:"name"`
+	ID        string `json:"id"`
+	Name      string `json:"name"`
+	NetworkID string `json:"network_id"`
 
 	privateIP       string `json:"-"`
 	sshEndpoint     string `json:"-"`
 	adminConsoleURL string `json:"-"`
 }
 
 type Network struct {
-	ID   string `json:"id"`
-	Name string `json:"name"`
+	ID string `json:"id"`
 }
 
 func NewCluster(in *ClusterInput) *Cluster {
 	c := &Cluster{t: in.T, supportBundleNodeIndex: in.SupportBundleNodeIndex}
 	c.t.Cleanup(c.Destroy)
 
-	c.Nodes = make([]Node, in.Nodes)
-
-	network, err := NewNetwork(in)
+	nodes, err := NewNodes(in)
 	if err != nil {
-		in.T.Fatalf("failed to create network: %v", err)
-	}
-	c.network = network
-
-	for i := range c.Nodes {
-		node, err := NewNode(in, i, network.ID)
-		if node != nil {
-			c.Nodes[i] = *node
-		}
-		if err != nil {
-			in.T.Fatalf("create node %d: %v", i, err)
-		}
-		in.T.Logf("node%d created with ID %s", i, node.ID)
+		in.T.Fatalf("failed to create nodes: %v", err)
 	}
+	in.T.Logf("cluster created with network ID %s", nodes[0].NetworkID)
+	c.Nodes = nodes
+	c.network = &Network{ID: nodes[0].NetworkID}
 
 	return c
 }
 
-func NewNetwork(in *ClusterInput) (*Network, error) {
-	name := fmt.Sprintf("ec-e2e-%s", uuid.New().String())
-	in.T.Logf("creating network %s", name)
-
-	output, err := exec.Command("replicated", "network", "create", "--name", name, "--wait", "5m", "-ojson").Output() // stderr can break json parsing
-	if err != nil {
-		if exitErr, ok := err.(*exec.ExitError); ok {
-			return nil, fmt.Errorf("create network %s: %w: stderr: %s: stdout: %s", name, err, string(exitErr.Stderr), string(output))
-		}
-		return nil, fmt.Errorf("create network %s: %w: stdout: %s", name, err, string(output))
-	}
-
-	var networks []Network
-	if err := json.Unmarshal(output, &networks); err != nil {
-		return nil, fmt.Errorf("parse networks output: %v: %s", err, string(output))
-	}
-	if len(networks) != 1 {
-		return nil, fmt.Errorf("expected 1 network, got %d", len(networks))
-	}
-	network := &networks[0]
-	in.T.Logf("Network created with ID %s", network.ID)
-	return network, nil
-}
-
-func NewNode(in *ClusterInput, index int, networkID string) (node *Node, err error) {
-	nodeName := fmt.Sprintf("node%d", index)
-	in.T.Logf("creating node %s", nodeName)
+func NewNodes(in *ClusterInput) ([]Node, error) {
+	in.T.Logf("creating %s nodes", strconv.Itoa(in.Nodes))
 
 	args := []string{
 		"vm", "create",
-		"--name", nodeName,
-		"--network", networkID,
+		"--name", "ec-test-suite",
Member Author:
Since we're creating multiple nodes now, they're all going to have the same name, so I've replaced the name with the ID in most of the logs here and also added a couple of initial log lines after cluster creation that map the index in the Nodes slice to the node ID and private IP. This is how it looks:

   cluster.go:64: creating 3 nodes
    cluster.go:136: node 0 created with ID b8309fa6 and private IP 10.0.0.247
    cluster.go:136: node 1 created with ID 1a08d35f and private IP 10.0.0.192
    cluster.go:136: node 2 created with ID 5054f8b6 and private IP 10.0.0.229
    cluster.go:56: cluster created with network ID 624bf1704b41342375aa887fcce206124bfbd01da194720556d06976ec9f435f
    restore_test.go:649: 2025-05-22T12:46:48+01:00: deploying minio on node 0
    restore_test.go:655: 2025-05-22T12:49:16+01:00: downloading airgap files
    airgap.go:92: downloaded airgap bundle on node 0 to /assets/ec-release.tgz (2.0 GB) in 1m17.617605375s
    airgap.go:92: downloaded airgap bundle on node 0 to /assets/ec-release-upgrade.tgz (2.1 GB) in 1m21.8185815s
    restore_test.go:668: 2025-05-22T12:50:37+01:00: installing expect package on node 0
    restore_test.go:673: 2025-05-22T12:50:42+01:00: installing expect package on node 2
    restore_test.go:678: 2025-05-22T12:50:46+01:00: airgapping cluster
    cluster.go:236: node 0 is airgapped successfully
    cluster.go:236: node 1 is airgapped successfully
    cluster.go:236: node 2 is airgapped successfully

Member:
Is it OK that all workflow runs will have the same name?

Member Author:
I would imagine it's fine since we do all the operations based on node IDs. It's also not that different from having multiple runs using `node0`, for example.

"--count", strconv.Itoa(in.Nodes),
"--wait", "5m",
"-ojson",
}
@@ -128,54 +90,54 @@ func NewNode(in *ClusterInput, index int, networkID string) (node *Node, err error) {
 	output, err := exec.Command("replicated", args...).Output() // stderr can break json parsing
 	if err != nil {
 		if exitErr, ok := err.(*exec.ExitError); ok {
-			return nil, fmt.Errorf("create node %s: %w: stderr: %s: stdout: %s", nodeName, err, string(exitErr.Stderr), string(output))
+			return nil, fmt.Errorf("create nodes: %w: stderr: %s: stdout: %s", err, string(exitErr.Stderr), string(output))
 		}
-		return nil, fmt.Errorf("create node %s: %w: stdout: %s", nodeName, err, string(output))
+		return nil, fmt.Errorf("create nodes: %w: stdout: %s", err, string(output))
 	}
 
 	var nodes []Node
 	if err := json.Unmarshal(output, &nodes); err != nil {
 		return nil, fmt.Errorf("unmarshal node: %v: %s", err, string(output))
 	}
-	if len(nodes) != 1 {
-		return nil, fmt.Errorf("expected 1 node, got %d", len(nodes))
-	}
-	node = &nodes[0]
 
 	// TODO (@salah): remove this once the bug is fixed in CMX
 	// note: the vm gets marked as ready before the services are actually running
 	time.Sleep(30 * time.Second)
 
-	sshEndpoint, err := getSSHEndpoint(node.ID)
-	if err != nil {
-		return node, fmt.Errorf("get ssh endpoint for node %s: %v", nodeName, err)
-	}
-	node.sshEndpoint = sshEndpoint
+	for i := range nodes {
+		sshEndpoint, err := getSSHEndpoint(nodes[i].ID)
+		if err != nil {
+			return nil, fmt.Errorf("get ssh endpoint for node %s: %v", nodes[i].ID, err)
+		}
+		nodes[i].sshEndpoint = sshEndpoint
 
-	privateIP, err := discoverPrivateIP(*node)
-	if err != nil {
-		return node, fmt.Errorf("discover node private IP: %v", err)
-	}
-	node.privateIP = privateIP
+		privateIP, err := discoverPrivateIP(nodes[i])
+		if err != nil {
+			return nil, fmt.Errorf("discover node private IP: %v", err)
+		}
+		nodes[i].privateIP = privateIP
 
-	if err := ensureAssetsDir(*node); err != nil {
-		return node, fmt.Errorf("ensure assets dir on node %s: %v", node.Name, err)
-	}
+		if err := ensureAssetsDir(nodes[i]); err != nil {
+			return nil, fmt.Errorf("ensure assets dir on node %s: %v", nodes[i].ID, err)
+		}
 
-	if err := copyScriptsToNode(*node); err != nil {
-		return node, fmt.Errorf("copy scripts to node %s: %v", node.Name, err)
-	}
+		if err := copyScriptsToNode(nodes[i]); err != nil {
+			return nil, fmt.Errorf("copy scripts to node %s: %v", nodes[i].ID, err)
+		}
 
-	if index == 0 {
-		in.T.Logf("exposing port 30003 on node %s", node.Name)
-		hostname, err := exposePort(*node, "30003")
-		if err != nil {
-			return node, fmt.Errorf("expose port: %v", err)
+		if i == 0 {
+			in.T.Logf("exposing port 30003 on node %s", nodes[i].ID)
+			hostname, err := exposePort(nodes[i], "30003")
+			if err != nil {
+				return nil, fmt.Errorf("expose port: %v", err)
+			}
+			nodes[i].adminConsoleURL = fmt.Sprintf("http://%s", hostname)
 		}
-		node.adminConsoleURL = fmt.Sprintf("http://%s", hostname)
+
+		in.T.Logf("node %d created with ID %s and private IP %s", i, nodes[i].ID, nodes[i].privateIP)
 	}
 
-	return node, nil
+	return nodes, nil
 }
 
 func discoverPrivateIP(node Node) (string, error) {
@@ -337,14 +299,14 @@ func (c *Cluster) Destroy() {
 func (c *Cluster) removeNode(node Node) {
 	output, err := exec.Command("replicated", "vm", "rm", node.ID).CombinedOutput()
 	if err != nil {
-		c.t.Logf("failed to destroy node %s: %v: %s", node.Name, err, string(output))
+		c.t.Logf("failed to destroy node %s: %v: %s", node.ID, err, string(output))
 	}
 }
 
 func (c *Cluster) removeNetwork(network Network) {
 	output, err := exec.Command("replicated", "network", "rm", network.ID).CombinedOutput()
 	if err != nil {
-		c.t.Logf("failed to destroy network %s: %v: %s", network.Name, err, string(output))
+		c.t.Logf("failed to destroy network %s: %v: %s", network.ID, err, string(output))
 	}
 }

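For reference, with in.Nodes set to 3 the new NewNodes issues a single CLI call instead of one per node. The invocation assembled from the args slice above would look roughly like this (any flags appended to args in the elided portion of the function are omitted here):

    replicated vm create --name ec-test-suite --count 3 --wait 5m -ojson

The -ojson output is unmarshalled into a []Node slice, which is also why the old single-node length check from NewNode is gone.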