Skip to content

Commit

Permalink
Merge pull request 2i2c-org#5289 from sgibson91/ci-cd/separate-suppor…
Browse files Browse the repository at this point in the history
…t-staging

Refactor CI/CD so support and staging jobs are separate, and multiple staging hubs are detected
  • Loading branch information
sgibson91 authored Dec 19, 2024
2 parents c9ba7b0 + 60f442a commit a43e8df
Show file tree
Hide file tree
Showing 8 changed files with 647 additions and 561 deletions.
386 changes: 285 additions & 101 deletions .github/workflows/deploy-hubs.yaml

Large diffs are not rendered by default.

313 changes: 108 additions & 205 deletions deployer/commands/generate/helm_upgrade/decision.py

Large diffs are not rendered by default.

58 changes: 24 additions & 34 deletions deployer/commands/generate/helm_upgrade/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@
from .decision import (
assign_staging_jobs_for_missing_clusters,
discover_modified_common_files,
ensure_support_staging_jobs_have_correct_keys,
generate_hub_matrix_jobs,
generate_support_matrix_jobs,
move_staging_hubs_to_staging_matrix,
pretty_print_matrix_jobs,
)

Expand Down Expand Up @@ -59,8 +57,9 @@ def helm_upgrade_jobs(
cluster_files = get_all_cluster_yaml_files()

# Empty lists to store job definitions in
support_matrix_jobs = []
staging_hub_matrix_jobs = []
prod_hub_matrix_jobs = []
support_and_staging_matrix_jobs = []

for cluster_file in cluster_files:
# Read in the cluster.yaml file
Expand Down Expand Up @@ -92,20 +91,20 @@ def helm_upgrade_jobs(
upgrade_support_on_this_cluster = False

# Generate a job matrix of all hubs that need upgrading on this cluster
prod_hub_matrix_jobs.extend(
generate_hub_matrix_jobs(
cluster_file,
cluster_config,
cluster_info,
set(changed_filepaths),
pr_labels,
upgrade_all_hubs_on_this_cluster=upgrade_all_hubs_on_this_cluster,
upgrade_all_hubs_on_all_clusters=upgrade_all_hubs_on_all_clusters,
)
staging_hubs, prod_hubs = generate_hub_matrix_jobs(
cluster_file,
cluster_config,
cluster_info,
set(changed_filepaths),
pr_labels,
upgrade_all_hubs_on_this_cluster=upgrade_all_hubs_on_this_cluster,
upgrade_all_hubs_on_all_clusters=upgrade_all_hubs_on_all_clusters,
)
staging_hub_matrix_jobs.extend(staging_hubs)
prod_hub_matrix_jobs.extend(prod_hubs)

# Generate a job matrix for support chart upgrades
support_and_staging_matrix_jobs.extend(
support_matrix_jobs.extend(
generate_support_matrix_jobs(
cluster_file,
cluster_config,
Expand All @@ -118,21 +117,13 @@ def helm_upgrade_jobs(
)

# Clean up the matrix jobs
(
prod_hub_matrix_jobs,
support_and_staging_matrix_jobs,
) = move_staging_hubs_to_staging_matrix(
prod_hub_matrix_jobs, support_and_staging_matrix_jobs
)
support_and_staging_matrix_jobs = ensure_support_staging_jobs_have_correct_keys(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
)
support_and_staging_matrix_jobs = assign_staging_jobs_for_missing_clusters(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
staging_hub_matrix_jobs = assign_staging_jobs_for_missing_clusters(
staging_hub_matrix_jobs, prod_hub_matrix_jobs
)

# Pretty print the jobs using rich
pretty_print_matrix_jobs(prod_hub_matrix_jobs, support_and_staging_matrix_jobs)
pretty_print_matrix_jobs(
support_matrix_jobs, staging_hub_matrix_jobs, prod_hub_matrix_jobs
)

# The existence of the CI environment variable is an indication that we are running
# in an GitHub Actions workflow
Expand All @@ -145,15 +136,14 @@ def helm_upgrade_jobs(
if ci_env:
# Add these matrix jobs as output variables for use in another job
with open(output_file, "a") as f:
f.write(f"prod-hub-matrix-jobs={json.dumps(prod_hub_matrix_jobs)}\n")
f.write(
f"support-and-staging-matrix-jobs={json.dumps(support_and_staging_matrix_jobs)}\n"
)
f.write(f"support-jobs={json.dumps(support_matrix_jobs)}\n")
f.write(f"staging-jobs={json.dumps(staging_hub_matrix_jobs)}\n")
f.write(f"prod-jobs={json.dumps(prod_hub_matrix_jobs)}\n")

# Don't bother generating a comment if both of the matrices are empty
if support_and_staging_matrix_jobs or prod_hub_matrix_jobs:
# Don't bother generating a comment if all of the matrices are empty
if support_matrix_jobs or staging_hub_matrix_jobs or prod_hub_matrix_jobs:
# Generate Markdown tables from the job matrices and write them to a file
# for use in another job
create_markdown_comment(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
support_matrix_jobs, staging_hub_matrix_jobs, prod_hub_matrix_jobs
)
79 changes: 43 additions & 36 deletions deployer/utils/rendering.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,72 +36,75 @@ def print_colour(msg: str, colour="green"):
print(msg)


def create_markdown_comment(support_staging_matrix, prod_matrix):
def create_markdown_comment(support_matrix, staging_matrix, prod_matrix):
"""Convert a list of dictionaries into a Markdown formatted table for posting to
GitHub as comments. This function will write the Markdown content to a file to allow
a GitHub Actions to upload it as an artifact and reuse the content in another
workflow.
Args:
support_staging_matrix (list[dict]): The support of staging jobs to be converted
into a Markdown formatted table
support_matrix (list[dict]): The support jobs to be converted into a Markdown
formatted table
staging_matrix (list[dict]): The staging jobs to be converted into a Markdown
formatted table
prod_matrix (list[dict]): The production jobs to be converted into a Markdown
formatted table
"""
# A dictionary to convert column names
column_converter = {
"cluster_name": "Cluster Name",
"provider": "Cloud Provider",
"upgrade_support": "Upgrade Support?",
"reason_for_support_redeploy": "Reason for Support Redeploy",
"upgrade_staging": "Upgrade Staging?",
"reason_for_staging_redeploy": "Reason for Staging Redeploy",
"hub_name": "Hub Name",
"reason_for_redeploy": "Reason for Redeploy",
}

# A dictionary to convert row values when they are Boolean
boolean_converter = {
True: "Yes",
False: "No",
}

# === To reliably convert a list of dictionaries into a Markdown table, the keys
# === must be consistent across each dictionary in the list as they will become the
# === columns of the table. Moreover, we want the columns to be in 'sensible' order
# === when a human reads this table; therefore, we reformat the inputted jobs.

# Only execute if support_staging_matrix is not an empty list
if support_staging_matrix:
# Format the Support and Staging matrix jobs
formatted_support_staging_matrix = []
for entry in support_staging_matrix:
# Only execute if support_matrix is not an empty list
if support_matrix:
# Format the Support matrix jobs
formatted_support_matrix = []
for entry in support_matrix:
formatted_entry = {
column_converter["provider"]: entry["provider"],
column_converter["cluster_name"]: entry["cluster_name"],
column_converter["reason_for_redeploy"]: entry["reason_for_redeploy"],
}
formatted_support_matrix.append(formatted_entry)

# Generate a Markdown table
support_md_table = (
markdown_table(formatted_support_matrix)
.set_params(row_sep="markdown", quote=False)
.get_markdown()
)
else:
support_md_table = []

# Only execute if staging_matrix is not an empty list
if staging_matrix:
# Format the Staging Hubs matrix jobs
formatted_staging_matrix = []
for entry in staging_matrix:
formatted_entry = {
column_converter["provider"]: entry["provider"],
column_converter["cluster_name"]: entry["cluster_name"],
column_converter["upgrade_support"]: boolean_converter[
entry["upgrade_support"]
],
column_converter["reason_for_support_redeploy"]: entry[
"reason_for_support_redeploy"
],
column_converter["upgrade_staging"]: boolean_converter[
entry["upgrade_staging"]
],
column_converter["reason_for_staging_redeploy"]: entry[
"reason_for_staging_redeploy"
],
column_converter["hub_name"]: entry["hub_name"],
column_converter["reason_for_redeploy"]: entry["reason_for_redeploy"],
}
formatted_support_staging_matrix.append(formatted_entry)
formatted_staging_matrix.append(formatted_entry)

# Generate a Markdown table
support_staging_md_table = (
markdown_table(formatted_support_staging_matrix)
staging_md_table = (
markdown_table(formatted_staging_matrix)
.set_params(row_sep="markdown", quote=False)
.get_markdown()
)
else:
support_staging_md_table = []
staging_md_table = []

# Only execute if prod_matrix is not an empty list
if prod_matrix:
Expand Down Expand Up @@ -129,9 +132,13 @@ def create_markdown_comment(support_staging_matrix, prod_matrix):
comment_body = f"""<!-- deployment-plan -->
Merging this PR will trigger the following deployment actions.
### Support and Staging deployments
### Support deployments
{support_md_table if bool(support_md_table) else 'No support upgrades will be triggered'}
### Staging deployments
{support_staging_md_table if bool(support_staging_md_table) else 'No support or staging upgrades will be triggered'}
{staging_md_table if bool(staging_md_table) else 'No staging hub upgrades will be triggered'}
### Production deployments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,13 @@ All of the following steps must be followed in order to consider phase 3.1 compl
If Dask gateway will be needed, then choose a `basehub`, and follow the guide on
[how to enable dask-gateway on an existing hub](howto:features:daskhub).
1. **Add the new cluster to CI/CD**
1. **Add the new cluster and staging hub to CI/CD**
```{important}
This step is only applicable if the hub is the first hub being deployed to a cluster.
This step is only applicable if the hub is the first hub being deployed to a cluster **or** has `staging` in its name.
```

To ensure the new cluster and its hubs are appropriately handled by our CI/CD system, please add it as an entry in the following places:
To ensure the new cluster and its hubs are appropriately handled by our CI/CD system, please add it as an entry in the following places in the [`deploy-hubs.yaml`](https://github.com/2i2c-org/infrastructure/blob/HEAD/.github/workflows/deploy-hubs.yaml) GitHub Actions workflow file:

- The [`deploy-hubs.yaml`](https://github.com/2i2c-org/infrastructure/blob/008ae2c1deb3f5b97d0c334ed124fa090df1f0c6/.github/workflows/deploy-hubs.yaml#L121) GitHub workflow has a job named [`upgrade-support-and-staging`](https://github.com/2i2c-org/infrastructure/blob/18f5a4f8f39ed98c2f5c99091ae9f19a1075c988/.github/workflows/deploy-hubs.yaml#L128-L166) that needs a list of the clusters being automatically deployed by our CI/CD system. Add an entry for the new cluster here.

Expand Down
51 changes: 19 additions & 32 deletions docs/reference/ci-cd/hub-deploy.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,62 +7,50 @@ You can learn more about this workflow in our blog post [Multiple JupyterHubs, m

The best place to learn about the latest state of our *automatic* hub deployment
is to look at [the `deploy-hubs.yaml` GitHub Actions workflow file](https://github.com/2i2c-org/infrastructure/tree/HEAD/.github/workflows/deploy-hubs.yaml).
This workflow file depends on a locally defined action that [sets up access to a given cluster](https://github.com/2i2c-org/infrastructure/blob/main/.github/actions/setup-deploy/action.yaml) and itself contains four main jobs, detailed below.
This workflow file depends on a locally defined action that [sets up access to a given cluster](https://github.com/2i2c-org/infrastructure/blob/main/.github/actions/setup-deploy/action.yaml) and itself contains a range of jobs, the most relevant ones of which are detailed below.
There are also some filtering/optimisation jobs which are not discussed here.

## Main hub deployment workflow

(cicd/hub/generate-jobs)=
### 1. `generate-jobs`: Generate Helm upgrade jobs

The first job takes a list of files that have been added/modified as part of a Pull Request and pipes them into the [`generate-helm-upgrade-jobs` sub-command](https://github.com/2i2c-org/infrastructure/blob/main/deployer/helm_upgrade_decision.py) of the [deployer module](https://github.com/2i2c-org/infrastructure/tree/main/deployer).
This sub-command uses a set of functions to calculate which hubs on which clusters require a helm upgrade, alongside whether the support chart and staging hub on that cluster should also be upgraded.
If any production hubs require an upgrade, the upgrade of the staging hub is a requirement.
This sub-command uses a set of functions to calculate which hubs on which clusters require a helm upgrade, alongside whether the support chart and staging hub(s) on that cluster should also be upgraded.
If any production hubs require an upgrade, the upgrade of the staging hub(s) is a requirement.

This job provides the following outputs:

- Two JSON objects that can be read by later GitHub Actions jobs to define matrix jobs.
These JSON objects detail: which clusters require their support chart and/or staging hub to be upgraded, and which production hubs require an upgrade.
- Three JSON objects that can be read by later GitHub Actions jobs to define matrix jobs.
These JSON objects detail: which clusters require their support chart to be upgraded, which staging hub(s) require an upgrade, and which production hubs require an upgrade.
- The above JSON objects are also rendered as human-readable tables using [`rich`](https://github.com/Textualize/rich).

````{admonition} Some special cased filepaths
```{admonition} Some special cased filepaths
While the aim of this workflow is to only upgrade the pieces of the infrastructure that require it with every change, some changes do require us to redeploy everything.
- If a cluster's `cluster.yaml` file has been modified, we upgrade the support chart and **all** hubs on **that** cluster. This is because we cannot tell what has been changed without inspecting the diff of the file.
- If any of the `basehub` or `daskhub` Helm charts have additions/modifications in their paths, we redeploy **all** hubs across **all** clusters.
- If the support Helm chart has additions/modifications in its path, we redeploy the support chart on **all** clusters.
- If the deployer module has additions/modifications in its path, then we redeploy **all** hubs on **all** clusters.
```{attention}
Right now, we redeploy everything when the deployer changes since the deployer undertakes some tasks that generates config related to authentication.
This may change in the future as we move towards the deployer becoming a separable, stand-alone package.
- If the `support` Helm chart has additions/modifications in its path, we redeploy the support chart on **all** clusters.
- If the `deployer` module has additions/modifications in its path, then we redeploy **all** hubs on **all** clusters.
```
````

### 2. `upgrade-support-and-staging`: Upgrade support and staging hub Helm charts on clusters that require it
### 2. `upgrade-support`: Upgrade support Helm chart on clusters that require it

The next job reads in one of the JSON objects detailed above that defines which clusters need their support chart and/or staging hub upgrading.
*Note that it is not a requirement for both the support chart and staging hub to be upgraded during this job.*
The next job reads in one of the JSON objects detailed above that defines which clusters need their support chart upgrading.
A matrix job is set up that parallelises over all the clusters defined in the JSON object.
For each cluster, the support chart is first upgraded (if required) followed by the staging hub (if required).
For each cluster, the support chart is upgraded (if required).
We set an output variable from this job to determine if any support chart upgrades fail for a cluster.
We then use these outputs to filter out the failed clusters and prevent further deployments to them, without impairing deployments to unrelated clusters.

```{note}
The 2i2c cluster is a special case here as it has three staging hubs: one running the `basehub` Helm chart and another running the `daskhub` Helm chart.
We therefore run extra steps for the 2i2c cluster to upgrade these hubs (if required).
```
### 3. `upgrade-staging`: Upgrade Helm chart for staging hub(s) in parallel

Next we deploy the staging hub(s) on a cluster.
We use staging hubs as [canary deployments](https://sre.google/workbook/canarying-releases/) and prevent deploying production hubs if a staging deployment fails.
Hence, the last step of this job is to set an output variable that stores if the job completed successfully or failed.

### 3. `filter-generate-jobs`: Filter out jobs for clusters whose support/staging job failed
Similarly to `upgrade-support`, the last step of this job is to set an output variable that stores if the job completed successfully or failed.

This job is an optimisation job.
While we do want to prevent all production hubs on Cluster X from being upgraded if its support/staging job fails, we **don't** want to prevent the production hubs on Cluster Y from being upgraded because the support/staging job for Cluster X failed.
### 4. `upgrade-prod`: Upgrade Helm chart for production hubs in parallel

This job reads in the production hub job definitions generated in job 1 and the support/staging success/failure variables set in job 2, then proceeds to filter out the production hub upgrade jobs that were due to be run on a cluster whose support/staging job failed.

### 4. `upgrade-prod-hubs`: Upgrade Helm chart for production hubs in parallel

This last job deploys all production hubs that require it in parallel to the clusters that successfully completed job 2.
This last job deploys all production hubs that require it in parallel to the clusters that successfully completed a staging upgrade.

(cicd/hub/pr-comment)=
## Posting the deployment plan as a comment on a Pull Request
Expand All @@ -82,7 +70,6 @@ This workflow downloads the artifacts uploaded by `generate-jobs` and then uses
- Either update an existing comment or create a new comment on the PR posting the Markdown tables downloaded as an artifact.

```{admonition} Why we're using artifacts and separate workflow files
Any secrets used by GitHub Actions are not available to Pull Requests that come from forks by default to protect against malicious code being executed with privileged access. `generate-jobs` needs to run in the PR context in order to establish which files are added/modified, but the required secrets would not be available for the rest of the workflow that would post a comment to the PR.
To overcome this in a secure manner, we upload the required information (the body of the comment to be posted and the number of the PR the comment should be posted to) as artifacts.
Expand Down
15 changes: 15 additions & 0 deletions tests/test-clusters/cluster3/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Test fixture cluster definition.
# NOTE(review): lives under tests/test-clusters/ — presumably exercises the
# CI/CD job-generation logic's handling of a cluster with MORE THAN ONE
# staging hub (per this PR's "multiple staging hubs are detected" goal);
# confirm against the decision.py tests that consume it.
name: cluster3
provider: gcp
support:
  helm_chart_values_files:
    - support.values.yaml
hubs:
  # Two staging hubs on one cluster — the multiple-staging-hub case.
  - name: staging1
    helm_chart_values_files:
      - staging1.values.yaml
  - name: staging2
    helm_chart_values_files:
      - staging2.values.yaml
  # A production hub alongside the staging hubs.
  - name: prod
    helm_chart_values_files:
      - prod.values.yaml
Loading

0 comments on commit a43e8df

Please sign in to comment.