initial commit (WIP)

2025-07-18 18:26:49 -06:00 · 2025-07-18 18:26:49 -06:00 · 6ad4cfb189
commit 6ad4cfb189
14 changed files with 554 additions and 0 deletions
--- a/.obsidian/app.json
+++ b/.obsidian/app.json
@ -0,0 +1,3 @@
+{
+  "alwaysUpdateLinks": true
+}
--- a/.obsidian/appearance.json
+++ b/.obsidian/appearance.json
@ -0,0 +1 @@
+{}
--- a/.obsidian/core-plugins.json
+++ b/.obsidian/core-plugins.json
@ -0,0 +1,31 @@
+{
+  "file-explorer": true,
+  "global-search": true,
+  "switcher": true,
+  "graph": true,
+  "backlink": true,
+  "canvas": true,
+  "outgoing-link": true,
+  "tag-pane": true,
+  "properties": false,
+  "page-preview": true,
+  "daily-notes": true,
+  "templates": true,
+  "note-composer": true,
+  "command-palette": true,
+  "slash-command": false,
+  "editor-status": true,
+  "bookmarks": true,
+  "markdown-importer": false,
+  "zk-prefixer": false,
+  "random-note": false,
+  "outline": true,
+  "word-count": true,
+  "slides": false,
+  "audio-recorder": false,
+  "workspaces": false,
+  "file-recovery": true,
+  "publish": false,
+  "sync": true,
+  "webviewer": false
+}
--- a/.obsidian/graph.json
+++ b/.obsidian/graph.json
@ -0,0 +1,22 @@
+{
+  "collapse-filter": true,
+  "search": "",
+  "showTags": false,
+  "showAttachments": false,
+  "hideUnresolved": false,
+  "showOrphans": true,
+  "collapse-color-groups": true,
+  "colorGroups": [],
+  "collapse-display": true,
+  "showArrow": false,
+  "textFadeMultiplier": 0,
+  "nodeSizeMultiplier": 1,
+  "lineSizeMultiplier": 1,
+  "collapse-forces": true,
+  "centerStrength": 0.518713248970312,
+  "repelStrength": 10,
+  "linkStrength": 1,
+  "linkDistance": 250,
+  "scale": 1,
+  "close": true
+}
--- a/.obsidian/workspace.json
+++ b/.obsidian/workspace.json
@ -0,0 +1,210 @@
+{
+  "main": {
+    "id": "5beee7a86b8e6d7b",
+    "type": "split",
+    "children": [
+      {
+        "id": "293ae152699ad4dc",
+        "type": "tabs",
+        "dimension": 49.30434782608696,
+        "children": [
+          {
+            "id": "e7be67cc73914413",
+            "type": "leaf",
+            "state": {
+              "type": "markdown",
+              "state": {
+                "file": "Software/Software.md",
+                "mode": "source",
+                "source": false
+              },
+              "icon": "lucide-file",
+              "title": "Software"
+            }
+          }
+        ]
+      },
+      {
+        "id": "5575fc1f3edcb038",
+        "type": "tabs",
+        "dimension": 50.69565217391304,
+        "children": [
+          {
+            "id": "2e25f7a996a520e8",
+            "type": "leaf",
+            "state": {
+              "type": "markdown",
+              "state": {
+                "file": "Troubleshooting.md",
+                "mode": "source",
+                "source": false
+              },
+              "icon": "lucide-file",
+              "title": "Troubleshooting"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "vertical"
+  },
+  "left": {
+    "id": "93324146bc93f225",
+    "type": "split",
+    "children": [
+      {
+        "id": "a792a4a8664edc88",
+        "type": "tabs",
+        "children": [
+          {
+            "id": "a9a4b24504ce51fe",
+            "type": "leaf",
+            "state": {
+              "type": "file-explorer",
+              "state": {
+                "sortOrder": "alphabetical",
+                "autoReveal": false
+              },
+              "icon": "lucide-folder-closed",
+              "title": "Files"
+            }
+          },
+          {
+            "id": "eb40eeaab710f462",
+            "type": "leaf",
+            "state": {
+              "type": "search",
+              "state": {
+                "query": "",
+                "matchingCase": false,
+                "explainSearch": false,
+                "collapseAll": false,
+                "extraContext": false,
+                "sortOrder": "alphabetical"
+              },
+              "icon": "lucide-search",
+              "title": "Search"
+            }
+          },
+          {
+            "id": "da4358ad1d89d2f9",
+            "type": "leaf",
+            "state": {
+              "type": "bookmarks",
+              "state": {},
+              "icon": "lucide-bookmark",
+              "title": "Bookmarks"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "horizontal",
+    "width": 213.5
+  },
+  "right": {
+    "id": "f3c2a2165de3d045",
+    "type": "split",
+    "children": [
+      {
+        "id": "0ee61c73668fb8d7",
+        "type": "tabs",
+        "children": [
+          {
+            "id": "b4dcef26a28cd2d7",
+            "type": "leaf",
+            "state": {
+              "type": "backlink",
+              "state": {
+                "file": "Troubleshooting.md",
+                "collapseAll": false,
+                "extraContext": false,
+                "sortOrder": "alphabetical",
+                "showSearch": false,
+                "searchQuery": "",
+                "backlinkCollapsed": false,
+                "unlinkedCollapsed": true
+              },
+              "icon": "links-coming-in",
+              "title": "Backlinks for Troubleshooting"
+            }
+          },
+          {
+            "id": "9a63ced004bf5fcf",
+            "type": "leaf",
+            "state": {
+              "type": "outgoing-link",
+              "state": {
+                "file": "State Management Database (SMD).md",
+                "linksCollapsed": false,
+                "unlinkedCollapsed": true
+              },
+              "icon": "links-going-out",
+              "title": "Outgoing links from State Management Database (SMD)"
+            }
+          },
+          {
+            "id": "82d569e4e5b2946f",
+            "type": "leaf",
+            "state": {
+              "type": "tag",
+              "state": {
+                "sortOrder": "frequency",
+                "useHierarchy": true,
+                "showSearch": false,
+                "searchQuery": ""
+              },
+              "icon": "lucide-tags",
+              "title": "Tags"
+            }
+          },
+          {
+            "id": "9b15aa5b1c5b4f39",
+            "type": "leaf",
+            "state": {
+              "type": "outline",
+              "state": {
+                "file": "State Management Database (SMD).md",
+                "followCursor": false,
+                "showSearch": false,
+                "searchQuery": ""
+              },
+              "icon": "lucide-list",
+              "title": "Outline of State Management Database (SMD)"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "horizontal",
+    "width": 300
+  },
+  "left-ribbon": {
+    "hiddenItems": {
+      "switcher:Open quick switcher": false,
+      "graph:Open graph view": false,
+      "canvas:Create new canvas": false,
+      "daily-notes:Open today's daily note": false,
+      "templates:Insert template": false,
+      "command-palette:Open command palette": false
+    }
+  },
+  "active": "2e25f7a996a520e8",
+  "lastOpenFiles": [
+    "Software/Software.md",
+    "Getting Started.md",
+    "Troubleshooting.md",
+    "Use Cases/Advanced Use Cases.md",
+    "Deployments/Deploying with Podman Quadlets.md",
+    "OpenCHAMI Wiki.md",
+    "Use Cases",
+    "Software/Magellan.md",
+    "Deployments",
+    "Software",
+    "Deployments/Deployments.md",
+    "Software/State Management Database (SMD).md",
+    "Untitled.canvas",
+    "Welcome.md",
+    "Untitled 1.canvas"
+  ]
+}
--- a/Deployments/Deploying
+++ b/Deployments/Deploying
--- a/Deployments/Deployments.md
+++ b/Deployments/Deployments.md
@ -0,0 +1,10 @@
+OpenCHAMI supports deploying its microservices in several ways. This document covers the supported deployment methods.
+## Podman Quadlets
+
+### Discovering Nodes
+
+#### Static Discovery
+#### Dynamic Discovery
+
+## Docker Compose
+
--- a/Started.md
+++ b/Started.md
@ -0,0 +1,17 @@
+OpenCHAMI provides a [tutorial](https://github.com/OpenCHAMI/tutorial-2025) to introduce new users to the project. This tutorial demonstrates how to quickly jump-start a development environment with the OpenCHAMI services using Podman quadlets and `systemd`. The main part of the tutorial is organized into 2 phases that cover the following topics:
+
+1. Preparing Head Node or Instance
+2. Installing OpenCHAMI
+3. Discovering Nodes
+4. Building Images
+5. Booting Nodes
+6. Provisioning Nodes
+
+For further exploration, refer to the [[Advanced Use Cases]] section to see how you can use OpenCHAMI, such as:
+
+1. Adding SLURM and MPI to the Compute Node
+2. Serving the Root Filesystem with NFS (import-image.sh)
+3. Enabling WireGuard Security with `cloud-init-server`
+4. Using Image Layers to Customize Boot Images with a Common Base
+5. Using `kexec` to Reboot Nodes For an Upgrade or Specific Kernel
+6. Discovering Nodes Dynamically with Redfish
--- a/Wiki.md
+++ b/Wiki.md
@ -0,0 +1,8 @@
+Welcome to the OpenCHAMI wiki! This wiki provides a unified guide and documentation for using and deploying OpenCHAMI. Here are some links to get started with the software.
+
+[Getting Started](Getting%20Started.md)
+[Software](Software.md)
+[Deployments](Deployments.md)
+[Use Cases](Advanced%20Use%20Cases.md)
+[Troubleshooting](Troubleshooting.md)
+Getting Involved
--- a/Software/Magellan.md
+++ b/Software/Magellan.md
--- a/Software/Software.md
+++ b/Software/Software.md
@ -0,0 +1,15 @@
+The OpenCHAMI project contains a collection of software built to discover, manage, and provision nodes. This section contains a brief introduction and user guide to quickly get you started with each tool or service.
+
+- **[Magellan](Magellan.md)** - Redfish-based tool for automatic node discovery and firmware management
+- **[State Management Database (SMD)](State%20Management%20Database%20(SMD).md)** - Compute Node Inventory Daemon with support for ad-hoc groups
+- **[Boot Script Service (BSS)](BootScriptService.md)** - Automatic boot script generation for diskless/diskful HPC Compute Nodes
+- **[Image Builder](ImageBuilder.md)** - Tooling for creating squashfs filesystems for remote booting HPC Nodes
+- **[Cloud-Init Server](CloudInitServer.md)** - Automatic generation of cloud-init payloads with optional machine identity and optional transport layer security through WireGuard
+- **[coresmd](CoreSMD.md)** - CoreDHCP plugin for automatically updating the DHCP configuration based on information from the rest of OpenCHAMI
+
+### Third Party Open Source
+
+- **[Step-CA](https://smallstep.com/certificates/)** - ACME certificate authority designed to be run as a microservice
+- **[HAProxy](https://www.haproxy.org/)** - Reverse proxy for allowing all microservices to be accessible through a single http(s) host
+- **[Ory Hydra](https://github.com/ory/hydra)** - OIDC provider to use site identity for authorization within OpenCHAMI
+- **[PostgreSQL](https://www.postgresql.org/)** - Database persistence for services that need it
--- a/Software/State
+++ b/Software/State
--- a/Troubleshooting.md
+++ b/Troubleshooting.md
@ -0,0 +1,77 @@
+Things don't always work out as expected when installing the services or booting nodes. Whether your issue is related to the services or to configuration, this section covers a list of issues you may run into when working with OpenCHAMI. Keep in mind that this list is continuously updated as the software changes.
+
+### Services Not Starting
+
+### Certificate and TLS Errors
+
+
+
+### Cannot Make Request to Service
+
+#### Access Token Errors
+
+When making a request, if you receive errors related to the access token, there are a few things you may want to check.
+
+1. If you're making requests using the `ochami` CLI to services like SMD, make sure that the `ACCESS_TOKEN` environment variable is set.
+2. If you're 
+
+### Cannot Discover Nodes
+
+### Nodes Are Not Booting
+
+When booting with iPXE, it is critical to make sure that you specified the correct images in the boot script. Make sure there is not a typo and that the image exists.
+
+```bash
+>>Start PXE over IPv4.
+  PXE-E18: Server response timeout.
+BdsDxe: failed to load Boot0001 "UEFI PXEv4 (MAC:525400BEEF01)" from PciRoot(0x0)/Pci(0x1,0x0)/Pci(0x0,0x0)/MAC(525400BEEF01,0x1)/IPv4(0.0.0.0,0x0,DHCP,0.0.0.0,0.0.0.0,0.0.0.0): Not Found
+
+>>Start PXE over IPv6.
+```
+
+#### Node Cannot Make Request to S3
+
+Sometimes the node may not be able to complete a request to the DHCP server to get the iPXE binary.
+### Images Are Not Pushed to S3 Bucket
+
+### Images Are Not Pushed to the OCI Registry
+
+### Errors Caused By SELinux
+
+If you see an error like the one below after using `s3cmd` to create your S3 buckets, try disabling SELinux and try again.
+
+```bash
+[rocky@devonb-tutorial-practice images]$ s3cmd setacl s3://boot-images --acl-public                                                                                                                   [3/763]
+                                                   
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    An unexpected error has occurred.
+  Please try reproducing the error using 
+  the latest s3cmd code from the git master
+  branch found at:                                 
+    https://github.com/s3tools/s3cmd
+  and have a look at the known issues list:
+    https://github.com/s3tools/s3cmd/wiki/Common-known-issues-and-their-solutions-(FAQ)
+  If the error persists, please report the
+  following lines (removing any private
+  info as necessary) to:                           
+   s3tools-bugs@lists.sourceforge.net
+
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+```
+
+If you make a `curl` request and watch the `minio-server` logs, you should see an error when the request is made. 
+
+```bash
+$ sudo podman exec -it minio-server curl localhost:9000
+curl: (56) Recv failure: Connection reset by peer
+```
+
+Then make the request to the `minio-server`.
+
+```bash
+$ curl 172.16.0.254:9000
+curl: (52) Empty reply from server
+```
+
--- a/Cases/Advanced
+++ b/Cases/Advanced
@ -0,0 +1,160 @@
+After going through the [tutorial](https://github.com/OpenCHAMI/tutorial-2025), you should be familiar and comfortable enough with OpenCHAMI to make changes to the deployment process and configuration. We're going to cover some of the more common use-cases that an OpenCHAMI user would want to pursue.
+
+At this point, we can use what we have learned so far in the OpenCHAMI tutorial to customize our nodes in various ways such as changing how we serve images, deriving new images, and updating our cloud-init config. This section explores some of the use cases that you may want to pursue to make OpenCHAMI fit your own needs.
+
+## Adding SLURM and MPI to the Compute Node
+
+After getting our nodes to boot using our compute images, let's try running a test MPI job. We need to install and configure both SLURM and MPI to do so. We can do this in at least two ways:
+
+- Create a new `compute-mpi` image similar to the `compute-debug` image using the `compute-base` image as a base. You do not have to rebuild the parent images unless you want to make changes to them, but keep in mind that you will also have to rebuild any derivative images.
+
+### Building Into the Image
+
+
+
+### Installing via Cloud-Init
+
+Alternatively, we can install the necessary SLURM and MPI packages in our cloud-init config and set up our node in the `cmds` section of the config file.
+
+Let's start by making changes to the cloud-init config file in `/opt/workdir/cloud-init/computes.yaml` that we used previously. Note that we are using pre-built RPMs to install SLURM and OpenMPI from the Rocky 9 repos.
+
+```bash
+- name: compute
+  description: "compute config"
+  file:
+    encoding: plain
+    content: |
+      ## template: jinja
+      #cloud-config
+      merge_how:
+      - name: list
+        settings: [append]
+      - name: dict
+        settings: [no_replace, recurse_list]
+      users:
+        - name: root
+          ssh_authorized_keys: {{ ds.meta_data.instance_data.v1.public_keys }}
+      disable_root: false
+      packages:
+        - slurm
+        - openmpi
+      cmds:
+        - systemctl enable slurmctld
+        - systemctl enable slurmdbd
+```
+
+We added the `packages` section to tell cloud-init to install the `slurm` and `openmpi` packages after booting the compute node.
+
+### Prepare SLURM on Head Node
+
+### Run a Sample MPI job across two VMs
+
+After we have installed both SLURM and OpenMPI on the compute node, let's try and launch a "hello world" MPI job. To do so, we will need three things:
+
+1. Source code for MPI program
+2. Compiled MPI executable binary
+3. SLURM job script
+
+We create the MPI program in C. First, create a new directory to store our source code. Then, edit the `/opt/workdir/apps/mpi/hello/hello.c` file.
+
+```bash
+mkdir -p /opt/workdir/apps/mpi/hello
+# edit /opt/workdir/apps/mpi/hello/hello.c
+```
+
+Now copy the contents below into the `hello.c` file. 
+
+```c
+/*The Parallel Hello World Program*/
+#include <stdio.h>
+#include <mpi.h>
+
+int main(int argc, char **argv)
+{
+   int node;
+   
+   MPI_Init(&argc,&argv);
+   MPI_Comm_rank(MPI_COMM_WORLD, &node);
+     
+   printf("Hello World from Node %d\n",node);
+            
+   MPI_Finalize();
+}
+```
+
+Compile the program.
+
+```bash
+cd /opt/workdir/apps/mpi/hello
+mpicc hello.c -o hello
+```
+
+You should have a `hello` executable in the `/opt/workdir/apps/mpi/hello` directory now. We can use this binary executable with SLURM to launch processes in parallel.
+
+Let's create a job script to launch the executable we just created. Create a new directory to hold our SLURM job script. Then, edit a new file called `launch-hello.sh` in the new `/opt/workdir/jobscripts` directory.
+
+```bash
+mkdir -p /opt/workdir/jobscripts
+cd /opt/workdir/jobscripts
+# edit launch-hello.sh
+```
+
+Copy the contents below into the `launch-hello.sh` job script.
+
+> [!NOTE]
+> The contents of your job script may vary significantly depending on your cluster. Refer to the documentation for your institution and adjust the script according to your needs.
+
+```bash
+#!/bin/bash 
+
+#SBATCH --job-name=hello
+#SBATCH --account=account_name
+#SBATCH --partition=partition_name 
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:00:30
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+/opt/workdir/apps/mpi/hello/hello
+```
+
+We should now have everything we need to test our MPI job with our compute node(s). Launch the job with the `sbatch` command.
+
+```bash
+sbatch /opt/workdir/jobscripts/launch-hello.sh
+```
+
+We can confirm the job is running with the `squeue` command.
+
+```bash
+squeue
+```
+
+You should see a list with a job named `hello` that was given in the `launch-hello.sh` job script.
+
+```bash
+# TODO: add output of squeue above
+```
+
+If you saw the output above, you should now be able to inspect the output of the job when it completes.
+
+```bash
+# TODO: add output of MPI job (should be something like hello.o and/or hello.e)
+```
+
+And that's it! You have successfully launched an MPI job with SLURM from an OpenCHAMI deployed system.
+## Serving the Root Filesystem with NFS (import-image.sh)
+
+For this tutorial, we served images via HTTP using a local S3 bucket (MinIO) and OCI registry. We could instead serve our images using NFS by setting up and running an NFS server on the head node, including NFS tools in our base image, and configuring our nodes to work with NFS.
+
+## Enable WireGuard Security for the `cloud-init-server`
+
+## Using Image Layers to Customize Boot Images with a Common Base
+
+Often, we want to allocate nodes for different purposes using different images. Let's use the base image that we created before and create another Kubernetes layer called `kubernetes-worker` based on the `base` image we created before. We would need to modify the boot script to use this new Kubernetes image and update cloud-init to set up the nodes.
+
+## Using `kexec` to Reboot Nodes For an Upgrade or Specific Kernel
+
+
+## Discovering Nodes Dynamically with Redfish
+
+In this tutorial, we used static discovery to populate our inventory in SMD instead of dynamically discovering nodes on our network. Static discovery works well when we know the MAC address, IP address, xname, and NID of our nodes beforehand, and it guarantees deterministic behavior. However, if we don't know these properties or if we want to update our inventory state, we can use `magellan` to scan, collect, and populate SMD with these properties.