6 Commits

26 changed files (lines changed in parentheses):

  1. docs/config.nix (57)
  2. docs/content/first_steps.md (19)
  3. docs/content/index.md (1)
  4. docs/content/internal/deployment.md (79)
  5. docs/default.nix (18)
  6. docs/mkdocs.yaml (37)
  7. docs/module.nix (19)
  8. docs/result (1)
  9. flake.lock (84)
  10. flake.nix (7)
  11. machines/manager/default.nix (3)
  12. machines/manager/dhcp.nix (97)
  13. machines/manager/docs.nix (20)
  14. machines/manager/mail.nix (20)
  15. machines/manager/mpi.nix (2)
  16. machines/manager/netinstall/default.nix (36)
  17. machines/manager/network.nix (36)
  18. machines/manager/secrets.yaml (6)
  19. machines/node/default.nix (11)
  20. machines/node/network.nix (19)
  21. machines/nodes.nix (30)
  22. modules/default.nix (3)
  23. modules/dhcp.nix (29)
  24. modules/netinstall.nix (20)
  25. modules/node.nix (30)
  26. shared/slurm.nix (6)

docs/config.nix (57 lines changed)

@@ -0,0 +1,57 @@
{ config, ... }:
{
site_name = "HPC @ HS-Fulda";
site_description = ''
User documentation for high performance cluster on University of Applied Sciences Fulda
'';
site_url = "http://${config.networking.domain}/";
use_directory_urls = false;
strict = true;
repo_url = "https://gogs.informatik.hs-fulda.de/hpc/nixcfg.git";
docs_dir = ./content;
theme = {
name = "readthedocs";
locale = "de";
prev_next_buttons_location = "none";
highlightjs = true;
hljs_languages = [
"bash"
"yaml"
"rust"
];
};
markdown_extensions = [
"extra"
"admonition"
];
plugins = [
"search"
];
extra = {
"manager"."host" = config.networking.domain;
};
nav = [
{ "Start" = "index.md"; }
{ "Erste Schritte" = "first_steps.md"; }
{ "Nutzung" = "usage.md"; }
{ "Software" = "environment.md"; }
{ "Daten" = "storage.md"; }
{ "Best Practices" = "best_practice.md"; }
{ "Hilfe" = "support.md"; }
{
"Internes" = [
{ "Deployment" = "internal/deployment.md"; }
{ "Netzwerk" = "internal/network.md"; }
];
}
];
}

docs/content/first_steps.md (19 lines changed)

@@ -6,7 +6,7 @@ Welcome to the "First Steps" section of the user documentation for the Lehr-HPC
Before you can work with the Lehr-HPC cluster, you must request access. Here are the steps to obtain access:
1. Visit the website of the Lehr-HPC cluster: https://lehr-hpc-cluster-university.edu.
1. Visit the website of the Lehr-HPC cluster: {{ config.site_url }}.
1. Look for the section "Request access" or similar.
1. Fill out the access form, which asks for your contact details, the reason for access, and possibly other information.
1. Submit the request and wait for confirmation of your access authorization. This can take some time.
@@ -16,11 +16,11 @@ Before you can work with the Lehr-HPC cluster, you must request access
Once your access has been approved, you can log in to the Lehr-HPC cluster:
1. Open a terminal on your computer.
1. Use the ssh command together with your assigned user ID and the cluster address to log in:
1. Use the ssh command together with your fd number and the cluster address to log in:
```bash
ssh benutzername@lehr-hpc-cluster-university.edu
ssh fd-nummer@{{ config.extra.manager.host }}
```
Enter your password when prompted.
1. Enter your password when prompted.
## Getting to know basic terms
To work successfully with the cluster, you should understand a few basic HPC concepts:
@@ -49,7 +49,7 @@ This script runs a simple command and then waits for 60 seconds.
* Upload the file to the cluster, for example with the scp command:
```bash
scp testjob.sh benutzername@lehr-hpc-cluster-university.edu:~/testjob.sh
scp testjob.sh fd-nummer@{{ config.extra.manager.host }}:~/testjob.sh
```
* Connect to the cluster again and submit the test job:
@@ -57,9 +57,12 @@ scp testjob.sh benutzername@lehr-hpc-cluster-university.edu:~/testjob.sh
sbatch testjob.sh
```
* Monitor the status of the job with the command `squeue -u benutzername` and watch it run.
* Monitor the status of the job with the command `squeue -u $USER` and watch it run (a short example follows at the end of this section).
Congratulations! You have successfully submitted your first test job on the Lehr-HPC cluster.
Next steps
You have now successfully taken your first steps on the Lehr-HPC cluster! Next, you can familiarize yourself with more advanced topics such as submitting more complex jobs and using specific software. Take a look at the other sections of this documentation to deepen your knowledge.
## Next steps
You have now successfully taken your first steps on the Lehr-HPC cluster!
Next, you can familiarize yourself with more advanced topics such as submitting more complex jobs and using specific software.
Take a look at the other sections of this documentation to deepen your knowledge.
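A minimal sketch of what this monitoring could look like (the job ID `42` is only a placeholder for the ID printed by `sbatch`; `scontrol` is part of the Slurm installation on the cluster):
```bash
# List all of your own pending and running jobs
squeue -u $USER

# Show the details of a single job, here the placeholder job ID 42
scontrol show job 42
```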

docs/content/index.md (1 line changed)

@@ -4,7 +4,6 @@ Welcome to the user documentation of the High-Performance Computing (HPC)
Welcome to the official user documentation of our High-Performance Computing (HPC) cluster.
This documentation is intended to help you exploit the full potential of our cluster in order to carry out complex computations and scientific simulations efficiently.
## What is an HPC cluster?
An HPC cluster is a powerful network of interconnected computers that can jointly handle complex computations and compute-intensive tasks.
Our cluster was built to teach how to work with an HPC cluster and to give researchers from various disciplines the opportunity to run simulations, perform data analyses, and realize innovative research projects.

docs/content/internal/deployment.md (79 lines changed)

@@ -0,0 +1,79 @@
# Infrastructure Deployment
The whole cluster infrastructure is built using [NixOS](https://nixos.org/).
The configuration repository is hosted at {{ config.repo_url }} and is deployed using [colmena](https://github.com/zhaofengli/colmena).
## Building the configuration
To build the configuration, a system with [Nix](https://nix.dev/install-nix) installed is required.
To activate the environment, run `nix develop` inside the configuration folder.
This will fetch all required build dependencies and make them available in the environment.
Building the whole configuration is as easy as running:
```
colmena build --verbose --show-trace
```
*Go grab a coffee, this can take a while.*
## Deploying
> Note: Deployment requires SSH access as the `root` user to all machines.
To deploy configuration changes or updates to the cluster, run the following command:
```
colmena apply switch
```
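Deployments can also be limited to a subset of machines; colmena accepts an `--on` selector. As a sketch, the following uses the `node` deployment tag that the compute nodes in this configuration carry:
```
colmena apply switch --on @node
```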
### Using the manager as an SSH jump host
SSH access to the nodes is limited.
Therefore, the manager system can be used as a jump host.
To do so, add the following lines to your local `~/.ssh/config` file (before the `Host *` entry):
```
Host 10.32.47.1??
IdentitiesOnly yes
ProxyJump root@10.32.47.10
```
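With this in place, the nodes can be reached directly from your workstation. For example, `node-01` listens on `10.32.47.101` (the nodes use the addresses `10.32.47.100 + id`), so the following connection hops through the manager automatically:
```
ssh root@10.32.47.101
```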
## Updating
Updating all systems can be done by running the following command in the configuration repository:
```
nix flake update
```
This will update all dependencies, including the NixOS operating system.
After the update, the changed configuration (with the updated dependencies) must be [deployed](#deploying).
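If only a single input should be updated (for example `nixpkgs`), the lock file can be changed selectively. A sketch using the flake CLI (flag names may differ between Nix versions):
```
nix flake lock --update-input nixpkgs
```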
## Gather node information
The configuration repository relies on some information gathered from the machines themselves.
After bootstrapping a machine, this information needs to be collected from the machine into the configuration repository.
To gather this data, run the following command:
```
./gather.sh
```
## Secret management
The config repository contains several secrets which are secured by [sops](https://github.com/getsops/sops) and the corresponding [Nix integration](https://github.com/Mic92/sops-nix).
To edit a secrets file, run the following command:
```
sops <path/to/secrets/file>
```
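For example, to edit the manager secrets contained in this repository:
```
sops machines/manager/secrets.yaml
```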
This requires the editing user's PGP key fingerprint to be part of the `adminKeys` list in `sops.nix`.
Altering the list requires one of the previous members to [update the keys](#update-keys).
### Update keys
Whenever a key, either the SSH key of a machine or the PGP key of an administrator, changes, the secret files need updating.
To do so, run the following command:
```
find . -type f \( -name "secrets.yaml" -or -path "*/secrets/*" \) -exec sops updatekeys {} \;
```
## Bootstrapping a node
Compute nodes can be bootstrapped using PXE boot.
The manager provides a touchless boot image that automatically installs the node with the current deployment.
Booting the node via PXE (network boot) is enough to trigger the bootstrapping process.
After bootstrapping a node, make sure to [gather the node data](#gather-node-information) and [update the secret keys](#update-keys).

docs/default.nix (18 lines changed)

@@ -1,18 +0,0 @@
{ stdenv
, mkdocs
, ...
}:
stdenv.mkDerivation {
name = "docs";
preferLocalBuild = true;
allowSubstitutes = false;
src = ./.;
buildCommand = ''
cd "$src"
${mkdocs}/bin/mkdocs build --site-dir "$out"
'';
}

docs/mkdocs.yaml (37 lines changed)

@@ -1,37 +0,0 @@
site_name: HPC @ HS-Fulda
site_description: User documentation for high performance cluster on University of Applied Sciences Fulda
site_url: https://docs.hpc.informatik.hs-fulda.de/
site_dir: public
use_directory_urls: false
strict: true
repo_url: https://gogs.informatik.hs-fulda.de/hpc/nixcfg.git
docs_dir: content
theme:
name: readthedocs
locale: de
prev_next_buttons_location: none
highlightjs: true
hljs_languages:
- bash
- yaml
- rust
markdown_extensions:
- extra
- admonition
plugins:
- search
nav:
- Start: index.md
- Erste Schritte: first_steps.md
- Nutzung: usage.md
- Software: environment.md
- Daten: storage.md
- Best Practices: best_practice.md
- Hilfe: support.md
- Internes:
- Netzwerk: internal/network.md

docs/module.nix (19 lines changed)

@@ -0,0 +1,19 @@
{ pkgs, config, lib, ... }:
with lib;
let
mkdocsConfig = import ./config.nix {
inherit config lib;
};
mkdocsConfigYaml = pkgs.writeText "mkdocs.yaml" (generators.toYAML { } mkdocsConfig);
in
{
system.build.docs = pkgs.runCommand "docs" { } ''
${pkgs.mkdocs}/bin/mkdocs build \
--site-dir "$out" \
--config-file "${mkdocsConfigYaml}"
'';
}

docs/result (1 line changed)

@@ -1 +0,0 @@
/nix/store/8v3r668x18fl49yx2s41yzs0qx9cn24d-docs

flake.lock (84 lines changed)

@@ -10,11 +10,11 @@
"stable": "stable"
},
"locked": {
"lastModified": 1685163780,
"narHash": "sha256-tMwseHtEFDpO3WKeZKWqrKRAZI6TiEULidxEbzicuFg=",
"lastModified": 1699171528,
"narHash": "sha256-ZsN6y+tgN5w84oAqRQpMhIvQM39ZNSZoZvn2AK0QYr4=",
"owner": "zhaofengli",
"repo": "colmena",
"rev": "c61bebae1dc1d57237577080b1ca1e37a3fbcebf",
"rev": "665603956a1c3040d756987bc7a810ffe86a3b15",
"type": "github"
},
"original": {
@@ -30,11 +30,11 @@
]
},
"locked": {
"lastModified": 1687747614,
"narHash": "sha256-KXspKgtdO2YRL12Jv0sUgkwOwHrAFwdIG/90pDx8Ydg=",
"lastModified": 1699099781,
"narHash": "sha256-2WAs839yL6xmIPBLNVwbft46BDh0/RAjq1bAKNRqeR4=",
"owner": "nix-community",
"repo": "disko",
"rev": "fef67a1ddc293b595d62a660f57deabbcb70ff95",
"rev": "548962c50b8afad7b8c820c1d6e21dc8394d6e65",
"type": "github"
},
"original": {
@@ -151,12 +151,15 @@
}
},
"flake-utils_6": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1667395993,
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"type": "github"
},
"original": {
@@ -228,11 +231,11 @@
]
},
"locked": {
"lastModified": 1683210100,
"narHash": "sha256-bhGDOlkWtlhVECpoOog4fWiFJmLCpVEg09a40aTjCbw=",
"lastModified": 1687381756,
"narHash": "sha256-IUMIlYfrvj7Yli4H2vvyig8HEPpfCeMaE6+kBGPzFyk=",
"owner": "nix-community",
"repo": "nixago",
"rev": "1da60ad9412135f9ed7a004669fdcf3d378ec630",
"rev": "dacceb10cace103b3e66552ec9719fa0d33c0dc9",
"type": "github"
},
"original": {
@@ -341,11 +344,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1686513595,
"narHash": "sha256-H3JNqj7TEiMx5rd8lRiONvgFZvmf3kmwHI2umDdqgFY=",
"lastModified": 1699169573,
"narHash": "sha256-cvUb1xZkvOp3W2SzylStrTirhVd9zCeo5utJl9nSIhw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "bb8b5735d6f7e06b9ddd27de115b0600c1ffbdb4",
"rev": "aeefe2054617cae501809b82b44a8e8f7be7cc4b",
"type": "github"
},
"original": {
@@ -357,16 +360,16 @@
},
"nixpkgs-stable": {
"locked": {
"lastModified": 1678872516,
"narHash": "sha256-/E1YwtMtFAu2KUQKV/1+KFuReYPANM2Rzehk84VxVoc=",
"lastModified": 1685801374,
"narHash": "sha256-otaSUoFEMM+LjBI1XL/xGB5ao6IwnZOXc47qhIgJe8U=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9b8e5abb18324c7fe9f07cb100c3cd4a29cda8b8",
"rev": "c37ca420157f4abc31e26f436c1145f8951ff373",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-22.11",
"ref": "nixos-23.05",
"repo": "nixpkgs",
"type": "github"
}
@@ -382,11 +385,11 @@
"nixpkgs-stable": "nixpkgs-stable"
},
"locked": {
"lastModified": 1685361114,
"narHash": "sha256-4RjrlSb+OO+e1nzTExKW58o3WRwVGpXwj97iCta8aj4=",
"lastModified": 1698852633,
"narHash": "sha256-Hsc/cCHud8ZXLvmm8pxrXpuaPEeNaaUttaCvtdX/Wug=",
"owner": "cachix",
"repo": "pre-commit-hooks.nix",
"rev": "ca2fdbf3edda2a38140184da6381d49f8206eaf4",
"rev": "dec10399e5b56aa95fcd530e0338be72ad6462a0",
"type": "github"
},
"original": {
@@ -418,11 +421,11 @@
]
},
"locked": {
"lastModified": 1685434555,
"narHash": "sha256-aZl0yeaYX3T2L3W3yXOd3S9OfpS+8YUOT2b1KwrSf6E=",
"lastModified": 1699252567,
"narHash": "sha256-WCzEBCu17uXilT9OZ3XSy/c4Gk/j3L7AUxBRHzNlQ4Y=",
"owner": "Mic92",
"repo": "sops-nix",
"rev": "876846cde9762ae563f018c17993354875e2538e",
"rev": "0a9d5e41f6013a1b8b66573822f9beb827902968",
"type": "github"
},
"original": {
@@ -433,16 +436,16 @@
},
"stable": {
"locked": {
"lastModified": 1669735802,
"narHash": "sha256-qtG/o/i5ZWZLmXw108N2aPiVsxOcidpHJYNkT45ry9Q=",
"lastModified": 1696039360,
"narHash": "sha256-g7nIUV4uq1TOVeVIDEZLb005suTWCUjSY0zYOlSBsyE=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "731cc710aeebecbf45a258e977e8b68350549522",
"rev": "32dcb45f66c0487e92db8303a798ebc548cadedc",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-22.11",
"ref": "nixos-23.05",
"repo": "nixpkgs",
"type": "github"
}
@@ -462,16 +465,31 @@
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"utils": {
"inputs": {
"systems": "systems"
"systems": "systems_2"
},
"locked": {
"lastModified": 1685518550,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {

flake.nix (7 lines changed)

@@ -121,6 +121,13 @@
sops
age
mkdocs
] ++ [
(pkgs.vscode-with-extensions.override {
vscode = pkgs.vscodium;
vscodeExtensions = with pkgs.vscode-extensions; [
bbenoist.nix
];
})
]);
shellHook = ''

machines/manager/default.nix (3 lines changed)

@@ -12,13 +12,16 @@ with lib;
./ldap.nix
./users.nix
./nginx.nix
./mail.nix
#./beegfs.nix
./ntp.nix
./dhcp.nix
./netinstall
./cache.nix
./rdma.nix
./mpi.nix
./slurm.nix
./docs.nix
];
deployment = {

machines/manager/dhcp.nix (97 lines changed)

@@ -0,0 +1,97 @@
{ pkgs, lib, config, nodes, ... }:
with lib;
let
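# For the given network name ("mngt" or "data"), collect the DHCP reservation that each node declares via hpc.dhcp.reservations.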
mkReservations = net: concatLists (mapAttrsToList
(_: node: optional (hasAttr net node.config.hpc.dhcp.reservations) {
"hw-address" = node.config.hpc.dhcp.reservations.${net}.hwAddress;
"ip-address" = node.config.hpc.dhcp.reservations.${net}.ipAddress;
})
nodes);
in
{
services.kea = {
dhcp4 = {
enable = true;
settings = {
"valid-lifetime" = 4000;
"renew-timer" = 1000;
"rebind-timer" = 2000;
"interfaces-config" = {
"interfaces" = [ "mngt" "data" ];
};
"lease-database" = {
"type" = "memfile";
"persist" = true;
"name" = "/var/lib/kea/dhcp4.leases";
};
"subnet4" = [
{
"subnet" = "10.32.46.0/24";
"interface" = "mngt";
"option-data" = [
{
"name" = "routers";
"data" = config.networking.defaultGateway.address;
}
{
"name" = "domain-name-servers";
"data" = "10.0.0.53,10.1.1.10";
}
{
"name" = "domain-name";
"data" = "mngt.${config.networking.domain}";
}
{
"name" = "domain-search";
"data" = "mngt.${config.networking.domain}";
}
];
"pools" = [
{
"pool" = "10.32.46.100-10.32.46.200";
}
];
"reservations" = mkReservations "mngt";
}
{
"subnet" = "10.32.47.0/24";
"interface" = "data";
"option-data" = [
{
"name" = "domain-name-servers";
"data" = "10.0.0.53,10.1.1.10";
}
{
"name" = "domain-name";
"data" = config.networking.domain;
}
{
"name" = "domain-search";
"data" = config.networking.domain;
}
];
"pools" = [
{
"pool" = "10.32.47.100-10.32.47.200";
}
];
"reservations" = mkReservations "data";
}
];
};
};
};
}

machines/manager/docs.nix (20 lines changed)

@@ -1,16 +1,22 @@
{ pkgs, lib, ... }:
{ config, pkgs, lib, ... }:
with lib;
let
docs = pkgs.callPackage ../../docs { };
in
{
imports = [
../../docs/module.nix
];
services.nginx = {
virtualHosts = {
"docs.${config.networking.domain}" = {
locations."/".root = docs;
"${config.networking.domain}" = {
default = true;
serverAliases = [
"doku.${config.networking.domain}"
];
locations."/".root = config.system.build.docs;
};
};
};

machines/manager/mail.nix (20 lines changed)

@@ -0,0 +1,20 @@
{ config, ... }:
{
programs.msmtp = {
enable = true;
accounts = {
default = {
auth = true;
tls = true;
port = 587;
from = "fdhpc@informatik.hs-fulda.de";
host = "smtp.hs-fulda.de";
user = "fdhpc";
passwordeval = "cat ${config.sops.secrets."mail/password".path}";
};
};
};
sops.secrets."mail/password" = { };
}

machines/manager/mpi.nix (2 lines changed)

@@ -7,7 +7,7 @@ with lib;
text = concatMapStringsSep "\n"
(node: "${node.config.networking.hostName} max_slots=32")
(filter
(node: elem "node" node.config.deployment.tags)
(node: node.config.hpc.node.enable)
(attrValues nodes));
};
}

machines/manager/netinstall/default.nix (36 lines changed)

@@ -7,15 +7,10 @@
with lib;
let
targets = {
"50:46:5d:da:0b:d6" = "node-00";
"50:46:5d:da:0c:56" = "node-01";
"10:bf:48:1f:a6:8f" = "node-02";
"10:bf:48:1b:57:47" = "node-03";
"10:bf:48:19:a2:4d" = "node-04";
"10:bf:48:1b:56:df" = "node-05";
"50:46:5d:da:0c:52" = "node-06";
};
targets = attrNames
(filterAttrs
(_: node: node.config.hpc.netinstall.enable)
nodes);
installer = pkgs.nixos [
./installer.nix
@@ -59,9 +54,9 @@ let
message = "NixOS Automatic Installer for ${name}";
});
api = pkgs.linkFarm "pixiecore-api" (mapAttrs'
(mac: name: nameValuePair "pixiecore/v1/boot/${mac}" (apiEntry name))
targets);
api = pkgs.linkFarm "pixiecore-api" (listToAttrs (map
(name: nameValuePair "pixiecore/v1/boot/${nodes."${name}".config.hpc.dhcp.reservations."data".hwAddress}" (apiEntry name))
targets));
in
{
services.pixiecore = {
@@ -85,23 +80,6 @@ in
};
};
services.dhcpd4 = {
enable = true;
interfaces = [ "enp11s0f0" ];
extraConfig = ''
option domain-name-servers 10.0.0.53, 10.1.1.10;
option domain-name "${config.networking.domain}";
subnet 10.32.47.0 netmask 255.255.255.0 {
interface enp11s0f0;
range 10.32.47.200 10.32.47.230;
option routers 10.32.47.1;
}
'';
};
hpc.hostFile.aliases = [
"boot.${config.networking.domain}"
];

machines/manager/network.nix (36 lines changed)

@@ -1,13 +1,45 @@
{
networking.interfaces."enp11s0f0" = {
networking.interfaces."enp11s0f0" = { };
networking.interfaces."enp11s0f1" = { };
networking.bonds."data" = {
interfaces = [ "enp11s0f0" "enp11s0f1" ];
driverOptions = {
miimon = "100";
mode = "802.3ad";
};
};
networking.vlans."mngt" = {
id = 1032;
interface = "data";
};
networking.interfaces."data" = {
ipv4.addresses = [{
address = "10.32.47.10";
prefixLength = 24;
}];
};
# This is not our real management interface but the host's interface to the
# management network
networking.interfaces."mngt" = {
ipv4.addresses = [{
address = "10.32.46.253";
prefixLength = 24;
}];
};
networking.defaultGateway = {
address = "10.32.47.1";
interface = "enp11s0f0";
interface = "data";
};
hpc.dhcp.reservations = {
"mngt" = {
hwAddress = "e4:1f:13:28:c7:b9";
ipAddress = "10.32.46.10";
};
};
}

machines/manager/secrets.yaml (6 lines changed)

@@ -2,6 +2,8 @@ ldap:
root:
username: ENC[AES256_GCM,data:aXIFdQ==,iv:tdC7GFit0LrO4DJL3vbI6uKCDXeYAOwDGwvOqrvn9mM=,tag:x1mBwe+K+UKjCpGO5qKMuQ==,type:str]
password: ENC[AES256_GCM,data:Q42VVdHaPZuvLR4HJ11CICpx61qTpw/v,iv:GhsXDsWxRinPOG+uMzy/uvxvMB1G8OKu4yH0a8achJc=,tag:yEWD4slZu/kDEV8ZJs43Hg==,type:str]
mail:
password: ENC[AES256_GCM,data:rpX5D5Fg3pvsQZWyo9bfk2KpbHD4Bo/R,iv:vuWI154d0J/peulEdB9eJLw784RjLjtcVgStaadKLf4=,tag:7BWaclC3TqsS1NGOmkE1SQ==,type:str]
sops:
kms: []
gcp_kms: []
@@ -26,8 +28,8 @@ sops:
REQzSVBpb3lrOVFkcTF6SGtuN0VPRWsKfi/qLyhgOdDwudWztQTRQpcv5ITUEPeS
zFA4aAzsDf91juywYXWsAmUq4L4WusfWb8Cf2hMTQLYLISb3mJXxlg==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2023-06-27T09:57:35Z"
mac: ENC[AES256_GCM,data:QpMkI/w+J49DeQ0EDrz+6WtbtvJrgNChI1Z4PNNjdD2cik9wvtZNMUhjJVV18dUxWRH3dkhwX7Jt4mPhlDjhDspbkKsNjKaSApOS8AACybs8FqodvlUCU2mF+xG4beblQn3n8oPcqc5kjbAFc2r+mPSb4b7rcoS+xrB3rKUJTng=,iv:xsjx8Gz5UfpAXMEDEzMA4Kau4BI0vq3xvgfFvHS4uFo=,tag:aiFD1PXsHtiXFrx+legUhw==,type:str]
lastmodified: "2023-11-04T18:58:58Z"
mac: ENC[AES256_GCM,data:fFsTTPmvONkDVQ8mkOxJ6J0N1pbhtUXMspT9svzJYRMoKcEICEocUOEgS3Ucmn7hgx3v4+8prj3BYG1woA2pamgZ5+xI/UsZAOFHg2Wjdqm7OQ3AuK4Qf0uS8LMT9b3dKEddB9S4jVSn9klTou8m7gf1DM6esc2GBKh/zVi+ZtE=,iv:uFIc+S/JIQZHavT5hunvqmZi+zIUxsEj0VCN335opPE=,tag:kfQMv6skdQR/SCyTIQK1qA==,type:str]
pgp:
- created_at: "2023-07-03T09:54:56Z"
enc: |

machines/node/default.nix (11 lines changed)

@@ -13,17 +13,16 @@ with lib;
];
deployment = {
targetHost = "10.32.47.${fixedWidthNumber 3 (100 + id)}";
targetHost = "10.32.47.${toString (100 + id)}";
targetUser = "root";
tags = [ "node" ];
};
_module.args = {
node-id = id;
hpc.node = {
enable = true;
inherit id;
};
networking.hostName = "node-${fixedWidthNumber 2 id}";
networking.hostName = config.hpc.node.name;
networking.timeServers = [
"manager.${config.networking.domain}"

machines/node/network.nix (19 lines changed)

@@ -1,11 +1,15 @@
{ lib, node-id, ... }:
{ lib, config, ... }:
with lib;
let
node = (import ../nodes.nix).${config.hpc.node.name};
in
{
networking.interfaces."enp2s0f0" = {
ipv4.addresses = [{
address = "10.32.47.${fixedWidthNumber 3 (100 + node-id)}";
address = "10.32.47.${toString (100 + config.hpc.node.id)}";
prefixLength = 24;
}];
};
@@ -14,4 +18,15 @@ with lib;
address = "10.32.47.1";
interface = "enp2s0f0";
};
hpc.dhcp.reservations = {
"mngt" = {
hwAddress = node.mngt;
ipAddress = "10.32.46.${toString (100 + config.hpc.node.id)}";
};
"data" = {
hwAddress = node.data;
ipAddress = "10.32.47.${toString (100 + config.hpc.node.id)}";
};
};
}

machines/nodes.nix (30 lines changed)

@@ -0,0 +1,30 @@
{
"node-00" = {
mngt = "50:46:5D:DA:0C:C9";
data = "50:46:5d:da:0b:d6";
};
"node-01" = {
mngt = "50:46:5D:DA:0C:07";
data = "50:46:5d:da:0c:56";
};
"node-02" = {
mngt = "10:BF:48:19:B0:04";
data = "10:bf:48:1f:a6:8f";
};
"node-03" = {
mngt = "10:BF:48:19:A4:FE";
data = "10:bf:48:1b:57:47";
};
"node-04" = {
mngt = "10:BF:48:19:A2:E2";
data = "10:bf:48:19:a2:4d";
};
"node-05" = {
mngt = "10:BF:48:15:00:F5";
data = "10:bf:48:1b:56:df";
};
"node-06" = {
mngt = "50:46:5D:DA:0C:09";
data = "50:46:5d:da:0c:52";
};
}

modules/default.nix (3 lines changed)

@@ -1,6 +1,9 @@
{
imports = [
./node.nix
./hostFile.nix
./dhcp.nix
./netinstall.nix
./beegfs.nix
];
}

modules/dhcp.nix (29 lines changed)

@@ -0,0 +1,29 @@
{ lib, config, ... }:
with lib;
{
options.hpc.dhcp = {
reservations = mkOption {
description = ''
DHCP reservations for this host.
'';
type = types.attrsOf (types.submodule {
options = {
hwAddress = mkOption {
description = ''
MAC address of the interface in this network.
'';
type = types.str;
};
ipAddress = mkOption {
description = ''
IP address of the host.
'';
type = types.str;
};
};
});
default = { };
};
};
}

modules/netinstall.nix (20 lines changed)

@@ -0,0 +1,20 @@
{ config, lib, ... }:
with lib;
{
options.hpc.netinstall = {
enable = mkEnableOption "NetInstall";
};
config = mkIf config.hpc.netinstall.enable {
deployment.tags = [ "netinstall" ];
assertions = [
{
assertion = hasAttr "data" config.hpc.dhcp.reservations;
message = "NetInstall needs DHCP reservation in data network";
}
];
};
}

modules/node.nix (30 lines changed)

@@ -0,0 +1,30 @@
{ lib, config, ... }:
with lib;
{
options.hpc.node = {
enable = mkEnableOption "Compute Node";
id = mkOption {
description = ''
ID of the compute node.
'';
type = types.ints.unsigned;
};
name = mkOption {
description = ''
Name of the node.
'';
type = types.str;
readOnly = true;
};
};
config = mkIf config.hpc.node.enable {
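# Derive the canonical host name (node-00, node-01, ...) from the numeric node id.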
hpc.node.name = "node-${fixedWidthNumber 2 config.hpc.node.id}";
deployment.tags = [ "node" ];
};
}

shared/slurm.nix (6 lines changed)

@@ -11,17 +11,13 @@ with lib;
nodeName = map
(node: "${node.config.networking.hostName} CPUs=32")
(filter # Filter all nodes that have a tag "node" being a compute node
(node: elem "node" node.config.deployment.tags)
(node: node.config.hpc.node.enable)
(attrValues nodes));
partitionName = [
"all Nodes=ALL AllowGroups=cluster Default=YES MaxTime=6:00:00 State=UP"
"vip Nodes=ALL AllowGroups=vip Default=NO MaxTime=INFINITE State=UP"
];
extraConfig = ''
MailProg=${pkgs.coreutils}/bin/false
'';
};
services.munge = {
