diff --git a/.gitignore b/.gitignore index 72030bb..d0fb072 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build.env.work +cluster.env.work .DS_Store clitools/bin packages/ diff --git a/README.md b/README.md index c4ddf29..13ed085 100644 --- a/README.md +++ b/README.md @@ -83,11 +83,11 @@ Default-style control-plane configuration looks like this: ```bash make cluster-config \ - MKS_HOSTNAME=monok8s-master \ - MKS_CLUSTER_ROLE=control-plane \ - MKS_INIT_CONTROL_PLANE=true \ - MKS_MGMT_ADDRESS=10.0.0.10/24 \ - MKS_APISERVER_ADVERTISE_ADDRESS=10.0.0.10 + MKS_HOSTNAME=monok8s-master \ + MKS_CLUSTER_ROLE=control-plane \ + MKS_INIT_CONTROL_PLANE=true \ + MKS_MGMT_ADDRESS=10.0.0.10/24 \ + MKS_APISERVER_ADVERTISE_ADDRESS=10.0.0.10 ``` If you are just trying the image for the first time, start with the default control-plane setup. Worker-node setup is still incomplete. @@ -96,6 +96,20 @@ For all available configuration values, see: - [configs/cluster.env.default](configs/cluster.env.default) +For worker node +``` +make cluster-config \ + MKS_HOSTNAME=monok8s-worker \ + MKS_CLUSTER_ROLE=worker \ + MKS_INIT_CONTROL_PLANE=no \ + MKS_MGMT_ADDRESS=10.0.0.10/24 \ + MKS_APISERVER_ADVERTISE_ADDRESS=10.0.0.10 \ + MKS_API_SERVER_ENDPOINT=10.0.0.1:6443 \ + MKS_CNI_PLUGIN=none \ + MKS_BOOTSTRAP_TOKEN=abcd12.ef3456789abcdef0 \ + MKS_DISCOVERY_TOKEN_CA_CERT_HASH=sha256:9f1c2b3a4d5e6f7890abc1234567890abcdef1234567890abcdef1234567890ab +``` + --- ## Getting shell access diff --git a/devtools/create-join-token.sh b/devtools/create-join-token.sh new file mode 100755 index 0000000..977435d --- /dev/null +++ b/devtools/create-join-token.sh @@ -0,0 +1,196 @@ +#!/bin/sh +set -eu + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(realpath "$SCRIPT_DIR/..")" +LIB_DIR="$ROOT_DIR/scripts" +CLUSTER_ENV_WORK="${CLUSTER_ENV_WORK:-$ROOT_DIR/configs/cluster.env.work}" + +KUBECTL="${KUBECTL:-kubectl}" +TTL_HOURS="${TTL_HOURS:-24}" +WAIT_SECONDS="${WAIT_SECONDS:-30}" + +need() { + command -v "$1" >/dev/null 2>&1 || { + echo "missing required command: $1" >&2 + exit 1 + } +} + +rfc3339_after_hours() { + hours="$1" + + # GNU date + if date -u -d "+${hours} hours" '+%Y-%m-%dT%H:%M:%SZ' >/dev/null 2>&1; then + date -u -d "+${hours} hours" '+%Y-%m-%dT%H:%M:%SZ' + return + fi + + # BSD/macOS date + if date -u -v+"${hours}"H '+%Y-%m-%dT%H:%M:%SZ' >/dev/null 2>&1; then + date -u -v+"${hours}"H '+%Y-%m-%dT%H:%M:%SZ' + return + fi + + echo "cannot compute expiration time with this date(1). Set EXPIRATION manually." >&2 + exit 1 +} + +decode_base64_to_file() { + input="$1" + output="$2" + + if printf '%s' "$input" | base64 -d >"$output" 2>/dev/null; then + return + fi + + if printf '%s' "$input" | base64 -D >"$output" 2>/dev/null; then + return + fi + + if printf '%s' "$input" | openssl base64 -d -A >"$output" 2>/dev/null; then + return + fi + + echo "failed to decode certificate-authority-data" >&2 + exit 1 +} + +need "$KUBECTL" +need openssl +need awk +need sed + +TOKEN_ID="${TOKEN_ID:-$(openssl rand -hex 3)}" +TOKEN_SECRET="${TOKEN_SECRET:-$(openssl rand -hex 8)}" +TOKEN="${TOKEN_ID}.${TOKEN_SECRET}" +SECRET_NAME="bootstrap-token-${TOKEN_ID}" + +if [ "${TTL_HOURS}" = "0" ]; then + EXPIRATION="" +else + EXPIRATION="${EXPIRATION:-$(rfc3339_after_hours "$TTL_HOURS")}" +fi + +echo "Creating bootstrap token Secret: ${SECRET_NAME}" >&2 + +{ + cat <&2 + echo "token was created, but cannot print a safe kubeadm join command" >&2 + echo "token: ${TOKEN}" + exit 0 +fi + +decode_base64_to_file "$CA_DATA" "$CA_FILE" + +CA_HASH="$( + openssl x509 -in "$CA_FILE" -pubkey -noout | + openssl pkey -pubin -outform der 2>/dev/null | + openssl dgst -sha256 -hex | + awk '{print $2}' +)" + +SERVER="$("$KUBECTL" config view --raw --minify -o jsonpath='{.clusters[0].cluster.server}')" +JOIN_ENDPOINT="$(printf '%s\n' "$SERVER" | sed -E 's#^https?://##')" + +echo "Waiting for cluster-info signature for token ${TOKEN_ID}..." >&2 + +i=0 +signed="false" +while [ "$i" -lt "$WAIT_SECONDS" ]; do + template="{{ index .data \"jws-kubeconfig-${TOKEN_ID}\" }}" + sig="$("$KUBECTL" -n kube-public get configmap cluster-info -o "go-template=${template}" 2>/dev/null || true)" + + if [ -n "$sig" ]; then + signed="true" + break + fi + + i=$((i + 1)) + sleep 1 +done + +echo +echo "Token:" +echo " ${TOKEN}" + +if [ -n "$EXPIRATION" ]; then + echo + echo "Expires:" + echo " ${EXPIRATION}" +fi + +echo +echo "Join command:" +echo " kubeadm join ${JOIN_ENDPOINT} --token ${TOKEN} --discovery-token-ca-cert-hash sha256:${CA_HASH}" + +TMP_ENV="$(mktemp)" +trap 'rm -f "$TMP_ENV"; rm -rf "$TMPDIR"' EXIT INT TERM + +cat >"$TMP_ENV" <&2 + exit 1 +fi + +"$LIB_DIR/merge-env.sh" "$TMP_ENV" "$CLUSTER_ENV_WORK" + +echo +echo "Merged into:" +echo " $CLUSTER_ENV_WORK" +echo +echo "Try" +cat <&2 + echo "warning: cluster-info was not signed within ${WAIT_SECONDS}s." >&2 + echo "If kubeadm join fails discovery, check that kube-controller-manager enables bootstrapsigner." >&2 +fi diff --git a/docker/kernel-build.Dockerfile b/docker/kernel-build.Dockerfile index 7986c6c..d153117 100644 --- a/docker/kernel-build.Dockerfile +++ b/docker/kernel-build.Dockerfile @@ -23,6 +23,7 @@ WORKDIR /build/nxplinux COPY kernel-extra.config /tmp/kernel-extra.config COPY kernel-build/dts/*.dts ./arch/arm64/boot/dts/freescale/ +COPY kernel-build/ensure-kconfig.sh /build/ RUN grep -q "^dtb-\\\$(CONFIG_ARCH_LAYERSCAPE) += ${DEVICE_TREE_TARGET}.dtb$" \ arch/arm64/boot/dts/freescale/Makefile \ @@ -33,7 +34,7 @@ RUN grep -q "^dtb-\\\$(CONFIG_ARCH_LAYERSCAPE) += ${DEVICE_TREE_TARGET}.dtb$" \ RUN make ARCH="${ARCH}" CROSS_COMPILE="${CROSS_COMPILE}" defconfig lsdk.config \ && ./scripts/kconfig/merge_config.sh -m .config /tmp/kernel-extra.config \ && make ARCH="${ARCH}" CROSS_COMPILE="${CROSS_COMPILE}" olddefconfig \ - && grep '^CONFIG_NF_TABLES=' .config \ + && /build/ensure-kconfig.sh .config /tmp/kernel-extra.config \ && make ARCH="${ARCH}" CROSS_COMPILE="${CROSS_COMPILE}" -j"$(nproc)" # artifact collection diff --git a/docs/cilium.md b/docs/cilium.md new file mode 100644 index 0000000..fb5b650 --- /dev/null +++ b/docs/cilium.md @@ -0,0 +1,16 @@ +# Worker node + +```yaml +apiVersion: cilium.io/v2 +kind: CiliumNodeConfig +metadata: + namespace: kube-system + name: monok8s-worker +spec: + nodeSelector: + matchLabels: + node.kubernetes.io/instance-type: mono-gateway + defaults: + devices: "eth1" + direct-routing-device: "eth1" +``` diff --git a/initramfs/rootfs-extra/init b/initramfs/rootfs-extra/init index 0c21fde..1db1391 100755 --- a/initramfs/rootfs-extra/init +++ b/initramfs/rootfs-extra/init @@ -22,6 +22,9 @@ mount_retry() { i=0 while :; do + # BusyBox mount just needs a normal -o option string here. + # The important bit is that overlayfs itself requires lowerdir/upperdir/workdir, + # and workdir must live on the same filesystem as upperdir. if mount -o "$opts" -t "$fstype" "$dev" "$target"; then return 0 fi @@ -32,6 +35,30 @@ mount_retry() { done } +mount_data_overlay() { + dir="$1" + + case "$dir" in + /*) ;; + *) panic "overlay dir must be absolute: $dir" ;; + esac + + lower="/newroot$dir" + state="/newroot/data${dir}-overlay" + upper="$state/upper" + work="$state/work" + + [ -d "$lower" ] || mkdir -p "$lower" + + mkdir -p "$upper" "$work" + + log "Mounting overlay for $dir" + + mount_or_panic -t overlay overlay \ + -o "lowerdir=$lower,upperdir=$upper,workdir=$work" \ + "$lower" +} + wait_for_path() { path="$1" i=0 @@ -207,6 +234,16 @@ mount_or_panic -t proc proc /proc mount_or_panic -t sysfs sysfs /sys mount_or_panic -t tmpfs tmpfs /run +mkdir -p /sys/fs/bpf + +if ! mountpoint -q /sys/fs/bpf; then + mount_or_panic -t bpf bpffs /sys/fs/bpf +fi + +mount_or_panic --make-rshared /sys +mount_or_panic --make-rshared /run +mount_or_panic --make-shared /sys/fs/bpf + echo 1 > /proc/sys/kernel/printk mkdir -p /dev/pts @@ -264,17 +301,11 @@ mount_retry "$ROOT_DEV" /newroot ext4 ro mount_retry "$DATA_DEV" /newroot/data ext4 rw mkdir -p /newroot/data/var -mkdir -p /newroot/data/etc-overlay/upper -mkdir -p /newroot/data/etc-overlay/work - mount_or_panic --bind /newroot/data/var /newroot/var +mount_or_panic --make-rshared /newroot/var -# BusyBox mount just needs a normal -o option string here. -# The important bit is that overlayfs itself requires lowerdir/upperdir/workdir, -# and workdir must live on the same filesystem as upperdir. -mount_or_panic -t overlay overlay \ - -o "lowerdir=/newroot/etc,upperdir=/newroot/data/etc-overlay/upper,workdir=/newroot/data/etc-overlay/work" \ - /newroot/etc +mount_data_overlay /etc +mount_data_overlay /opt/cni if [ "$BOOT_PART" = "A" ]; then ALT_PART="$(find_sibling_part_on_same_disk "$ROOT_DEV" rootfsB || true)" diff --git a/kernel-build/ensure-kconfig.sh b/kernel-build/ensure-kconfig.sh new file mode 100755 index 0000000..9af19e2 --- /dev/null +++ b/kernel-build/ensure-kconfig.sh @@ -0,0 +1,127 @@ +#!/bin/sh +set -eu + +CONFIG_FILE="${1:-}" +EXPECTED_FILE="${2:-}" + +if [ -z "$CONFIG_FILE" ] || [ -z "$EXPECTED_FILE" ]; then + echo "usage: $0 " >&2 + exit 2 +fi + +if [ ! -f "$CONFIG_FILE" ]; then + echo "error: config file not found: $CONFIG_FILE" >&2 + exit 2 +fi + +if [ ! -f "$EXPECTED_FILE" ]; then + echo "error: expected config fragment not found: $EXPECTED_FILE" >&2 + exit 2 +fi + +failed=0 + +normalize_expected_line() { + line="$1" + + case "$line" in + CONFIG_*=y|CONFIG_*=m) + echo "$line" + ;; + CONFIG_*=n) + sym="${line%%=*}" + echo "# $sym is not set" + ;; + "# CONFIG_"*" is not set") + echo "$line" + ;; + CONFIG_*=*) + echo "$line" + ;; + *) + return 1 + ;; + esac +} + +is_disabled_expected() { + expected="$1" + + case "$expected" in + "# CONFIG_"*" is not set") + return 0 + ;; + *) + return 1 + ;; + esac +} + +symbol_from_expected() { + expected="$1" + + case "$expected" in + CONFIG_*=*) + echo "${expected%%=*}" + ;; + "# CONFIG_"*" is not set") + printf '%s\n' "$expected" | sed 's/^# \(CONFIG_[^ ]*\) is not set$/\1/' + ;; + *) + return 1 + ;; + esac +} + +check_expected_line() { + expected="$1" + sym="$(symbol_from_expected "$expected")" + + actual="$(grep -E "^${sym}=|^# ${sym} is not set$" "$CONFIG_FILE" || true)" + + if [ "$actual" = "$expected" ]; then + return 0 + fi + + # For disabled symbols, absence from the final .config is acceptable. + # Some Kconfig symbols do not exist on this arch/tree, and missing still means "not enabled". + if is_disabled_expected "$expected" && [ -z "$actual" ]; then + return 0 + fi + + echo "kconfig mismatch: $sym" >&2 + echo " expected: $expected" >&2 + if [ -n "$actual" ]; then + echo " actual: $actual" >&2 + else + echo " actual: " >&2 + fi + + failed=1 +} + +while IFS= read -r raw || [ -n "$raw" ]; do + # Strip leading/trailing whitespace. + line="$(printf '%s\n' "$raw" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')" + + # Ignore blanks. + [ -z "$line" ] && continue + + # Ignore normal comments, but keep '# CONFIG_FOO is not set'. + case "$line" in + "# CONFIG_"*" is not set") ;; + "#"*) continue ;; + esac + + expected="$(normalize_expected_line "$line" || true)" + [ -z "${expected:-}" ] && continue + + check_expected_line "$expected" +done < "$EXPECTED_FILE" + +if [ "$failed" -ne 0 ]; then + echo "error: resolved kernel config does not satisfy $EXPECTED_FILE" >&2 + exit 1 +fi + +echo "kernel config satisfies $EXPECTED_FILE" diff --git a/kernel-extra.config b/kernel-extra.config index 536fc72..d9d7f04 100644 --- a/kernel-extra.config +++ b/kernel-extra.config @@ -3,81 +3,39 @@ ############################################################################### CONFIG_HWMON=y -# Hardware monitoring framework. Needed so sensor drivers can expose temps/fans. - CONFIG_I2C=y -# Core I2C subsystem. Required by your RTC/fan controller drivers. - CONFIG_SENSORS_EMC2305=y -# EMC2305 fan controller driver. Built-in so fan control is available early. - CONFIG_RTC_DRV_PCF2127=y -# RTC driver for PCF2127. Built-in so timekeeping is available early. ############################################################################### # Namespaces -# These are fundamental container primitives. Keep these built-in. ############################################################################### CONFIG_NAMESPACES=y -# Master switch for Linux namespaces. - CONFIG_UTS_NS=y -# Isolates hostname/domainname per container. - CONFIG_IPC_NS=y -# Isolates SysV IPC and POSIX message queues between containers. - CONFIG_PID_NS=y -# Gives containers their own PID tree (so processes inside see their own PID 1). - CONFIG_NET_NS=y -# Gives containers their own network stack, interfaces, routing, etc. - CONFIG_USER_NS=y -# User namespaces. Useful for modern container behavior and future flexibility. -# Not every setup strictly needs this on day one, but I would enable it. ############################################################################### # Cgroups / resource control -# Required for kubelet/CRI-O to manage resource isolation. ############################################################################### CONFIG_CGROUPS=y -# Master switch for cgroups. - CONFIG_CGROUP_BPF=y -# Allows BPF programs to be attached to cgroups. Not required for first boot, -# but modern systems increasingly expect this. - +CONFIG_CGROUP_NET_CLASSID=y CONFIG_CGROUP_FREEZER=y -# Allows freezing/thawing process groups. Useful for container lifecycle control. - CONFIG_CGROUP_PIDS=y -# Limits number of processes in a cgroup. - CONFIG_CGROUP_DEVICE=y -# Controls device access from containers. - CONFIG_CPUSETS=y -# CPU affinity partitioning by cgroup. - CONFIG_MEMCG=y -# Memory cgroup support. Critical for container memory accounting/limits. - CONFIG_BLK_CGROUP=y -# Block IO control/accounting for cgroups. - CONFIG_CGROUP_SCHED=y -# Scheduler integration for cgroups. - CONFIG_FAIR_GROUP_SCHED=y -# Fair scheduler group support for cgroups. - CONFIG_CFS_BANDWIDTH=y -# CPU quota/limit support. Important for kubelet resource enforcement. ############################################################################### @@ -85,23 +43,20 @@ CONFIG_CFS_BANDWIDTH=y ############################################################################### CONFIG_KEYS=y -# Kernel key retention service. Commonly relied on by container/userland tooling. - CONFIG_TMPFS=y -# Tmpfs support. Containers and runtimes rely on this heavily. - CONFIG_TMPFS_XATTR=y -# Extended attributes on tmpfs. Useful for container runtime behavior. - CONFIG_TMPFS_POSIX_ACL=y -# POSIX ACLs on tmpfs. Good compatibility feature for userland. - CONFIG_OVERLAY_FS=y -# Overlay filesystem. This is the big one for container image/layer storage. -# Module is fine; CRI-O can load/use it after boot. No need to bloat FIT image. - CONFIG_FS_POSIX_ACL=y -# General POSIX ACL support. Good to have for overlay/tmpfs behavior. +CONFIG_FHANDLE=y +CONFIG_AUTOFS_FS=y + +CONFIG_PROC_FS=y +CONFIG_SYSFS=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y + +CONFIG_BLK_DEV_LOOP=y ############################################################################### @@ -109,171 +64,144 @@ CONFIG_FS_POSIX_ACL=y ############################################################################### CONFIG_INET=y -# IPv4 stack. - CONFIG_IPV6=y -# IPv6 stack. You may be tempted to disable it, but Kubernetes/container stacks -# increasingly assume it exists. Keep it on unless you have a hard reason not to. - CONFIG_UNIX=y -# Unix domain sockets. Containers and runtimes absolutely rely on this. CONFIG_TUN=m -# TUN/TAP device support. Commonly used by networking tools/VPN/CNI-related flows. -# Module is fine. - CONFIG_DUMMY=m -# Dummy network interface. Sometimes useful for CNI/network setups and testing. +CONFIG_VETH=y +CONFIG_BRIDGE=y +CONFIG_BRIDGE_NETFILTER=y +CONFIG_VXLAN=y + +# Enables IPv4/IPv6 policy routing and multiple routing tables. +# Required by CNIs such as Cilium for ip-rule based routing. +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IPV6_MULTIPLE_TABLES=y ############################################################################### -# Netfilter / packet filtering / NAT -# This is where container networking gets messy. Better to enable a sane baseline. +# Netfilter base ############################################################################### CONFIG_NETFILTER=y -# Netfilter core framework. Module is okay if your setup loads it before use. - CONFIG_NETFILTER_ADVANCED=y -# Exposes more advanced netfilter options and modules. +CONFIG_NETFILTER_NETLINK=y +CONFIG_NETFILTER_XTABLES=y + +# Linux 6.17+ gates legacy iptables/xtables support behind these options. +# Without these, IP_NF_* / IP6_NF_* options may silently fall back to =m +# or disappear after olddefconfig. +CONFIG_NETFILTER_XTABLES_LEGACY=y CONFIG_NF_CONNTRACK=y -# Connection tracking. Critical for NAT, Kubernetes service traffic, and many CNIs. - CONFIG_NF_NAT=y -# NAT framework. Required for masquerading and pod egress in many setups. - -CONFIG_NF_TABLES=y -# nftables framework. Modern Linux packet filtering backend. - -CONFIG_NFT_CT=y -# nftables conntrack expressions. - -CONFIG_NFT_COUNTER=y -# nftables packet/byte counters - -CONFIG_NFT_CHAIN_NAT=y -# nftables NAT chain support. - -CONFIG_NFT_MASQ=y -# nftables masquerade support. Often needed for pod egress NAT. - -CONFIG_NFT_REDIR=y -# nftables redirect target. - -CONFIG_NFT_NAT=y -# nftables NAT support. - -CONFIG_NF_NAT_IPV4=y -# IPv4 NAT helper support. Some kernels still expose this separately. - -CONFIG_NF_NAT_IPV6=y -# IPv6 NAT helper support. - CONFIG_NF_CT_NETLINK=y -# userspace netlink access to the conntrack table; kube-proxy uses this for conntrack listing/cleanup - -CONFIG_NF_CT_NETLINK_TIMEOUT=y -# userspace netlink support for conntrack timeout objects - -CONFIG_NF_CT_NETLINK_HELPER=y -# userspace netlink support for conntrack helper objects - -CONFIG_IP_NF_IPTABLES=y -# iptables compatibility for IPv4. Still useful because lots of CNI/plugin code -# still expects iptables even on nft-backed systems. - -CONFIG_IP_NF_NAT=y -# IPv4 NAT support for iptables compatibility. - -CONFIG_IP6_NF_IPTABLES=y -# ip6tables compatibility. - -CONFIG_IP6_NF_FILTER=y -# IPv6 "filter" table (same as above but for IPv6) CONFIG_NF_REJECT_IPV4=y -# core IPv4 reject logic used by netfilter/iptables/nftables +CONFIG_NF_REJECT_IPV6=y +# Do not re-add these stale / absent symbols for this NXP 6.18 tree: +# +# CONFIG_NF_NAT_IPV4 +# CONFIG_NF_NAT_IPV6 +# CONFIG_NFT_CHAIN_NAT +# CONFIG_NFT_COUNTER +# CONFIG_NETFILTER_XT_TARGET_REJECT +# +# Use the currently valid symbols instead: +# +# CONFIG_NF_NAT +# CONFIG_IP_NF_NAT +# CONFIG_IP6_NF_NAT +# CONFIG_IP_NF_TARGET_REJECT +# CONFIG_IP6_NF_TARGET_REJECT +# CONFIG_NFT_REJECT +# CONFIG_NFT_REJECT_INET +# +# Also avoid enabling these unless there is a real need: +# +# CONFIG_NF_CT_NETLINK_TIMEOUT +# CONFIG_NF_CT_NETLINK_HELPER +# +# They exist in this tree, but pull in extra dependencies and are not required +# for basic Kubernetes/Cilium bring-up. + + +############################################################################### +# nftables backend +############################################################################### + +CONFIG_NF_TABLES=y +CONFIG_NF_TABLES_INET=y + +CONFIG_NFT_CT=y +CONFIG_NFT_MASQ=y +CONFIG_NFT_REDIR=y +CONFIG_NFT_NAT=y CONFIG_NFT_REJECT=y -# nftables equivalent of REJECT (needed for nf_tables backend compatibility) +CONFIG_NFT_REJECT_INET=y +############################################################################### +# nftables FIB expression +# +# Required by CNI hostport nftables rules such as: +# fib daddr type local goto hostports +############################################################################### + +CONFIG_NFT_FIB=y +CONFIG_NFT_FIB_INET=y +CONFIG_NFT_FIB_IPV4=y +CONFIG_NFT_FIB_IPV6=y + +############################################################################### +# IPv4 iptables compatibility +############################################################################### + +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_IPTABLES_LEGACY=y CONFIG_IP_NF_FILTER=y -# IPv4 "filter" table (INPUT/FORWARD/OUTPUT chains for iptables) - +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_NAT=y CONFIG_IP_NF_TARGET_REJECT=y -# IPv4-specific REJECT target for legacy iptables + +############################################################################### +# IPv6 iptables compatibility +############################################################################### + +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_IPTABLES_LEGACY=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_NAT=y CONFIG_IP6_NF_TARGET_REJECT=y -# IPv6-specific REJECT target for legacy iptables -CONFIG_IP_SET=m -# IP sets. Useful for some network policies / firewalling toolchains. -CONFIG_NETFILTER_NETLINK_ACCT=y -# netfilter accounting subsystem used for nfacct-based kube-proxy metrics - -CONFIG_NETFILTER_XT_MATCH_NFACCT=y -# iptables nfacct match that hooks rules into the netfilter accounting subsystem +############################################################################### +# xtables matches / targets +############################################################################### CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y -# xtables match for address types. Often used in iptables rules. - -CONFIG_NETFILTER_XT_TARGET_REJECT=y -# iptables REJECT target (actively reject packets instead of silently dropping) - CONFIG_NETFILTER_XT_MATCH_COMMENT=y -# Allows comments in iptables rules. Not critical, but harmless and useful. - CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -# xtables conntrack matching. - -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -# iptables "statistic" match used for probabilistic packet matching / load balancing - +CONFIG_NETFILTER_XT_MATCH_MARK=y CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y -# Match multiple ports in one rule. - +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y CONFIG_NETFILTER_XT_MATCH_TCPMSS=y -# Useful for TCP MSS clamping in some network paths. - -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=y -# iptables MASQUERADE target. Very commonly needed for pod outbound NAT. - -CONFIG_NETFILTER_XT_TARGET_REDIRECT=y -# Redirect target. - -CONFIG_NETFILTER_XT_TARGET_MARK=y -# Packet marking support. Useful for advanced networking/routing rules. CONFIG_NETFILTER_XT_TARGET_CT=y -# Connection tracking target for xtables. +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=y +CONFIG_NETFILTER_XT_TARGET_REDIRECT=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y -# Optional. Good only if you know you need transparent proxying. -# Not required for initial CRI-O bring-up. -# CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_NETLINK_ACCT=y +CONFIG_NETFILTER_XT_MATCH_NFACCT=y - -############################################################################### -# Bridge / container interface plumbing -############################################################################### - -CONFIG_VETH=y -# Virtual Ethernet pairs. This is how container interfaces are commonly connected -# to the host/network namespace. - -CONFIG_BRIDGE=y -# Ethernet bridge support. Needed by bridge-based CNIs. - -CONFIG_BRIDGE_NETFILTER=y -# Allows bridged traffic to pass through netfilter/iptables/nftables hooks. -# Important for Kubernetes networking behavior. - -# Optional / version-dependent: -# Some kernels expose additional ebtables/bridge netfilter pieces separately. -# Keep this if your kernel has it, but don't panic if it doesn't. -CONFIG_BRIDGE_NF_EBTABLES=y -# Bridge filtering via ebtables compatibility. Sometimes useful, not always critical. +CONFIG_IP_SET=m ############################################################################### @@ -281,24 +209,10 @@ CONFIG_BRIDGE_NF_EBTABLES=y ############################################################################### CONFIG_SECCOMP=y -# Secure computing mode. Lets runtimes restrict syscall surface. - CONFIG_SECCOMP_FILTER=y -# BPF-based seccomp filters. This is the useful seccomp mode for containers. - -# AppArmor / SELinux are optional depending on distro/security model. -# Alpine often won't use AppArmor by default; that's fine for first bring-up. - -# If your kernel tree has these and you care later: -# CONFIG_SECURITY=y -# CONFIG_SECURITYFS=y CONFIG_SECURITY_NETWORK=y -# enables Security Module (LSM) hooks for network operations. CoreDNS needs this - CONFIG_SECURITY_PATH=y -# Recommended for container isolation - CONFIG_SECURITY_NETWORK_XFRM=y @@ -307,59 +221,28 @@ CONFIG_SECURITY_NETWORK_XFRM=y ############################################################################### CONFIG_POSIX_MQUEUE=y -# POSIX message queues. Containers/apps sometimes rely on this. - CONFIG_EPOLL=y -# Event polling. Usually already enabled; standard modern userspace feature. - CONFIG_SIGNALFD=y -# File-descriptor-based signal delivery. Common Linux userspace feature. - CONFIG_TIMERFD=y -# File-descriptor timers. Common Linux userspace feature. - CONFIG_EVENTFD=y -# Event notification file descriptors. Common Linux userspace feature. - CONFIG_MEMFD_CREATE=y -# Anonymous memory-backed file creation. Widely used by modern software. -CONFIG_FHANDLE=y -# File handle support. Useful for container/runtime operations. + +############################################################################### +# Disable unused platform/virtualization pieces +############################################################################### CONFIG_DMIID=n -# Optional on embedded boards; usually not needed unless your tree selects it. - -############################################################################### -# Storage / block / other practical container bits -############################################################################### - -CONFIG_BLK_DEV_LOOP=y -# Loop devices. Often useful for image/layer tooling or debugging. -# Could be =m too, but built-in is harmless and often convenient. - -CONFIG_AUTOFS_FS=y -# Automount filesystem support. Not strictly required for CRI-O, but harmless. - -CONFIG_PROC_FS=y -# /proc support. Essential. - -CONFIG_SYSFS=y -# /sys support. Essential. - -CONFIG_DEVTMPFS=y -# Kernel-managed /dev population support. - -CONFIG_DEVTMPFS_MOUNT=y -# Automatically mount devtmpfs. Very practical on small/custom systems. - -### Disable XEN because it breaks our build and we don't need it CONFIG_XEN=n CONFIG_XEN_DOM0=n CONFIG_VHOST_XEN=n -### For Disk IO diagnostics + +############################################################################### +# Disk IO diagnostics +############################################################################### + CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_TASKSTATS=y diff --git a/makefile b/makefile index 1d67d50..e698e1a 100644 --- a/makefile +++ b/makefile @@ -27,9 +27,8 @@ CLITOOLS_BIN := bin/ctl-linux-$(ARCH)-$(TAG) CONFIGS_DIR := configs SCRIPTS_DIR := scripts CLUSTER_ENV_DEFAULT := $(CONFIGS_DIR)/cluster.env.default +CLUSTER_ENV_WORK := $(CONFIGS_DIR)/cluster.env.work CLUSTER_ENV := $(OUT_DIR)/cluster.env -NODE_ENV_DEFAULT := configs/node.env.default -NODE_ENV := $(OUT_DIR)/node.env BOARD_ITB := $(OUT_DIR)/board.itb INITRAMFS := $(OUT_DIR)/initramfs.cpio.gz @@ -277,8 +276,15 @@ $(RELEASE_IMAGE): $(RELEASE_DEPS) $(DOWNLOAD_PACKAGES_STAMP) | $(OUT_DIR) # ---- config targets ------------------------------------------------------------ -cluster-config: $(CLUSTER_ENV_DEFAULT) $(SCRIPTS_DIR)/merge-env.sh | $(OUT_DIR) +cluster-config: $(CLUSTER_ENV_DEFAULT) $(CLUSTER_ENV_WORK) $(SCRIPTS_DIR)/merge-env.sh | $(OUT_DIR) + @rm -f $(CLUSTER_ENV) sh $(SCRIPTS_DIR)/merge-env.sh $(CLUSTER_ENV_DEFAULT) $(CLUSTER_ENV) + @if [ -f "$(CLUSTER_ENV_WORK)" ]; then \ + echo "Merging $(CLUSTER_ENV_WORK) into $(CLUSTER_ENV)"; \ + sh $(SCRIPTS_DIR)/merge-env.sh $(CLUSTER_ENV_WORK) $(CLUSTER_ENV); \ + else \ + echo "No $(CLUSTER_ENV_WORK), using defaults only"; \ + fi cluster-defconfig: $(CLUSTER_ENV_DEFAULT) | $(OUT_DIR) cp $(CLUSTER_ENV_DEFAULT) $(CLUSTER_ENV) diff --git a/scripts/merge-env.sh b/scripts/merge-env.sh index 66783e3..2b4c6cd 100755 --- a/scripts/merge-env.sh +++ b/scripts/merge-env.sh @@ -6,6 +6,25 @@ OUTPUT="${2:?output file required}" mkdir -p "$(dirname "$OUTPUT")" +TMP="$(mktemp)" +BASE_CREATED=0 + +if [ -f "$OUTPUT" ]; then + BASE="$OUTPUT" +else + BASE="$(mktemp)" + BASE_CREATED=1 + : > "$BASE" +fi + +cleanup() { + rm -f "$TMP" + if [ "$BASE_CREATED" = "1" ]; then + rm -f "$BASE" + fi +} +trap cleanup EXIT INT TERM + awk ' function trim(s) { sub(/^[[:space:]]+/, "", s) @@ -13,33 +32,76 @@ function trim(s) { return s } -BEGIN { - for (k in ENVIRON) { - env[k] = ENVIRON[k] - } -} - -/^[[:space:]]*#/ || /^[[:space:]]*$/ { - print - next -} - -{ - line = $0 +function parse_key(line, eq, key) { eq = index(line, "=") - if (eq == 0) { - print line - next + return "" } key = trim(substr(line, 1, eq - 1)) - val = substr(line, eq + 1) - if (key in env) { - print key "=" env[key] - } else { - print line + if (key !~ /^[A-Za-z_][A-Za-z0-9_]*$/) { + return "" + } + + return key +} + +function merged_line(key, line) { + if (key ~ /^MKS_/ && key in ENVIRON) { + return key "=" ENVIRON[key] + } + + return line +} + +# First file: INPUT +phase == 1 { + line = $0 + key = parse_key(line) + + if (key != "") { + incoming[key] = merged_line(key, line) + + if (!(key in input_seen)) { + input_order[++input_count] = key + input_seen[key] = 1 + } + } + + next +} + +# Second file: existing OUTPUT / BASE +phase == 2 { + line = $0 + key = parse_key(line) + + if (key != "" && key in incoming) { + print incoming[key] + written[key] = 1 + next + } + + print line + + if (key != "") { + written[key] = 1 + } + + next +} + +END { + for (i = 1; i <= input_count; i++) { + key = input_order[i] + + if (!(key in written)) { + print incoming[key] + written[key] = 1 + } } } -' "$INPUT" > "$OUTPUT" \ No newline at end of file +' phase=1 "$INPUT" phase=2 "$BASE" > "$TMP" + +mv "$TMP" "$OUTPUT"