#!/usr/bin/env bash
# remnawave-node-bootstrap v1.1
# One-shot provisioning of a Remnawave worker node on a fresh VPS.
# Supports: Debian 12+, Ubuntu 22.04+. Run as root. Interactive or env-var driven.

# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Script version, shown in the startup log line.
VERSION='1.1'

# === Logging helpers (no ANSI so the script stays portable in the JS wrapper) ===

# log MSG... — blank line followed by "[HH:MM:SS] MSG" on stdout; opens a new phase.
log() {
    printf '\n[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# ok MSG... — indented success line on stdout.
ok() {
    printf '  [OK]   %s\n' "$*"
}

# warn MSG... — indented warning line on stdout; never aborts.
warn() {
    printf '  [WARN] %s\n' "$*"
}

# die MSG... — blank line on stdout, "ERROR: MSG" on stderr, then exit status 1.
die() {
    printf '\n'
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

# STEP names the phase currently running; every section below overwrites it.
STEP="init"

# on_error LINE — ERR-trap handler: abort through die, naming the failing line and phase.
on_error() {
    die "Failed at line $1 during step: ${STEP:-unknown}"
}
trap 'on_error "$LINENO"' ERR

# === Preflight ===
STEP="preflight"

# Everything below edits users, packages and the firewall, so root is mandatory.
if (( EUID != 0 )); then
    die "Must run as root on a fresh VPS"
fi

# apt-get is the marker for the Debian/Ubuntu family; nothing else is supported.
if ! command -v apt-get >/dev/null 2>&1; then
    die "Supports only Debian/Ubuntu family"
fi

# Exported so every apt/debconf invocation below runs without dialogs.
DEBIAN_FRONTEND=noninteractive
export DEBIAN_FRONTEND

log "remnawave-node-bootstrap v${VERSION}"

# Some minimal cloud images ship without sudo; install it only when it is absent.
if command -v sudo >/dev/null 2>&1; then
    : # already present — nothing to do
else
    ok "installing sudo (missing on this image)"
    apt-get update -qq
    apt-get install -y -qq sudo
fi

# OS detection
# /etc/os-release is read first. The previous approach depended on lsb_release,
# whose installation was attempted silently and without a prior `apt-get update`;
# on images with empty package lists that install fails and the script aborted
# with a misleading "Unsupported OS: unknown". lsb_release is now only a fallback
# for whichever field os-release did not provide.
OS_ID=""
OS_CODENAME=""
if [ -r /etc/os-release ]; then
    # Sourced inside a command substitution so os-release variables do not leak
    # into this script's namespace.
    OS_ID=$( . /etc/os-release 2>/dev/null; printf '%s' "${ID:-}" ) || OS_ID=""
    OS_CODENAME=$( . /etc/os-release 2>/dev/null; printf '%s' "${VERSION_CODENAME:-}" ) || OS_CODENAME=""
fi

if [ -z "$OS_ID" ] || [ -z "$OS_CODENAME" ]; then
    # Best effort only: a failed install simply leaves the field as "unknown".
    apt-get install -y -qq lsb-release >/dev/null 2>&1 || true
    [ -n "$OS_ID" ] || OS_ID=$(lsb_release -si 2>/dev/null || true)
    [ -n "$OS_CODENAME" ] || OS_CODENAME=$(lsb_release -sc 2>/dev/null || true)
fi

# Normalise: lower-case ID (lsb_release prints "Debian"/"Ubuntu"), "unknown" when empty.
OS_ID=${OS_ID,,}
OS_ID=${OS_ID:-unknown}
OS_CODENAME=${OS_CODENAME:-unknown}

case "$OS_ID" in
    debian|ubuntu) ok "OS: $OS_ID $OS_CODENAME" ;;
    *) die "Unsupported OS: $OS_ID (need Debian 12+ or Ubuntu 22.04+)" ;;
esac

# Public IP auto-detection (tries a few services so we don't depend on any single one)
# curl is needed here, but the base-package step that installs it runs much later;
# install it now if the image lacks it, otherwise every probe fails and the error
# would wrongly blame the lookup services.
if ! command -v curl >/dev/null 2>&1; then
    ok "installing curl (missing on this image)"
    apt-get update -qq
    apt-get install -y -qq curl ca-certificates >/dev/null
fi

PUBLIC_IP=""
for svc in ifconfig.me api.ipify.org ipinfo.io/ip icanhazip.com; do
    candidate=$(curl -fsS -4 --max-time 5 "https://$svc" 2>/dev/null || true)
    # Drop trailing newline/CR and any other whitespace the service added.
    candidate=${candidate//[[:space:]]/}
    # Accept only something shaped like a dotted quad; an HTML or error body from
    # one service must not end up in the panel instructions as "the IP".
    if [[ "$candidate" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
        PUBLIC_IP=$candidate
        break
    fi
done
[ -n "$PUBLIC_IP" ] || die "Cannot auto-detect public IP (all services failed)"
ok "Public IP: $PUBLIC_IP"

# === Banner ===
# Static overview of what the run is about to change; printed before the
# confirmation prompt. One array element per output line.
banner_lines=(
    ""
    "==================================================================="
    "  Remnawave Node Bootstrap"
    "==================================================================="
    "  This will convert this fresh VPS into a Remnawave worker node:"
    ""
    "    1. Create 'default' user with SSH key (root SSH will be disabled)"
    "    2. Harden OS: updates, SSH config, UFW, fail2ban, docker, journald"
    "    3. Deploy remnanode container (Xray-core data plane)"
    "    4. Open firewall: 22/tcp, 2222/tcp (from master only), 443/tcp"
    "    5. Start container and verify master handshake"
    ""
    "  After completion, create a Host in your Remnawave panel pointing"
    "  to this server's public IP on port 443 and attach it to a Squad."
    "==================================================================="
    ""
)
printf '%s\n' "${banner_lines[@]}"

# read_tty PROMPT [SOURCE]
#   Prompt for one line of input and print it on stdout.
#     PROMPT  text shown before the cursor (bash only displays it when the
#             input is a terminal)
#     SOURCE  file to read from; defaults to /dev/tty
#   Always returns 0; when nothing could be read the output is an empty line
#   and the caller's own validation rejects it.
#
# Input is taken from /dev/tty rather than stdin because stdin may be the pipe
# that carries the script itself (e.g. `curl ... | bash`), in which case a plain
# `read` would consume script text instead of the operator's answer.
read_tty() {
    local prompt="$1"
    local src="${2:-/dev/tty}"
    local val=""
    # A failed redirect or an EOF without newline is tolerated; val keeps
    # whatever was read (possibly nothing).
    read -r -p "$prompt" val < "$src" || true
    # Pastes from CRLF sources leave a trailing carriage return behind.
    val=${val%$'\r'}
    # printf instead of echo: echo would treat a value such as "-n" as an option.
    printf '%s\n' "$val"
}
# confirm_tty — show "Continue? [y/N]: " and read a single keypress from /dev/tty.
# Returns 0 only for 'y' or 'Y'; any other key (including Enter) returns 1.
confirm_tty() {
    local answer
    read -r -n 1 -p "Continue? [y/N]: " answer < /dev/tty
    # The keypress is not followed by a newline; finish the prompt line ourselves.
    echo
    case "$answer" in
        y|Y) return 0 ;;
        *)   return 1 ;;
    esac
}

# Ask before changing anything. The confirmation needs a terminal, which made
# fully env-var driven runs (no tty) abort here every time. NODE_ASSUME_YES=1 is
# an explicit opt-in to skip the prompt; without it behaviour is unchanged.
if [[ "${NODE_ASSUME_YES:-}" == "1" ]]; then
    ok "NODE_ASSUME_YES=1 — skipping interactive confirmation"
else
    confirm_tty || die "Aborted by user"
fi

# === Input collection ===
STEP="input collection"

# Each input can be pre-seeded through a NODE_* environment variable; whatever
# is still empty afterwards is asked for interactively further down.
SSH_PUBKEY=${NODE_SSH_PUBKEY:-}
MASTER_IP=${NODE_MASTER_IP:-}
SECRET_KEY=${NODE_SECRET_KEY:-}

if [ -z "$SSH_PUBKEY" ]; then
    echo
    echo "Step 1/3: SSH public key for user 'default'"
    echo "  One line, starts with 'ssh-rsa' or 'ssh-ed25519'."
    SSH_PUBKEY=$(read_tty "  > ")
fi

# Trailing CR from a CRLF paste would become part of the key comment.
SSH_PUBKEY=${SSH_PUBKEY%$'\r'}

# authorized_keys holds one key per line; a multi-line value (possible via the
# environment variable) would be appended verbatim and defeat the duplicate check.
if [[ "$SSH_PUBKEY" == *$'\n'* ]]; then
    die "SSH pubkey must be a single line"
fi

case "$SSH_PUBKEY" in
    "ssh-dss "*)
        # DSA was accepted here before, but current OpenSSH releases refuse DSA
        # keys by default. This key becomes the only way in once root and
        # password logins are disabled, so reject it up front.
        die "DSA (ssh-dss) keys are not accepted by current OpenSSH — use an ed25519, ecdsa or rsa key"
        ;;
    "ssh-rsa "*|"ssh-ed25519 "*|"ecdsa-"*|"sk-ssh-ed25519@openssh.com "*|"sk-ecdsa-"*)
        ;;
    *)
        die "SSH pubkey format is wrong (must start with ssh-rsa / ssh-ed25519 / ecdsa-...)"
        ;;
esac

# The prefix alone says nothing about the base64 payload. Let ssh-keygen parse
# the key so a truncated or mangled paste is caught now, not after lockout.
if command -v ssh-keygen >/dev/null 2>&1; then
    ssh-keygen -l -f - <<<"$SSH_PUBKEY" >/dev/null 2>&1 \
        || die "SSH pubkey could not be parsed by ssh-keygen (truncated or mangled paste?)"
fi
ok "SSH pubkey looks valid"

if [ -z "$MASTER_IP" ]; then
    echo
    echo "Step 2/3: Remnawave master IP"
    echo "  This is the IP of your Remnawave control-plane server."
    echo "  It will be whitelisted through the firewall to reach gRPC port 2222."
    MASTER_IP=$(read_tty "  > ")
fi

# Tolerate stray whitespace / CR around a pasted address.
MASTER_IP=${MASTER_IP//[[:space:]]/}

# Shape check first, then a range check per octet: the shape regex alone also
# accepted values such as 999.999.999.999, which only failed much later at the
# firewall step, after the system had already been modified.
[[ "$MASTER_IP" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]] \
    || die "Master IP doesn't look like IPv4"
for octet in "${BASH_REMATCH[@]:1}"; do
    # 10# forces base 10 so octets like "08" are not rejected as invalid octal.
    (( 10#$octet <= 255 )) || die "Master IP doesn't look like IPv4"
done
ok "Master IP: $MASTER_IP"

if [ -z "$SECRET_KEY" ]; then
    echo
    echo "Step 3/3: Node SECRET_KEY from Remnawave panel"
    echo
    echo "  Open your Remnawave panel in a browser and do:"
    echo "    Nodes -> Create Node"
    echo "    Name:    (anything meaningful, e.g. country or city code)"
    echo "    Address: $PUBLIC_IP   <-- use this exact IP"
    echo "    Country: your choice"
    echo "    -> Save"
    echo
    echo "  The panel will generate a long base64 SECRET_KEY (starts with 'ey')."
    echo "  Copy it from the panel and paste below:"
    SECRET_KEY=$(read_tty "  > ")
fi

# Normalise common paste artefacts: trailing CR, a leading "SECRET_KEY=" copied
# along with the value, and one pair of surrounding quotes.
SECRET_KEY=${SECRET_KEY%$'\r'}
SECRET_KEY=${SECRET_KEY#SECRET_KEY=}
SECRET_KEY=${SECRET_KEY#\"}
SECRET_KEY=${SECRET_KEY%\"}
SECRET_KEY=${SECRET_KEY#\'}
SECRET_KEY=${SECRET_KEY%\'}

# The value is later written unquoted into a YAML list item. An empty value can
# never work, and embedded whitespace or newlines would corrupt the compose file,
# so both are fatal here instead of surfacing as a container failure at the end.
if [ -z "$SECRET_KEY" ]; then
    die "SECRET_KEY is empty"
fi
if [[ "$SECRET_KEY" =~ [[:space:]] ]]; then
    die "SECRET_KEY contains whitespace — paste it as one unbroken line"
fi

case "$SECRET_KEY" in
    ey*)
        # Real Remnawave SECRET_KEYs are ~900+ chars (base64 of four PEMs + JWT pubkey).
        # 200-char floor catches accidental first-line-only pastes without false positives.
        if [ "${#SECRET_KEY}" -lt 200 ]; then
            warn "SECRET_KEY is suspiciously short (${#SECRET_KEY} chars) — possible paste truncation"
        else
            ok "SECRET_KEY format looks right (${#SECRET_KEY} chars)"
        fi
        ;;
    *) warn "SECRET_KEY does not start with 'ey' — double-check it is the panel value" ;;
esac

echo
log "Inputs collected, starting provisioning"

# === Step A: default user ===
STEP="user creation"
log "Creating 'default' user"

# Create the account only when it is absent, so a re-run is harmless.
if id default >/dev/null 2>&1; then
    ok "user already exists (skipping useradd)"
else
    useradd --create-home --shell /bin/bash default
    ok "user created"
fi

# Lock the account's password; a failure here is deliberately ignored.
passwd -l default >/dev/null 2>&1 || true

# ~/.ssh (0700) and authorized_keys (0600), both owned by 'default'.
install -d -o default -g default -m 700 /home/default/.ssh
touch /home/default/.ssh/authorized_keys
chown default:default /home/default/.ssh/authorized_keys
chmod 600 /home/default/.ssh/authorized_keys

# Append-if-missing: don't clobber keys from other devices on re-run (laptop + phone + work etc.)
# `--` keeps grep from ever reading the key text as an option.
if grep -qxF -- "$SSH_PUBKEY" /home/default/.ssh/authorized_keys; then
    ok "SSH key already present — skipping"
else
    # If the existing file does not end in a newline, a plain append would glue
    # the new key onto the last existing one and invalidate both. $(...) strips
    # a trailing newline, so a non-empty result means the last byte is not one.
    if [ -s /home/default/.ssh/authorized_keys ] \
        && [ -n "$(tail -c 1 /home/default/.ssh/authorized_keys)" ]; then
        echo >> /home/default/.ssh/authorized_keys
    fi
    printf '%s\n' "$SSH_PUBKEY" >> /home/default/.ssh/authorized_keys
    ok "SSH key appended"
fi

# Passwordless sudo for 'default'.
# The rule is validated in a temporary file and only then installed. Writing
# straight into /etc/sudoers.d and validating afterwards would leave a rejected
# file live in the include directory when the check fails.
sudoers_tmp=$(mktemp)
cat > "$sudoers_tmp" <<'EOF'
default ALL=(ALL) NOPASSWD:ALL
EOF
if ! visudo -cf "$sudoers_tmp" >/dev/null; then
    rm -f "$sudoers_tmp"
    die "sudoers validation failed"
fi
# install sets owner and mode in the same step that puts the file in place.
install -o root -g root -m 0440 "$sudoers_tmp" /etc/sudoers.d/90-default-nopasswd
rm -f "$sudoers_tmp"
ok "sudoers NOPASSWD configured"

# Remove cloud-init default users (debian / ubuntu) if present and unused.
#
# "Unused" must exclude the account the operator logged in with. On images
# where one logs in as ubuntu/debian and then uses sudo, killing that user's
# processes tears down the SSH session this script is running in and aborts
# provisioning half-way. SUDO_USER (set by sudo) or the login name identifies
# that account.
invoking_user="${SUDO_USER:-}"
if [ -z "$invoking_user" ]; then
    invoking_user=$(logname 2>/dev/null || true)
fi

for u in debian ubuntu; do
    id "$u" >/dev/null 2>&1 || continue
    if [ "$u" = "$invoking_user" ]; then
        warn "not removing '$u': this session was started from that account"
        warn "after logging in as 'default', remove it with: sudo userdel -r $u"
        continue
    fi
    pkill -KILL -u "$u" 2>/dev/null || true
    sleep 1
    userdel -r "$u" >/dev/null 2>&1 || true
    # Report what actually happened instead of assuming userdel succeeded.
    if id "$u" >/dev/null 2>&1; then
        warn "could not remove cloud-init user: $u (try manually: userdel -r $u)"
    else
        ok "removed cloud-init user: $u"
    fi
done
rm -f /etc/sudoers.d/90-cloud-init-users

# === Step B: system update ===
STEP="system update"
log "Updating packages (can take 1-3 minutes)"
apt-get update -qq
# DEBIAN_FRONTEND=noninteractive only silences debconf. dpkg's own "modified
# configuration file" question is separate, and with stdout discarded it would
# wait for an answer nobody can see. force-confdef/force-confold answer it
# automatically: take the package default where one exists, otherwise keep the
# locally modified file.
apt-get upgrade -y -qq \
    -o Dpkg::Options::=--force-confdef \
    -o Dpkg::Options::=--force-confold >/dev/null
apt-get autoremove -y -qq >/dev/null
ok "system updated"

# === Step C: base packages ===
STEP="base packages"
log "Installing base packages"
apt-get install -y -qq \
    curl wget git vim nano htop jq ncdu \
    net-tools dnsutils ca-certificates \
    gnupg \
    unzip ufw fail2ban unattended-upgrades tzdata \
    needrestart >/dev/null

# Convenience packages that none of the provisioning steps in this script call.
# They are installed best-effort and one at a time: a package name that a newer
# release no longer carries (NOTE(review): software-properties-common appears to
# be gone from Debian 13 — confirm) must not abort the whole bootstrap.
for pkg in apt-transport-https software-properties-common; do
    apt-get install -y -qq "$pkg" >/dev/null 2>&1 \
        || warn "optional package not available on this release: $pkg (skipped)"
done
ok "base packages installed"

# === Step D: timezone ===
STEP="timezone"
# timedatectl talks to systemd-timedated over D-Bus. Where that is unavailable
# the call fails, and a purely cosmetic setting would abort the bootstrap; fall
# back to pointing /etc/localtime at the UTC zone file (tzdata was installed in
# the previous step).
if ! timedatectl set-timezone UTC 2>/dev/null; then
    warn "timedatectl unavailable — setting UTC via /etc/localtime"
    ln -sf /usr/share/zoneinfo/UTC /etc/localtime
    echo "UTC" > /etc/timezone
fi
ok "timezone: UTC"

# === Step E: kernel network tuning ===
STEP="sysctl tuning"
log "Applying sysctl tuning for proxy workload"

cat > /etc/sysctl.d/99-remnawave-node.conf <<'EOF'
# High-BDP socket buffers for international proxy traffic
# (RU<->EU ~50-100ms RTT, lossy paths)
net.core.rmem_max = 67108864
net.core.wmem_max = 67108864
net.ipv4.tcp_rmem = 4096 87380 67108864
net.ipv4.tcp_wmem = 4096 65536 67108864

# BBR + fq — significantly better than cubic on lossy international paths
net.core.default_qdisc = fq
net.ipv4.tcp_congestion_control = bbr

# Many concurrent connections
fs.file-max = 1048576
net.ipv4.tcp_fastopen = 3
EOF

# BBR module isn't auto-loaded on all Debian images; load now and persist.
# Must happen before sysctl -p: while the module is unavailable the kernel
# rejects tcp_congestion_control=bbr and sysctl exits non-zero.
modprobe tcp_bbr 2>/dev/null || warn "tcp_bbr module load failed — kernel may lack BBR"
echo tcp_bbr > /etc/modules-load.d/bbr.conf

# A rejected key (no BBR, or a container that forbids some net.* writes) is a
# degraded-performance condition, not a reason to abort provisioning right after
# warning about it. sysctl's own error output stays visible on stderr.
if sysctl -p /etc/sysctl.d/99-remnawave-node.conf >/dev/null; then
    ok "sysctl tuning applied (BBR + fq, high-BDP buffers)"
else
    warn "some sysctl keys were rejected — continuing without full tuning"
    warn "check: sysctl -p /etc/sysctl.d/99-remnawave-node.conf"
fi

# === Step F: unattended upgrades ===
STEP="unattended upgrades"
log "Enabling unattended security upgrades"

apt_conf_dir=/etc/apt/apt.conf.d

# Periodic apt jobs: refresh lists, pre-download, weekly autoclean, run the upgrade.
cat <<'APT_PERIODIC' > "$apt_conf_dir/20auto-upgrades"
APT::Periodic::Update-Package-Lists "1";
APT::Periodic::Download-Upgradeable-Packages "1";
APT::Periodic::AutocleanInterval "7";
APT::Periodic::Unattended-Upgrade "1";
APT_PERIODIC

# Allowed origins (Debian and Ubuntu patterns side by side) plus cleanup and the
# automatic reboot window. The quoted delimiter keeps ${distro_codename} literal
# in the written file.
cat <<'APT_UNATTENDED' > "$apt_conf_dir/50unattended-upgrades"
Unattended-Upgrade::Origins-Pattern {
    "origin=Debian,codename=${distro_codename},label=Debian";
    "origin=Debian,codename=${distro_codename},label=Debian-Security";
    "origin=Debian,codename=${distro_codename}-security,label=Debian-Security";
    "origin=Ubuntu,archive=${distro_codename}-security";
};
Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
Unattended-Upgrade::Remove-Unused-Dependencies "true";
Unattended-Upgrade::Automatic-Reboot "true";
Unattended-Upgrade::Automatic-Reboot-Time "03:00";
APT_UNATTENDED

# Best effort: a failure to enable/start the unit does not stop provisioning.
systemctl enable --now unattended-upgrades >/dev/null 2>&1 || true
ok "unattended upgrades enabled (auto-reboot 03:00 UTC)"

# === Step G: SSH hardening ===
STEP="SSH hardening"
log "Hardening sshd"

# Lockout guard: after this step the key of 'default' is the only way in.
# Refuse to continue if that account has no key on file.
[ -s /home/default/.ssh/authorized_keys ] \
    || die "refusing to disable root/password login: /home/default/.ssh/authorized_keys is empty"

# sshd uses the FIRST value it obtains for most directives, and drop-ins are read
# in lexical order. A file named 99-* therefore loses against every earlier
# drop-in, not only cloud-init's. The hardening file is named 00-* so it is read
# first and wins.
sshd_dropin=/etc/ssh/sshd_config.d/00-remnawave-hardening.conf

# cloud-init on Debian/Ubuntu cloud images drops 50-cloud-init.conf with
# "PasswordAuthentication yes"; remove it so the directory carries no
# contradicting setting.
rm -f /etc/ssh/sshd_config.d/50-cloud-init.conf

cat > "$sshd_dropin" <<'EOF'
# Managed by remnawave-node-bootstrap
PasswordAuthentication no
ChallengeResponseAuthentication no
KbdInteractiveAuthentication no
PermitRootLogin no
PubkeyAuthentication yes
X11Forwarding no
MaxAuthTries 3
ClientAliveInterval 300
ClientAliveCountMax 2
LoginGraceTime 30
EOF

if ! sshd -t; then
    rm -f "$sshd_dropin"
    die "sshd config validation failed, rolled back"
fi

# Earlier versions of this script wrote 99-hardening.conf; drop the stale copy
# only now that the replacement has passed validation.
rm -f /etc/ssh/sshd_config.d/99-hardening.conf

# Syntax being valid does not prove the settings are in effect (another file may
# still win, or the main sshd_config may not include the drop-in directory).
# sshd -T prints the effective configuration as lower-case "keyword value" lines.
sshd_effective=$(sshd -T 2>/dev/null || true)
for want in "passwordauthentication no" "permitrootlogin no" "pubkeyauthentication yes"; do
    grep -qix -- "$want" <<<"$sshd_effective" \
        || die "sshd hardening is not effective ('$want' not in 'sshd -T' output) — check the Include line in /etc/ssh/sshd_config"
done

systemctl restart ssh
ok "sshd hardened, root SSH disabled"

# === Step H: UFW (initial) ===
STEP="UFW initial"
log "Configuring UFW"

# Baseline policy: refuse unsolicited inbound traffic, permit everything outbound.
ufw default deny incoming >/dev/null
ufw default allow outgoing >/dev/null

# SSH uses "limit" rather than "allow": UFW then rate-limits repeated connections
# from one source address at the network layer, complementing fail2ban at the
# application layer. The rule is added before the firewall is switched on.
ufw limit 22/tcp comment 'SSH (rate-limited)' >/dev/null

# --force suppresses ufw's own interactive confirmation.
ufw --force enable >/dev/null 2>&1
ok "UFW active: deny in, allow out, 22/tcp rate-limited"

# === Step I: fail2ban ===
STEP="fail2ban"
log "Configuring fail2ban sshd jail"

# backend=systemd is required on Debian 12+ because /var/log/auth.log no longer exists
# by default — sshd events only reach journald. Without this line fail2ban fails to start.
cat > /etc/fail2ban/jail.d/sshd.local <<'EOF'
[sshd]
enabled = true
port = ssh
backend = systemd
maxretry = 3
findtime = 10m
bantime = 1h
EOF

# Enable for boot, then (re)start once so a running instance picks up the jail.
# Both calls are tolerant: the is-active check below decides what to report.
# An unguarded restart used to abort the script through the ERR trap before the
# warning branch could ever run.
systemctl enable fail2ban >/dev/null 2>&1 || true
systemctl restart fail2ban >/dev/null 2>&1 || true
sleep 2
if systemctl is-active fail2ban >/dev/null; then
    ok "fail2ban running"
else
    warn "fail2ban not active — check: systemctl status fail2ban"
fi

# === Step J: Docker daemon config ===
STEP="docker daemon config"
log "Writing /etc/docker/daemon.json (log rotation)"

# MUST be written BEFORE docker installs so the daemon picks it up on first start.
# Without this, json-file driver is unbounded — remnanode logs can fill disk in days
# on a busy node. journald cap (Step L) doesn't apply: Docker writes directly to
# /var/lib/docker/containers/<id>/*.log, not through journald.
#
# The desired content is staged in a temp file and compared first, so that
#   - an identical file (re-run) is left untouched,
#   - a different pre-existing file is backed up instead of silently lost,
#   - a daemon that is already running is restarted, since it would otherwise
#     never read the new file.
mkdir -p /etc/docker
daemon_json_tmp=$(mktemp)
cat > "$daemon_json_tmp" <<'EOF'
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  }
}
EOF

if [ -f /etc/docker/daemon.json ] && cmp -s "$daemon_json_tmp" /etc/docker/daemon.json; then
    rm -f "$daemon_json_tmp"
else
    if [ -f /etc/docker/daemon.json ]; then
        daemon_json_bak="/etc/docker/daemon.json.bak.$(date +%s)"
        cp -p /etc/docker/daemon.json "$daemon_json_bak"
        warn "existing /etc/docker/daemon.json replaced (backup: $daemon_json_bak)"
    fi
    install -m 0644 "$daemon_json_tmp" /etc/docker/daemon.json
    rm -f "$daemon_json_tmp"
    if systemctl is-active --quiet docker 2>/dev/null; then
        systemctl restart docker
        ok "docker daemon restarted to load the new configuration"
    fi
fi
ok "docker log rotation: 10M x 3 files per container"

# === Step K: Docker ===
STEP="docker install"
log "Installing Docker CE (official repo)"

if command -v docker >/dev/null 2>&1; then
    ok "Docker already installed: $(docker --version)"
else
    # Repository signing key, stored under /etc/apt/keyrings and made world-readable.
    docker_key=/etc/apt/keyrings/docker.asc
    install -m 0755 -d /etc/apt/keyrings
    curl -fsSL "https://download.docker.com/linux/$OS_ID/gpg" -o "$docker_key"
    chmod a+r "$docker_key"

    # Apt source pinned to this machine's architecture and to the key above.
    printf '%s\n' "deb [arch=$(dpkg --print-architecture) signed-by=$docker_key] https://download.docker.com/linux/$OS_ID $OS_CODENAME stable" \
        > /etc/apt/sources.list.d/docker.list

    apt-get update -qq
    apt-get install -y -qq \
        docker-ce docker-ce-cli containerd.io \
        docker-buildx-plugin docker-compose-plugin >/dev/null

    systemctl enable --now docker
    ok "Docker installed: $(docker --version)"
fi

# Add 'default' to the docker group (appended to its existing groups).
usermod --append --groups docker default

# === Step L: journald cap ===
STEP="journald"

# Drop-in that caps the journal's disk usage and reserves free space on the volume.
journald_dropin_dir=/etc/systemd/journald.conf.d
mkdir -p "$journald_dropin_dir"
cat <<'JOURNALD_CONF' > "$journald_dropin_dir/size.conf"
[Journal]
SystemMaxUse=500M
SystemKeepFree=1G
JOURNALD_CONF

# Restart so the running journald reads the new drop-in.
systemctl restart systemd-journald
ok "journald capped at 500M"

# === Step M: remnanode compose file ===
STEP="remnanode compose"
log "Writing /opt/remnanode/docker-compose.yml"

mkdir -p /opt/remnanode

# The file contains SECRET_KEY. Create it empty with mode 0600 BEFORE the secret
# is written: creating it via plain redirection and chmod-ing afterwards leaves
# a window in which the key sits in a file with default (world-readable)
# permissions. `install` replaces any existing file, so a loose mode left by an
# earlier run is corrected as well.
install -m 0600 /dev/null /opt/remnanode/docker-compose.yml

# SECRET_KEY written without surrounding quotes: in Docker Compose list-style env,
# quotes become part of the literal value. Remnawave's parser currently strips them
# on the server side, but that's not a stable contract — pass the raw base64 value.
# (Unquoted heredoc delimiter on purpose: $SECRET_KEY must expand here.)
cat > /opt/remnanode/docker-compose.yml <<COMPOSE_EOF
services:
  remnanode:
    container_name: remnanode
    hostname: remnanode
    image: remnawave/node:latest
    network_mode: host
    restart: always
    cap_add:
      - NET_ADMIN
    ulimits:
      nofile:
        soft: 1048576
        hard: 1048576
    environment:
      - NODE_PORT=2222
      - SECRET_KEY=$SECRET_KEY
COMPOSE_EOF
ok "compose file written"

# === Step N: UFW node ports ===
STEP="UFW node ports"
log "Opening firewall for node traffic"

# Control channel: port 2222/tcp, reachable from the master address only.
ufw allow from "$MASTER_IP" to any port 2222 proto tcp \
    comment 'Remnawave master gRPC' >/dev/null

# Data plane: port 443/tcp, open to any source.
ufw allow 443/tcp \
    comment 'VLESS Reality inbound' >/dev/null

ok "UFW: 2222/tcp from $MASTER_IP, 443/tcp anywhere"

# === Step O: pull + start container ===
STEP="container start"
log "Pulling image and starting container (30-60s on first run)"

# docker compose picks up docker-compose.yml from the working directory.
cd /opt/remnanode
docker compose pull --quiet
docker compose up --detach
ok "container started"

# === Step P: verification ===
STEP="verification"
log "Verifying node health"

# Signal hierarchy (replaces the old log-grep for "Master IP" which was
# fragile to upstream log format changes):
#
#   1. Port 2222 listening   — gRPC control plane is up. PRIMARY signal.
#                              If absent, the container failed hard — die.
#   2. Port 443 listening    — master has authenticated and pushed Xray config.
#                              Only happens after a successful handshake, so this
#                              is the DEFINITIVE signal that the node is healthy.
#                              May legitimately be absent if no Host is attached yet.

# port_listening PORT — return 0 when a TCP listener on PORT shows up in `ss -tln`.
# The listing is captured first and searched afterwards. Piping ss straight into
# `grep -q` is unsafe under `set -o pipefail`: grep exits on the first match, ss
# can then die from SIGPIPE, and the pipeline reports failure although the port
# was found.
port_listening() {
    local port="$1"
    local listing
    listing=$(ss -tln 2>/dev/null) || return 1
    grep -q ":${port} " <<<"$listing"
}

# Poll for up to 30s (10 rounds, 3s apart) for the control port to appear.
PORT_2222=0
for _ in 1 2 3 4 5 6 7 8 9 10; do
    sleep 3
    if port_listening 2222; then
        PORT_2222=1
        break
    fi
done

if [ "$PORT_2222" != "1" ]; then
    echo
    warn "port 2222 (gRPC) not listening after 30s — container failed to start"
    echo "=== last 50 lines of remnanode logs ==="
    docker logs remnanode --tail 50 2>&1 || true
    echo "=== end logs ==="
    die "remnanode did not come up. Check logs above for the root cause."
fi
ok "port 2222 (gRPC) listening — container is up"

if port_listening 443; then
    ok "port 443 (VLESS) listening — master authenticated and pushed Xray config"
else
    warn "port 443 not listening — master has not pushed a Host config yet"
    warn "expected if you haven't created a Host entry + attached it to a Squad in the panel"
fi

# NTP sync — Reality is TLS, TLS is clock-sensitive. Fresh boots may need a few
# seconds for timesyncd to do its first poll, so the state is polled a few times
# instead of being sampled once.

# ntp_synced — return 0 when timedatectl reports NTPSynchronized=yes.
ntp_synced() {
    [ "$(timedatectl show -p NTPSynchronized --value 2>/dev/null)" = "yes" ]
}

systemctl enable --now systemd-timesyncd >/dev/null 2>&1 || true

# Up to five rounds, two seconds apart; leave early as soon as the clock is synced.
ntp_round=0
while (( ntp_round < 5 )); do
    if ntp_synced; then
        break
    fi
    sleep 2
    ntp_round=$(( ntp_round + 1 ))
done

if ntp_synced; then
    ok "NTP synchronized"
else
    warn "NTP not synchronized after 10s — Reality TLS handshakes may fail"
    warn "check: timedatectl status"
fi

# === Summary ===
STEP="done"
# Unquoted heredoc: $PUBLIC_IP, $MASTER_IP and the docker version expand in place.
# The upstream-firewall note lists 2222/tcp as well: the master has to reach that
# port through any hoster-level firewall too, otherwise the node never connects.
cat <<SUMMARY

===================================================================
  Bootstrap complete
===================================================================
  Public IP    : $PUBLIC_IP
  VLESS port   : 443/tcp
  gRPC port    : 2222/tcp  (from $MASTER_IP only)
  SSH          : key-only, root login disabled
  Docker       : $(docker --version 2>/dev/null || echo 'unknown')

  Next steps in your Remnawave panel:
    1. Verify this node shows "Connected"
    2. Create (or edit) a Host entry:
         Address : $PUBLIC_IP
         Port    : 443
         Inbound : your VLESS Reality inbound
    3. Add that Host to your Default Squad so users see it in their subscription

  Useful commands on this box (as 'default' via SSH):
    sudo docker logs remnanode --tail 50
    sudo docker compose -f /opt/remnanode/docker-compose.yml restart
    sudo ufw status verbose

  Diagnostics if something broke:
    sudo journalctl -u docker --no-pager -n 50
    sudo docker exec remnanode sh -c 'tail -50 /var/log/supervisor/xray.out.log'

  !! If your hoster runs an upstream firewall separate from the OS
     (AWS Security Groups, GCP firewall, OVH IP firewall, Hetzner
     Robot firewall, Oracle Cloud NSG, some Scaleway plans, etc.)
     you must ALSO open 22/tcp and 443/tcp there, plus 2222/tcp
     from $MASTER_IP (master -> node gRPC). UFW on this box
     only controls the OS-level firewall and cannot see the hoster
     layer above it.
===================================================================
SUMMARY

# Kernel reboot notice — printed AFTER the summary so it's the last thing the
# operator sees.
#
# Two signals are checked:
#   1. /var/run/reboot-required — a flag file left behind by package hooks.
#      NOTE(review): the package shipping that hook may not have been installed
#      yet when the upgrade in Step B ran, so the file can be missing even after
#      a kernel upgrade — confirm on the target images.
#   2. needrestart's kernel status in batch/list mode (it only reports, nothing
#      is restarted): NEEDRESTART-KSTA 2 or 3 is taken to mean a newer kernel is
#      installed than the one running — TODO confirm against needrestart(1).
reboot_needed=0
if [ -f /var/run/reboot-required ]; then
    reboot_needed=1
elif command -v needrestart >/dev/null 2>&1; then
    kernel_status=$(needrestart -b -r l -k 2>/dev/null | sed -n 's/^NEEDRESTART-KSTA: *//p' || true)
    case "$kernel_status" in
        2|3) reboot_needed=1 ;;
    esac
fi

if [ "$reboot_needed" = "1" ]; then
    echo
    echo "!! Kernel was upgraded during bootstrap — reboot recommended: sudo reboot"
    echo "   The node will keep running on the old kernel until you reboot."
fi
