#ecs

netdata

developer

在服务器集群上部署netdata性能检测面板

本地部署

用apt安装就可以,然而如果你想用netdata cloud服务的话,必须使用官方的源。

配置文件/etc/netdata/netdata.conf,管理netdata的行为。

[global]
        run as user = netdata
        web files owner = root
        web files group = root
        # Netdata is not designed to be exposed to potentially hostile
        # networks. See https://github.com/netdata/netdata/issues/164
        bind socket to IP = 127.0.0.1
        hostname = jump-server-201

[web]
        bind to = *
        allow connections from = *
        allow dashboard from = 100.*

bind to指定netdata的web服务监听在本机的哪些地址(网卡)上,allow connections from / allow dashboard from则用来限制哪些来源IP可以访问。我这里写得很宽泛,因为安全通过ecs安全组实现,就没折腾配置和防火墙。下面是文档里的推荐写法:

# Allow private network access
[web]
    allow connections from = localhost 10.* 192.168.*
    allow dashboard from = localhost 10.* 192.168.*
    allow badges from = *
    allow streaming from = *
    allow management from = localhost

hostname就是最终面板中展现的主机名,如果不设置就是系统设置里的名字,云服务器最好弄一个不然太乱了。

多端结构

netdata使用parent-child的模式,通过apikey鉴权。配置文件都位于/etc/netdata/stream.conf。所谓apikey,用系统自带的uuidgen生成就好。

parent

[6608ee09-d5b1-4d9c-9b76-0b3a6c083762]
    # 启用
    enabled = yes

    # 子节点数据的默认保留长度与存储方式 (default history 仅在非dbengine的memory mode下生效)
    default history = 3600
    default memory mode = dbengine

    # 健康检查
    health enabled = yes

    # 允许从子节点接收的数据
    allow from = *

child

[stream]
    enabled = yes
    destination = ...:19999
    api key = 6608ee09-d5b1-4d9c-9b76-0b3a6c083762
    timeout seconds = 60
    default port = 19999
    send charts matching = *
    buffer size bytes = 1048576
    reconnect delay seconds = 5
    initial clock resync iterations = 60

批量部署

parent不需要什么,任务是把child的脚本分发下去。下面是claude写的脚本,很啰唆,很AI,但能用。

#!/bin/bash

# Netdata bulk deployment script (uses parallel-ssh / parallel-scp).
#
# Installs netdata on every host listed in HOSTS_FILE and configures each
# host as a streaming child that reports to the jump server (the parent).
#
# Required settings (edit below or export before running):
#   NETDATA_STREAM_API_KEY  API key that the parent accepts in its stream.conf
#   JUMPSERVER_IP           address of the parent the children stream to
# Optional settings:
#   HOSTS_FILE   pssh host list (default: test.txt)
#   STREAM_PORT  parent port (default: 19999)
#   SSH_USER     remote login user with sudo rights (default: ecs-user)

set -euo pipefail

# Configuration; every value may be overridden from the environment.
HOSTS_FILE="${HOSTS_FILE:-test.txt}"
NETDATA_STREAM_API_KEY="${NETDATA_STREAM_API_KEY:-}"
JUMPSERVER_IP="${JUMPSERVER_IP:-}"
STREAM_PORT="${STREAM_PORT:-19999}"
SSH_USER="${SSH_USER:-ecs-user}"
KEY_FILE="netdata_api_key.txt"

# Colored output
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m'

# Print an error message to stderr and abort.
die() {
    echo -e "${RED}错误: $*${NC}" >&2
    exit 1
}

# Fail early instead of deploying a stream.conf with an empty key/destination.
[[ -n "$NETDATA_STREAM_API_KEY" ]] || die "NETDATA_STREAM_API_KEY 未设置"
[[ -n "$JUMPSERVER_IP" ]] || die "JUMPSERVER_IP 未设置"
[[ -r "$HOSTS_FILE" ]] || die "无法读取主机列表文件: $HOSTS_FILE"
for tool in parallel-ssh parallel-scp; do
    command -v "$tool" >/dev/null 2>&1 || die "未找到命令: $tool"
done

echo -e "${GREEN}步骤 1: 创建netdata配置文件${NC}"

# Private, unpredictable staging directory; removed on every exit path
# because it holds the API key.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT

# Child-side stream.conf (unquoted heredoc: variables are expanded locally).
cat > "$TEMP_DIR/stream.conf" <<EOF
[stream]
    enabled = yes
    destination = ${JUMPSERVER_IP}:${STREAM_PORT}
    api key = ${NETDATA_STREAM_API_KEY}
    timeout seconds = 60
    default port = ${STREAM_PORT}
    send charts matching = *
    buffer size bytes = 1048576
    reconnect delay seconds = 5
    initial clock resync iterations = 60
EOF

echo -e "${GREEN}步骤 2: 创建安装脚本${NC}"

# Install script executed on each remote host (quoted heredoc: nothing is
# expanded locally, $(hostname) runs on the remote side).
cat > "$TEMP_DIR/install_netdata.sh" <<'EOFSCRIPT'
#!/bin/bash

set -e

echo "开始在 $(hostname) 上安装 Netdata..."

# apt-get has a stable scripting interface; noninteractive avoids prompts
# blocking the unattended ssh session.
sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y netdata

# Give netdata a moment to start
sleep 5

# Make sure the netdata config directory exists
sudo mkdir -p /etc/netdata

echo "Netdata 安装完成"
EOFSCRIPT

chmod +x "$TEMP_DIR/install_netdata.sh"

echo -e "${GREEN}步骤 3: 使用parallel-ssh批量安装netdata${NC}"

# Copy the install script to all hosts
parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 \
    "$TEMP_DIR/install_netdata.sh" /tmp/install_netdata.sh

# Run the installation (10 minute timeout per host)
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -t 600 -i \
    "bash /tmp/install_netdata.sh"

echo -e "${GREEN}步骤 4: 配置netdata stream${NC}"

# Copy the stream configuration to all hosts
parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 \
    "$TEMP_DIR/stream.conf" /tmp/stream.conf

# Install the configuration and restart netdata. The restart fallback is
# grouped in braces so it only runs when systemctl fails, not when an earlier
# copy/chown/chmod step failed. The staged files are removed afterwards
# because /tmp/stream.conf contains the API key.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -i \
    "sudo cp /tmp/stream.conf /etc/netdata/stream.conf && \
     sudo chown netdata:netdata /etc/netdata/stream.conf && \
     sudo chmod 644 /etc/netdata/stream.conf && \
     rm -f /tmp/stream.conf /tmp/install_netdata.sh && \
     { sudo systemctl restart netdata || sudo service netdata restart; }"

echo -e "${GREEN}步骤 5: 验证${NC}"

# Show the service status on each host. Same login user as every other step;
# the init system is chosen explicitly because the exit status of
# "status | head" is head's, so an "||" fallback would never trigger.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -i \
    "if command -v systemctl >/dev/null 2>&1; then \
         sudo systemctl status netdata --no-pager | head -5; \
     else \
         sudo service netdata status | head -5; \
     fi"

# Save the API key for later reference, readable by the current user only.
{
    echo "NETDATA_STREAM_API_KEY=${NETDATA_STREAM_API_KEY}"
    echo "JUMPSERVER_IP=${JUMPSERVER_IP}"
} > "$KEY_FILE"
chmod 600 "$KEY_FILE"

批量改名

成功之后看面板发现里面全是主机名,跟乱码一样,而且和大家熟悉的内网ip地址之间没有对应关系。其实只需要改[global]hostname就可以,但网上竟然没找到现成的帖子。

#!/bin/bash

# Bulk-rename netdata nodes: sets "hostname" in the [global] section of
# /etc/netdata/netdata.conf on every host to that host's first IP address,
# so the dashboard lists nodes by their internal IP.

set -uo pipefail

# Configuration
HOSTS_FILE="/home/ecs-user/deploynd/all.txt"
SSH_USER="ecs-user"
REMOTE_SCRIPT="/tmp/update_netdata_hostname.sh"

# Stage the remote script in an unpredictable temp file; always clean it up.
LOCAL_SCRIPT=$(mktemp) || exit 1
trap 'rm -f -- "$LOCAL_SCRIPT"' EXIT

# Script executed on each remote host (quoted heredoc: no local expansion).
cat > "$LOCAL_SCRIPT" << 'EOF'
#!/bin/bash
set -euo pipefail

CONF=/etc/netdata/netdata.conf

# First address reported by "hostname -I"
LOCAL_IP=$(hostname -I | awk '{print $1}')
if [[ -z "$LOCAL_IP" ]]; then
    echo "Could not determine local IP address" >&2
    exit 1
fi

# Back up the original configuration
cp -p "$CONF" "$CONF.bak"

NEW_CONF=$(mktemp)
trap 'rm -f -- "$NEW_CONF"' EXIT

# Rewrite the config so that [global] contains exactly one hostname line:
#  - emit it right after the [global] header,
#  - drop any existing (active or commented-out) hostname line in [global],
#  - leave keys in other sections (e.g. "registry hostname") untouched,
#  - append a [global] section if the file has none.
awk -v ip="$LOCAL_IP" '
    /^[ \t]*\[/ {
        in_global = ($0 ~ /^[ \t]*\[global\][ \t]*$/)
        print
        if (in_global && !done) {
            print "\thostname = " ip
            done = 1
        }
        next
    }
    in_global && /^[ \t]*#?[ \t]*hostname[ \t]*=/ { next }
    { print }
    END {
        if (!done) print "\n[global]\n\thostname = " ip
    }
' "$CONF" > "$NEW_CONF"

# Overwrite in place so the file keeps its owner and permissions.
cat "$NEW_CONF" > "$CONF"

# Restart netdata so the new hostname takes effect
systemctl restart netdata

echo "Configured hostname as: $LOCAL_IP"
EOF

chmod +x "$LOCAL_SCRIPT"

# Run on all hosts via parallel-ssh. Best effort: a failure on some hosts
# does not stop the remaining steps, but is reflected in the exit status.
status=0

parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" "$LOCAL_SCRIPT" "$REMOTE_SCRIPT" \
    || status=$?

# Remove the remote copy whether or not the update succeeded, while still
# reporting the update's own exit code.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -i \
    "sudo bash $REMOTE_SCRIPT; rc=\$?; rm -f $REMOTE_SCRIPT; exit \$rc" \
    || status=$?

exit "$status"