#ecs
netdata
developer
在服务器集群上部署netdata性能检测面板
本地部署
用apt安装就可以,然而如果你想用netdata cloud服务的话,必须使用官方的源。
配置文件/etc/netdata/netdata.conf,管理netdata的行为。
[global]
run as user = netdata
web files owner = root
web files group = root
# Netdata is not designed to be exposed to potentially hostile
# networks. See https://github.com/netdata/netdata/issues/164
bind socket to IP = 127.0.0.1
hostname = jump-server-201
[web]
bind to = *
allow connections from = *
allow dashboard from = 100.*
bind to指定netdata的web服务监听在哪些地址上,我这里写得很宽泛,因为安全通过ecs安全组实现,就没折腾配置和防火墙。下面是文档里的推荐写法
# Allow private network access
[web]
allow connections from = localhost 10.* 192.168.*
allow dashboard from = localhost 10.* 192.168.*
allow badges from = *
allow streaming from = *
allow management from = localhost
hostname就是最终面板中展现的主机名,如果不设置就是系统设置里的名字,云服务器最好弄一个不然太乱了。
多端结构
netdata使用parent-child的模式,通过apikey鉴权。配置文件都位于/etc/netdata/stream.conf。所谓apikey,用系统自带的uuidgen生成就好。
parent
[6608ee09-d5b1-4d9c-9b76-0b3a6c083762]
# 启用
enabled = yes
# 每个子节点默认保留的历史条目数(注意:这不是连接数)
default history = 3600
default memory mode = dbengine
# 健康检查
health enabled = yes
# 允许从子节点接收的数据
allow from = *
child
[stream]
enabled = yes
destination = ...:19999
api key = 6608ee09-d5b1-4d9c-9b76-0b3a6c083762
timeout seconds = 60
default port = 19999
send charts matching = *
buffer size bytes = 1048576
reconnect delay seconds = 5
initial clock resync iterations = 60
批量部署
parent不需要什么,任务是把child的脚本分发下去。下面是claude写的脚本,很啰唆,很AI,但能用。
#!/bin/bash
# Batch-deploy Netdata with parallel-ssh.
#
# Installs netdata on every host listed in HOSTS_FILE and configures each one
# as a streaming child that reports to the jump server (the parent).
#
# Required settings (edit below, or export before running):
#   NETDATA_STREAM_API_KEY  API key accepted by the parent's stream.conf
#   JUMPSERVER_IP           address of the parent node
# Optional settings: HOSTS_FILE, SSH_USER, STREAM_PORT.
set -euo pipefail

# Configuration. Values already present in the environment take precedence.
HOSTS_FILE="${HOSTS_FILE:-test.txt}"
SSH_USER="${SSH_USER:-ecs-user}"
NETDATA_STREAM_API_KEY="${NETDATA_STREAM_API_KEY:-}"
JUMPSERVER_IP="${JUMPSERVER_IP:-}"
# shellcheck disable=SC2034  # kept as a documented setting; not read below
NETDATA_PORT="${NETDATA_PORT:-19999}"
STREAM_PORT="${STREAM_PORT:-19999}"

# Colored output
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m'

# Print a green step banner.
step() { printf '%b%s%b\n' "$GREEN" "$1" "$NC"; }

# Print a red error message to stderr and abort.
die() {
  printf '%b%s%b\n' "$RED" "$*" "$NC" >&2
  exit 1
}

# Fail early: an empty key or address would otherwise be deployed as a
# syntactically valid but useless stream.conf ("destination = :19999").
[[ -n "$NETDATA_STREAM_API_KEY" ]] || die "NETDATA_STREAM_API_KEY 未设置"
[[ -n "$JUMPSERVER_IP" ]] || die "JUMPSERVER_IP 未设置"
[[ -r "$HOSTS_FILE" ]] || die "无法读取主机列表文件: $HOSTS_FILE"
command -v parallel-ssh >/dev/null 2>&1 || die "未找到 parallel-ssh"
command -v parallel-scp >/dev/null 2>&1 || die "未找到 parallel-scp"

step "步骤 1: 创建netdata配置文件"

# Private scratch directory (it will hold the API key); removed on any exit.
TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/netdata_deploy.XXXXXX")"
trap 'rm -rf -- "$TEMP_DIR"' EXIT

# stream.conf for the children; the variables are expanded locally.
cat > "$TEMP_DIR/stream.conf" <<EOF
[stream]
enabled = yes
destination = ${JUMPSERVER_IP}:${STREAM_PORT}
api key = ${NETDATA_STREAM_API_KEY}
timeout seconds = 60
default port = ${STREAM_PORT}
send charts matching = *
buffer size bytes = 1048576
reconnect delay seconds = 5
initial clock resync iterations = 60
EOF

step "步骤 2: 创建安装脚本"

# Install script executed on each remote host. The quoted delimiter keeps
# $(hostname) unexpanded here so that it is evaluated on the remote side.
cat > "$TEMP_DIR/install_netdata.sh" <<'EOFSCRIPT'
#!/bin/bash
set -e
echo "开始在 $(hostname) 上安装 Netdata..."
# apt-get rather than apt: apt's command-line interface is not meant for scripts.
sudo apt-get install -y netdata
# Give the service a moment to come up.
sleep 5
# Make sure the netdata configuration directory exists.
sudo mkdir -p /etc/netdata
echo "Netdata 安装完成"
EOFSCRIPT
chmod +x "$TEMP_DIR/install_netdata.sh"

step "步骤 3: 使用parallel-ssh批量安装netdata"

# Copy the install script to every host.
parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 \
  "$TEMP_DIR/install_netdata.sh" /tmp/install_netdata.sh

# Run it, then remove the remote copy while preserving the install's status.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -t 600 -i \
  'bash /tmp/install_netdata.sh; rc=$?; rm -f /tmp/install_netdata.sh; exit "$rc"'

step "步骤 4: 配置netdata stream"

# Copy the stream configuration to every host.
parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 \
  "$TEMP_DIR/stream.conf" /tmp/stream.conf

# Install the configuration and restart netdata. The braces confine the
# "service" fallback to a failed systemctl restart; without them it would also
# run after a failed cp/chown/chmod. The copy in /tmp holds the API key, so it
# is removed once installed.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -i \
  "sudo cp /tmp/stream.conf /etc/netdata/stream.conf && \
sudo chown netdata:netdata /etc/netdata/stream.conf && \
sudo chmod 644 /etc/netdata/stream.conf && \
rm -f /tmp/stream.conf && \
{ sudo systemctl restart netdata || sudo service netdata restart; }"

step "步骤 5: 验证"

# Show the service status on every host. Uses the same login as the other
# steps, and picks the service manager explicitly: in "a | head || b" the
# fallback never runs because the pipeline's status is head's.
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -p 10 -i \
  'if command -v systemctl >/dev/null 2>&1; then sudo systemctl status netdata --no-pager | head -5; else sudo service netdata status | head -5; fi'

# Save the API key and parent address for later reference. The file holds a
# secret, so it is created and kept readable by the owner only.
umask 077
{
  echo "NETDATA_STREAM_API_KEY=${NETDATA_STREAM_API_KEY}"
  echo "JUMPSERVER_IP=${JUMPSERVER_IP}"
} > netdata_api_key.txt
chmod 600 netdata_api_key.txt
批量改名
成功之后看面板发现里面全是主机名,跟乱码一样,而且和大家熟悉的内网ip地址之间没有对应关系。其实只需要改[global]hostname就可以,但网上竟然没找到现成的帖子。
#!/bin/bash
# Batch-rename Netdata nodes: sets "hostname" in the [global] section of
# /etc/netdata/netdata.conf on every host to that host's first IP address,
# then restarts netdata.
set -euo pipefail

# Configuration
HOSTS_FILE="/home/ecs-user/deploynd/all.txt"
SSH_USER="ecs-user"
REMOTE_SCRIPT="/tmp/update_netdata_hostname.sh"

[[ -r "$HOSTS_FILE" ]] || {
  echo "Cannot read hosts file: $HOSTS_FILE" >&2
  exit 1
}

# Build the remote script in a private temp file; removed on any exit.
LOCAL_SCRIPT="$(mktemp "${TMPDIR:-/tmp}/update_netdata_hostname.XXXXXX")"
trap 'rm -f -- "$LOCAL_SCRIPT"' EXIT

# The quoted delimiter keeps everything below literal; it runs on the remote
# host (as root, via "sudo bash").
cat > "$LOCAL_SCRIPT" << 'EOF'
#!/bin/bash
set -euo pipefail

CONF=/etc/netdata/netdata.conf

# First address reported by "hostname -I".
LOCAL_IP=$(hostname -I | awk '{print $1}')
if [[ -z "$LOCAL_IP" ]]; then
  echo "Could not determine local IP address" >&2
  exit 1
fi
if [[ ! -f "$CONF" ]]; then
  echo "Missing $CONF" >&2
  exit 1
fi

# Back up the current configuration.
cp -p "$CONF" "$CONF.bak"

NEW_CONF=$(mktemp)
trap 'rm -f -- "$NEW_CONF"' EXIT

# Rewrite the file so that [global] contains exactly one active hostname line:
#   - an existing "hostname = ..." inside [global] is replaced;
#   - if [global] has none, the line is added at the end of that section;
#   - if there is no [global] section at all, one is appended.
# Lines in other sections (e.g. "registry hostname = ...") and commented-out
# lines are left untouched.
awk -v ip="$LOCAL_IP" '
  function emit() { print "\thostname = " ip; written = 1 }
  /^[ \t]*\[/ {
    if (in_global && !written) emit()
    in_global = ($0 ~ /^[ \t]*\[global\][ \t]*$/)
    print
    next
  }
  in_global && /^[ \t]*hostname[ \t]*=/ {
    if (!written) emit()
    next
  }
  { print }
  END {
    if (!written) {
      if (!in_global) print "[global]"
      emit()
    }
  }
' "$CONF" > "$NEW_CONF"

# Write back through redirection so the original file keeps its owner and mode.
cat "$NEW_CONF" > "$CONF"

# Restart netdata so the new hostname takes effect.
systemctl restart netdata
echo "Configured hostname as: $LOCAL_IP"
EOF

# Distribute and run the script on all hosts; the remote copy is removed
# afterwards while the script's own exit status is preserved.
parallel-scp -h "$HOSTS_FILE" -l "$SSH_USER" "$LOCAL_SCRIPT" "$REMOTE_SCRIPT"
parallel-ssh -h "$HOSTS_FILE" -l "$SSH_USER" -i \
  "sudo bash $REMOTE_SCRIPT; rc=\$?; rm -f $REMOTE_SCRIPT; exit \$rc"