diff --git a/.gitmodules b/.gitmodules index 7c9dcff1f0..f9182f0090 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,4 +5,4 @@ [submodule "eggroll"] path = eggroll url = https://github.com/WeBankFinTech/eggroll.git - branch = v2.0.1 + branch = release-2.0.2-build-6 diff --git a/.travis.yml b/.travis.yml index 9054c89616..f2e6d4a696 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,9 @@ matrix: - env: OS='centos:7' script: - - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - - tar -xzf standalone-fate-master-1.4.4.tar.gz - - cd standalone-fate-master-1.4.4 + - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + - tar -xzf standalone-fate-master-1.4.5.tar.gz + - cd standalone-fate-master-1.4.5 - sed -i.bak "s/sh service.sh/bash service.sh/g" init.sh - source init.sh init - ls -alh diff --git a/arch/api/impl/based_2x/federation.py b/arch/api/impl/based_2x/federation.py index b19bd6eed2..e4fcb99171 100644 --- a/arch/api/impl/based_2x/federation.py +++ b/arch/api/impl/based_2x/federation.py @@ -135,10 +135,18 @@ def done_callback(fut): except Exception as e: import os import signal + import traceback + import logging + import sys + exc_info = sys.exc_info() + traceback.print_exception(*exc_info) pid = os.getpid() LOGGER.exception(f"remote fail, terminating process(pid={pid})") - os.kill(pid, signal.SIGTERM) - raise e + try: + logging.shutdown() + finally: + os.kill(pid, signal.SIGTERM) + raise e for future in futures: future.add_done_callback(done_callback) diff --git a/cluster-deploy/README.md b/cluster-deploy/README.md index 2aeca269f8..53fff6d7cb 100644 --- a/cluster-deploy/README.md +++ b/cluster-deploy/README.md @@ -199,10 +199,10 @@ Execute under the app user of the target server (192.168.0.1 has an external net ``` mkdir -p /data/projects/install cd /data/projects/install -wget 
https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #Send to 192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -355,7 +355,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3 tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl -pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index +pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #The result should be 161 ``` diff --git a/cluster-deploy/README.rst b/cluster-deploy/README.rst index f432d8fbba..40ec8c672f 100644 --- a/cluster-deploy/README.rst +++ b/cluster-deploy/README.rst @@ -239,10 +239,10 @@ external network environment): mkdir -p /data/projects/install cd /data/projects/install - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz + wget 
https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #Send to 192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -400,7 +400,7 @@ external network environment): tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl - pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index + pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #The result should be 161 diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md index 676ce535ff..57c955ae62 100644 --- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md +++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md @@ -255,8 +255,8 @@ Swap: 131071 0 131071 ``` cd /data/projects/ -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.4-release-c7-u18.tar.gz -tar xzf fate-cluster-install-1.4.4-release-c7-u18.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz +tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz ``` ## 5.2 部署前检查 @@ -292,7 +292,7 @@ vi fate-cluster-install/allInone/conf/setup.conf | 配置项 | 配置项值 | 说明 | | ---------------- | --------------------------------------------- | ------------------------------------------------------------ | | roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 | -| version | 默认:1.4.4 | Fate 版本号 | +| version | 默认:1.4.5 | Fate 版本号 | | pbase | 默认: /data/projects | 项目根目录 | | lbase | 默认:/data/logs | 保持默认不要修改 | | ssh_user | 默认:app | ssh连接目标机器的用户,也是部署后文件的属主 | @@ -321,7 +321,7 @@ vi fate-cluster-install/allInone/conf/setup.conf #to install 
role roles=( "host" "guest" ) -version="1.4.4" +version="1.4.5" #project base pbase="/data/projects" @@ -377,7 +377,7 @@ basemodules=( "base" "java" "python" "eggroll" "fate" ) #to install role roles=( "host" ) -version="1.4.4" +version="1.4.5" #project base pbase="/data/projects" diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst index b21fd57593..fa739a5d1f 100644 --- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst +++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst @@ -275,8 +275,8 @@ ssh app@192.168.0.2 :: cd /data/projects/ - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.4-release-c7-u18.tar.gz - tar xzf fate-cluster-install-1.4.4-release-c7-u18.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz + tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz 5.2 部署前检查 -------------- @@ -314,7 +314,7 @@ ssh app@192.168.0.2 +======================+=================================================+==============================================================================+ | roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 | +----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ -| version | 默认:1.4.4 | Fate 版本号 | +| version | 默认:1.4.5 | Fate 版本号 | +----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ | pbase | 默认: /data/projects | 项目根目录 | +----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ @@ -366,7 +366,7 @@ ssh app@192.168.0.2 #to install role roles=( "host" "guest" ) - version="1.4.4" + version="1.4.5" #project base 
pbase="/data/projects" @@ -421,7 +421,7 @@ ssh app@192.168.0.2 #to install role roles=( "host" ) - version="1.4.4" + version="1.4.5" #project base pbase="/data/projects" diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md index 7d5678fa36..4980426d29 100644 --- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md +++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md @@ -146,7 +146,7 @@ fi ``` cd /data/projects/install wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz ``` ## 5.2 操作系统参数检查 diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst index 47297f85f5..afc58587bc 100644 --- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst +++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst @@ -160,7 +160,7 @@ ufw status cd /data/projects/install wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz 5.2 操作系统参数检查 -------------------- diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index bb134c4b21..a716e6ca02 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -40,10 +40,12 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 本示例是每端只有一台主机,每端可以多台主机,目前只支持nodemanager多节点部署,其他组件都是单节点。 -| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 | -| ----- | ------- | --------------------- | 
----------------------- | -------- | ---- | ------------------------------------------------------------ | -| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | -| guest | 9999 | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 | +| ----- | ------- | --------------------- | ---------- | -------- | ---- | ------------------------------------------------------------ | +| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | +| guest | 9999 | 192.168.0.2 | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | + +备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 @@ -51,7 +53,7 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 | -------- | ------------------------------------------------------------ | | 主机配置 | 不低于8C16G500G,千兆网卡 | | 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | -| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor | | 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | | 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | | 虚拟内存 | 不低于128G | @@ -100,8 +102,6 @@ vim /etc/hosts centos系统执行:rpm -qa | grep selinux -ubuntu系统执行:apt list --installed | grep selinux - 如果已安装了selinux就执行:setenforce 0 3.3 修改Linux系统参数 @@ -109,22 +109,32 @@ ubuntu系统执行:apt list --installed | grep selinux **在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** -1)vim /etc/security/limits.conf +1)清理20-nproc.conf文件 + +cd /etc/security/limits.d + +ls -lrt 20-nproc.conf + +存在则:mv 20-nproc.conf 20-nproc.conf_bak + +2)vim /etc/security/limits.conf \* soft nofile 65535 \* hard nofile 65535 -2)vim /etc/security/limits.d/20-nproc.conf +\* soft nproc 65535 + +\* hard nproc 65535 -\* soft nproc unlimited +重新登陆,ulimit -a查看是否生效 3.4 关闭防火墙 -------------- **在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** -如果是Centos系统: +Centos系统: systemctl disable firewalld.service @@ -132,12 +142,6 @@ systemctl stop firewalld.service systemctl status firewalld.service -如果是Ubuntu系统: - -ufw disable - -ufw status - 3.5 软件环境初始化 ------------------ @@ -165,6 +169,8 @@ Defaults !env_reset **3)配置ssh无密登录** +**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。** + **a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** su app @@ -206,6 +212,8 @@ ssh app\@192.168.0.2 生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 +注意:dd执行时间较长,请耐心等待 + ``` cd /data dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 @@ -215,18 +223,21 @@ cat /proc/swaps echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` -## 3.7 安装ansible +## 3.7 安装依赖包 -**目标服务器(192.168.0.1) root用户执行** +**目标服务器(192.168.0.1 192.168.0.2)root用户执行** ``` -#判断是否已安装ansible -ansible --version -#没有则执行 -yum install -y ansible -``` - +#安装基础依赖包 +yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel +#如果有报错,需要解决yum源问题。 +#安装ansible和进程管理依赖包 +yum install -y ansible jq supervisor +#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行: +yum install -y epel-release +#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor +``` 4.项目部署 ========== @@ -242,16 +253,16 @@ yum install -y ansible **在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** ``` -#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置 cat /proc/swaps Filename Type Size Used Priority /data/swapfile128G file 134217724 384 -1 -#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置 ulimit -n 65535 -#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +#用户进程数,不低于64000,如不满足需参考3.3章节重新设置 ulimit -u 65535 @@ -261,6 +272,7 @@ ps -ef| grep -i fate netstat -tlnp | grep 4670 netstat -tlnp | grep 4671 netstat -tlnp | grep 9370 +netstat -tlnp | grep 9371 netstat -tlnp | grep 9360 netstat -tlnp | grep 8080 netstat -tlnp | grep 3306 @@ -284,8 +296,8 @@ ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf ``` #注意:URL链接有换行,拷贝的时候注意整理成一行 cd /data/projects/ -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.4_release-1.0.0.tar.gz -tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz +tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz ``` ### 4.4 配置文件修改和示例 
@@ -295,9 +307,6 @@ tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz ``` cd ansible-nfate-* #init.sh文件不需要修改,主要是辅助生成一些配置文件 - -#测试环境加test参数执行 - sh ./tools/init.sh test #生产环境加prod参数执行 sh ./tools/init.sh prod @@ -309,6 +318,44 @@ init var_files/prod init project_prod.yml ``` +### 4.4.2 证书制作配置(可选) + +1)证书制作 + +``` +vi /data/projects/ansible-nfate-1.*/tools/make.sh + +#1、自定义安全证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。 +#2、安全证书支持如下部署方式: + 1)部署host+guest,host和guest使用安全证书通讯。 + 2)部署host+exchange+guest,其中host和exchange使用安全证书通讯,guest和exchange普通通讯。 + 3)部署host+exchange+guest,其中guest和exchange使用安全证书通讯,host和exchange普通通讯。 + +guest_host="192.168.0.1" ---根据实际IP修改 +host_host="192.168.0.2" ---根据实际IP修改 +exchange_host="192.168.0.88" ---根据实际IP修改,本示例不部署无需修改 +``` + +2)执行脚本制作证书 + +``` +cd tools +sh ./make.sh + +在keys/host,guest目录下会产生证书文件。 +``` + +3)拷贝证书到部署目录 + +``` +sh cp-keys.sh host guest + +证书文件会拷贝到roles/eggroll/files/keys目录 + +特别说明: +1、目前脚本部署只支持2方设置证书认证。(host&guest、host&exchange、guest&exchange) +``` + #### 4.4.2 修改配置文件 **1)修改初始化主机IP** @@ -317,10 +364,13 @@ init project_prod.yml vi /data/projects/ansible-nfate-1.*/environments/prod/hosts #ansible格式配置文件 -[init] ---把需要部署的主机IP填入init组 +[fate] ---把需要部署的主机IP填入fate组 192.168.0.1 192.168.0.2 +[deploy_check] ---把执行ansible的本机IP填入deploy_check组 +192.168.0.1 + [all:vars] ansible_connection=ssh ansible_ssh_port=22 ---根据实际情况修改 @@ -344,6 +394,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改host方参数** +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署host方则不用修改 #除了nodemanger可以设置多个IP外,其他都是单个IP @@ -355,13 +407,17 @@ host: enable: True ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.1 - port: 9370 + port: 9370 ---grpc端口 + secure_port: 9371 ---grpcs端口 pool_size: 600 ---线程池大小 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G - default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + 
server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 + default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.3 ---exchange或者对端party rollsite IP - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + ip: 192.168.0.2 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default ip: 192.168.0.1 @@ -415,6 +471,8 @@ host: **4)修改guest参数** +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署guest方则不用修改 #除了nodemanger可以设置多个IP外,其他都是单个IP @@ -426,13 +484,17 @@ guest: enable: True ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.2 - port: 9370 + port: 9370 ---grpc端口 + secure_port: 9371 ---grpcs端口 pool_size: 600 ---线程池大小 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G - default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 + default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.3 ---exchange或者对端party rollsite IP - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + ip: 192.168.0.1 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 rules: ---本party自身路由配置 - name: default ip: 192.168.0.2 @@ -486,33 +548,38 @@ guest: **5)修改exchange参数** +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署exchange则不需要修改 vi 
/data/projects/ansible-nfate-1.*/var_files/prod/fate_exchange exchange: - enable: True + enable: False --部署exchange需修改为True rollsite: ips: - - 192.168.0.3 + - 192.168.0.88 port: 9370 + secure_port: 9371 ---grpcs端口 pool_size: 600 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 partys: ---指向各party的路由配置 - id: 10000 rules: - name: default ip: 192.168.0.1 - port: 9367 + port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371; + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 - id: 9999 rules: - name: default ip: 192.168.0.2 - port: 9370 + port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371; + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 ``` - - ### 4.5 部署 按照上述配置含义修改对应的配置项后,然后执行部署脚本: @@ -520,11 +587,10 @@ exchange: ``` #相对ansible-nfate-*目录 cd /data/projects/ansible-nfate-1.* -#测试环境加test参数执行 - nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & #生产环境加prod参数执行 nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & + ``` 部署日志输出在logs目录下,实时查看是否有报错: @@ -551,6 +617,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件 ---/data/projects/common目录存在,需要mv。 ``` +fateflow部署完重启: + +``` +#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow +``` + ### 4.6 问题定位 1)eggroll日志 @@ -593,6 +669,8 @@ cd /data/projects/fate/python/examples/toy_example/ python run_toy_example.py 10000 10000 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" @@ -605,6 +683,8 @@ cd 
/data/projects/fate/python/examples/toy_example/ python run_toy_example.py 9999 9999 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" @@ -682,18 +762,27 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以 cd /data/projects/common/supervisord ``` -启动/关闭/查看所有: +启动/关闭/重启/查看所有: ``` -sh service.sh start/stop/status all +#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件 +sh service.sh start/stop/restart/status all + +#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重启fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow ``` -启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): +启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): ``` -sh service.sh start/stop/status fate-clustermanager +sh service.sh start/stop/restart/status fate-clustermanager ``` + + ## 6.2 查看进程和端口 **在目标服务器(192.168.0.1 192.168.0.2 )app用户下执行** diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 7b2d6e0926..9312f7e0bc 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -37,18 +37,20 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 ## 2.1.部署规划 -| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 | -| ----- | ---------------------- | --------------------- | ----------------------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- | -| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | -| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 16C32G | 2T | | | nodemanger | +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 | +|
----- | ---------------------- | --------------------- | ---------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- | +| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | +| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2 | 16C32G | 2T | | | nodemanger | + +备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 | **类别** | **说明** | | -------- | ------------------------------------------------------------ | | 主机配置 | 不低于8C16G500G,千兆网卡 | -| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | -| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 操作系统 | CentOS linux 7.2及以上同时低于8 | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor | | 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | | 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | | 虚拟内存 | 不低于128G | @@ -97,8 +99,6 @@ vim /etc/hosts centos系统执行:rpm -qa | grep selinux -ubuntu系统执行:apt list --installed | grep selinux - 如果已安装了selinux就执行:setenforce 0 3.3 修改Linux系统参数 @@ -106,35 +106,37 @@ ubuntu系统执行:apt list --installed | grep selinux **在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** -1)vim /etc/security/limits.conf +1)清理20-nproc.conf文件 + +cd /etc/security/limits.d + +ls -lrt 20-nproc.conf + +存在则:mv 20-nproc.conf 20-nproc.conf_bak + +2)vim /etc/security/limits.conf \* soft nofile 65535 \* hard nofile 65535 -2)vim /etc/security/limits.d/20-nproc.conf +\* soft nproc 65535 -\* soft nproc unlimited +\* hard nproc 65535 + +重新登陆,ulimit -a查看是否生效 3.4 关闭防火墙 -------------- **在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** -如果是Centos系统: - systemctl disable firewalld.service systemctl stop firewalld.service systemctl status firewalld.service -如果是Ubuntu系统: - -ufw disable - -ufw status - 3.5 软件环境初始化 ------------------ @@ -162,6 +164,8 @@ Defaults !env_reset **3)配置ssh无密登录** +**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。** + **a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** su app @@ -203,6 +207,8 @@ ssh app\@192.168.0.2 生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 +注意:dd执行时间较长,请耐心等待 + ``` cd /data dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 @@ -212,15 +218,20 @@ cat /proc/swaps echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` -## 3.7 安装ansible +## 3.7 安装依赖包 -**目标服务器(192.168.0.1) root用户执行** +**目标服务器(192.168.0.1 192.168.0.2) root用户执行** ``` -#判断是否已安装ansible -ansible --version -#没有则执行 -yum install -y ansible +#安装基础依赖包 +yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel +#如果有报错,需要解决yum源问题。 + +#安装ansible和进程管理依赖包 +yum install -y ansible jq supervisor +#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行: +yum install -y epel-release +#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor ``` 4 项目部署 @@ -238,16 +249,16 @@ yum install -y ansible **在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** ``` -#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置 cat /proc/swaps Filename Type Size Used Priority /data/swapfile128G file 134217724 384 -1 -#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置 ulimit -n 65535 -#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +#用户进程数,不低于64000,如不满足需参考3.3章节重新设置 ulimit -u 65535 @@ -257,6 +268,7 @@ ps -ef| grep -i fate netstat -tlnp | grep 4670 netstat -tlnp | grep 4671 netstat -tlnp | grep 9370 +netstat -tlnp | grep 9371 netstat -tlnp | grep 9360 netstat -tlnp | grep 8080 netstat -tlnp | grep 3306 @@ -281,8 +293,8 @@ ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf ``` #注意:URL链接有换行,拷贝的时候注意整理成一行 cd /data/projects/ -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.4_release-1.0.0.tar.gz -tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz +tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz ``` 4.4 配置文件修改和示例 @@ -293,9 +305,6 @@ 
tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz ``` cd ansible-nfate-* #init.sh文件不需要修改,主要是辅助生成一些配置文件 - -#测试环境加test参数执行 - sh ./tools/init.sh test #生产环境加prod参数执行 sh ./tools/init.sh prod @@ -307,6 +316,25 @@ init var_files/prod init project_prod.yml ``` +### 4.4.2 证书部署前配置(可选) + +1)联系webank获取guest端部署证书文件。 + +2)放置到部署目录 + +``` +cd /data/projects/ansible-nfate-* +mkdir -p roles/eggroll/files/keys/guest +cd roles/eggroll/files/keys/guest +把获取到证书文件解压缩并放置到此目录下,如下: +-rw-r--r-- 1 app apps 1371 Sep 4 18:07 guest-ca.pem +-rw-r--r-- 1 app apps 241 Sep 4 18:07 guest-server.key +-rw-r--r-- 1 app apps 1151 Sep 4 18:07 guest-server.pem +-rw-r--r-- 1 app apps 1371 Sep 4 18:07 host-client-ca.pem +-rw-r--r-- 1 app apps 241 Sep 4 18:07 host-client.key +-rw-r--r-- 1 app apps 1143 Sep 4 18:07 host-client.pem +``` + ### 4.4.2 修改配置文件 **1)修改初始化主机IP** @@ -315,10 +343,13 @@ init project_prod.yml vi /data/projects/ansible-nfate-1.*/environments/prod/hosts #ansible格式配置文件 -[init] ---把需要部署的主机IP填入init组 +[fate] ---把需要部署的主机IP填入fate组 192.168.0.1 192.168.0.2 +[deploy_check] ---把执行ansible的本机IP填入deploy_check组 +192.168.0.1 + [all:vars] ansible_connection=ssh ansible_ssh_port=22 ---根据实际主机ssh协议端口修改 @@ -342,6 +373,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改guest参数** +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #除了nodemanger可以设置多个IP外,其他都是单个IP vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_guest @@ -352,13 +385,17 @@ guest: enable: True ---是否部署rollsite模块,True为部署,False为否 ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.1 - port: 9370 ---rollsite端口 + port: 9370 ---rollsite grpc端口 + secure_port: 9371 ---rollsite grpcs端口 pool_size: 600 ---线程池大小 - max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: 12G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: 
---默认路由,本party指向exchange或者其他party的IP,端口 - name: default ---名称,默认即可 - ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改 - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,和webank确认后修改 + ip: 192.168.0.88 ---exchange或者对端party rollsite IP,和webank确认后修改。 + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;和webank确认后修改。 + is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default ---本party rollsite所在主机IP和端口 ip: 192.168.0.1 @@ -419,8 +456,6 @@ guest: ``` #相对ansible-nfate-*目录 cd /data/projects/ansible-nfate-1.* -#测试环境加test参数执行 - nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & #生产环境加prod参数执行 nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & @@ -450,6 +485,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件 ---/data/projects/common目录存在,需要mv。 ``` +fateflow部署完重启: + +``` +#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow +``` + ## 4.6 问题定位 1)eggroll日志 @@ -492,6 +537,8 @@ cd /data/projects/fate/python/examples/toy_example/ python run_toy_example.py 9999 9999 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" @@ -569,16 +616,23 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以 cd /data/projects/common/supervisord ``` -启动/关闭/查看所有: +启动/关闭/重启/查看所有: ``` -sh service.sh start/stop/status all +#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件 +sh service.sh start/stop/restart/status all + +#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow ``` -启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): 
+启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): ``` -sh service.sh start/stop/status fate-clustermanager +sh service.sh start/stop/restart/status fate-clustermanager ``` ## 6.2 查看进程和端口 diff --git a/cluster-deploy/doc/Fate_step_by_step_install_zh.md b/cluster-deploy/doc/Fate_step_by_step_install_zh.md index d5d1b0efe5..fa1a0e8821 100644 --- a/cluster-deploy/doc/Fate_step_by_step_install_zh.md +++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.md @@ -180,10 +180,10 @@ echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` mkdir -p /data/projects/install cd /data/projects/install -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.4-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.4-release.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #传输到192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -336,7 +336,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3 tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl -pip install -r pip-packages-fate-1.4.4/requirements.txt -f ./pip-packages-fate-1.4.4 --no-index +pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #结果应为161 ``` diff --git a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst index 8d6b638b3a..f8f5c342fb 
100644 --- a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst +++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst @@ -202,10 +202,10 @@ ufw status mkdir -p /data/projects/install cd /data/projects/install - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.4-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.4-release.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #传输到192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -359,7 +359,7 @@ ufw status tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl - pip install -r pip-packages-fate-1.4.4/requirements.txt -f ./pip-packages-fate-1.4.4 --no-index + pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #结果应为161 diff --git a/fate.env b/fate.env index e46d4e96e7..0f3970743b 100755 --- a/fate.env +++ b/fate.env @@ -1,4 +1,4 @@ -FATE=1.4.4 +FATE=1.4.5 CENTOS=7.2 UBUNTU=16.04 PYTHON=3.6.5 diff --git a/fate_flow/db/db_models.py b/fate_flow/db/db_models.py index 46ecd4a515..abb8ffb488 100644 --- a/fate_flow/db/db_models.py +++ b/fate_flow/db/db_models.py @@ -186,8 +186,8 @@ class DataView(DataBaseModel): f_task_id = CharField(max_length=100) f_type = CharField(max_length=50, null=True) f_ttl = IntegerField(default=0) - f_party_model_id = CharField(max_length=100, null=True) - f_model_version = CharField(max_length=100, null=True) + 
f_party_model_id = CharField(max_length=200, null=True) + f_model_version = CharField(max_length=200, null=True) f_size = BigIntegerField(default=0) f_description = TextField(null=True, default='') f_tag = CharField(max_length=50, null=True, index=True, default='') diff --git a/standalone-deploy/README.md b/standalone-deploy/README.md index abd2990070..d7d6face52 100644 --- a/standalone-deploy/README.md +++ b/standalone-deploy/README.md @@ -22,11 +22,11 @@ It is strongly recommended to use docker, which greatly reduces the possibility ``` #Get code -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz -tar -xzvf docker_standalone-fate-1.4.4.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz +tar -xzvf docker_standalone-fate-1.4.5.tar.gz #Execute the command -cd docker_standalone-fate-1.4.4 +cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh ``` @@ -82,14 +82,14 @@ Http://hostip:8080. 2. Download the compressed package of stand-alone version and decompress it. ``` - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz ``` 3. Enter FATE directory and execute the init.sh. ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init ``` @@ -98,7 +98,7 @@ Http://hostip:8080. - Unit Test ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh ``` @@ -111,7 +111,7 @@ Http://hostip:8080. 
- Toy_example Test ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 ``` diff --git a/standalone-deploy/README.rst b/standalone-deploy/README.rst index c664644200..aa76567a07 100644 --- a/standalone-deploy/README.rst +++ b/standalone-deploy/README.rst @@ -36,11 +36,11 @@ possibility of encountering problems. :: #Get code - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz - tar -xzvf docker_standalone-fate-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz + tar -xzvf docker_standalone-fate-1.4.5.tar.gz #Execute the command - cd docker_standalone-fate-1.4.4 + cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh @@ -98,14 +98,14 @@ Http://hostip:8080. :: - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz 3. Enter FATE directory and execute the init.sh. :: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init 4. Test @@ -114,7 +114,7 @@ Http://hostip:8080. :: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh If success, the screen shows like blow: @@ -127,7 +127,7 @@ Http://hostip:8080. 
:: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 If success, the screen shows like blow: diff --git a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md index e79046e157..c6fa12ff0a 100644 --- a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md +++ b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md @@ -21,11 +21,11 @@ ``` #获取安装包 - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz - tar -xzvf docker_standalone-fate-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz + tar -xzvf docker_standalone-fate-1.4.5.tar.gz #执行部署 - cd docker_standalone-fate-1.4.4 + cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh ``` @@ -80,14 +80,14 @@ 2. 下载独立版本的压缩包并解压缩。 ``` - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz ``` 3. 进入FATE目录并执行init.sh。 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init ``` @@ -96,7 +96,7 @@ - 单元测试 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh ``` @@ -109,7 +109,7 @@ - Toy测试 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 ``` diff --git a/tools/debug/check_conf.sh b/tools/debug/check_conf.sh new file mode 100644 index 0000000000..ecc8ff3f36 --- /dev/null +++ b/tools/debug/check_conf.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh + +cd $EGGROLL_HOME + +echo "----------------------$EGGROLL_HOME/conf/eggroll.properties--------------------" +cat $EGGROLL_HOME/conf/eggroll.properties | grep -v ^# | grep -v ^$ +echo "" +echo "-----------------------$EGGROLL_HOME/conf/route_table.json---------------------" +cat $EGGROLL_HOME/conf/route_table.json | grep -v ^# | grep -v ^$ + +for ip in ${iplist[@]};do + echo "------------------diff $ip with ./conf/eggroll.properties-------------------------" + ssh $user@$ip "cat $EGGROLL_HOME/conf/eggroll.properties" | diff - conf/eggroll.properties + echo "" +done + +cd $cwd diff --git a/tools/debug/check_env.sh b/tools/debug/check_env.sh new file mode 100644 index 0000000000..c81f0b44d4 --- /dev/null +++ b/tools/debug/check_env.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) + +get_property() { + property_value=`grep $1 $2 | cut -d '=' -f 2-` + test_value $1 $2 ${property_value} +} + +echo_red() { + echo -e "\e[1;31m $1\e[0m" +} + +echo_green() { + echo -e "\e[1;32m $1\e[0m" +} + +echo_yellow() { + echo -e "\e[1;33m $1\e[0m" +} + +check_max_count() { + value=`cat $1` + if [ $value -ge 65535 ];then + echo_green "[OK] $1 is ok." + else + echo_red "[ERROR] please check $1, no less than 65535." + fi +} + +check_file_count() { + value=`cat $1 | grep $2 | awk '{print $4}'` + for v in ${value[@]};do + test_value $1 $2 $v + done +} + +test_value() { + if [ $3 -ge 65535 ];then + echo_green "[OK] $1 in $2 is ok." + else + echo_red "[ERROR] please check $1 in $2, no less than 65535." + fi +} + +echo_green `date +"%Y-%m-%d_%H:%M:%S"` + +echo_green "=============check max user processes============" +check_max_count "/proc/sys/kernel/threads-max" +get_property "kernel.pid_max" "/etc/sysctl.conf" +check_max_count "/proc/sys/kernel/pid_max" +check_max_count "/proc/sys/vm/max_map_count" + +echo_green "=============check max files count==============" +check_file_count "/etc/security/limits.conf" "nofile" +check_file_count "/etc/security/limits.d/80-nofile.conf" "nofile" +get_property "fs.file-max" "/etc/sysctl.conf" +check_max_count "/proc/sys/fs/file-max" + +mem_total=`free -m | grep Mem | awk '{print $2}' | tr -cd "[0-9,.]"` +mem_used=`free -m | grep Mem | awk '{print $3}' | tr -cd "[0-9],."` +swap_total=`free -m | grep Swap | awk '{print $2}' | tr -cd "[0-9,.]"` +swap_used=`free -m | grep Swap | awk '{print $3}' | tr -cd "[0-9,.]"` + +echo_green "=============Memory used and total===============" +echo_yellow "[WARNING] MemTotal:`awk 'BEGIN{printf "%.2f%%\n",('$mem_total'/1024)}'`G, MemUsed:`awk 'BEGIN{printf "%.2f%%\n",('$mem_used'/1024)}'`G, MemUsed%:`awk 'BEGIN{printf 
"%.2f%%\n",('$mem_used'/'$mem_total')*100}'`" +echo_green "=============SwapMem used and total===============" +echo_yellow "[WARNING] SwapTotal:`awk 'BEGIN{printf "%.2f%%\n",('$swap_total'/1024)}'`G, SwapUsed:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/1024)}'`G, SwapUsed%:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/'$swap_total')*100}'`" +echo_green "=============Disk use and total==================" +echo_yellow "[WARNING] `df -lh | grep /data`" + + diff --git a/tools/debug/check_iplist.sh b/tools/debug/check_iplist.sh new file mode 100644 index 0000000000..b753a6382c --- /dev/null +++ b/tools/debug/check_iplist.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +user=app +iplist=(xxx xxx) diff --git a/tools/debug/cluster_env_check.sh b/tools/debug/cluster_env_check.sh new file mode 100644 index 0000000000..9848f69c0a --- /dev/null +++ b/tools/debug/cluster_env_check.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh + +for ip in ${iplist[@]};do + if ! ssh -tt $user@$ip test -d "${EGGROLL_HOME}/bin/debug"; then + echo "${EGGROLL_HOME}/bin/debug in $ip is not exist, mkdir -p ${EGGROLL_HOME}/bin/debug." + ssh -tt $user@$ip "mkdir -p ${EGGROLL_HOME}/bin/debug" + fi + + if ! ssh -tt $user@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then + echo "${EGGROLL_HOME}/bin/debug/check_env.sh in $ip is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" + scp ./check_env.sh $user@$ip:${EGGROLL_HOME}/bin/debug + fi + ssh $user@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip + echo "The check result from $ip has saved in $cwd/$ip, please check it." +done diff --git a/tools/debug/env_check.py b/tools/debug/env_check.py new file mode 100644 index 0000000000..815ba7cb55 --- /dev/null +++ b/tools/debug/env_check.py @@ -0,0 +1,175 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# +import re +import sys +import json +import time +import socket +import psutil +import datetime +import argparse +import subprocess +from eggroll.core.session import ErSession +from eggroll.roll_pair.roll_pair import RollPairContext +from eggroll.utils.log_utils import get_logger + +L = get_logger() + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) +arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +args = arg_parser.parse_args() + +def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): + for i in range(row_limit): + if include_key: + yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size + else: + yield str(i) + "s"*value_suffix_size + +def round2(x): + return str(round(x / 1024 / 1024 / 1024, 2)) + +def print_red(str): + print("\033[1;31;40m\t" + str + "\033[0m") + +def print_green(str): + print("\033[1;32;40m\t" + str + "\033[0m") + +def print_yellow(str): + print("\033[1;33;40m\t" + str + "\033[0m") + +def check_actual_max_threads(): + def getMemInfo(fn): + def query_cmd(cmd): + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') + return p[0] + + def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + + mem = psutil.virtual_memory() + mem_total = round2(mem.total) + mem_used = round2(mem.used) + mem_used_per = str(round(mem.percent)) + '%' + + swap_mem = psutil.swap_memory() + swap_total = round2(swap_mem.total) + swap_used = round2(swap_mem.used) + swap_use_per = str(round(swap_mem.percent)) + '%' + + data_disk = psutil.disk_usage('/data') + disk_total = 
round2(data_disk.total) + disk_used = round2(data_disk.used) + disk_per = str(round(data_disk.percent)) + '%' + + mem_info = {} + mem_info["Ip"] = get_host_ip() + mem_info["MemTotal"] = mem_total + mem_info["MemUsed"] = mem_used + mem_info["MemUsedPCT"] = mem_used_per + + mem_info["SwapTotal"] = swap_total + mem_info["SwapUsed"] = swap_used + mem_info["SwapUsePCT"] = swap_use_per + + mem_info["DiskTotal"] = disk_total + mem_info["DiskUsed"] = disk_used + mem_info["DiskUsedPCT"] = disk_per + + mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max") + mem_info["/etc/sysctl.conf"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'") + mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max") + mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count") + + mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'") + mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'") + mem_info["/etc/sysctl.conf"] = query_cmd("grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'") + mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max") + + mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l") + mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'") + mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'") + + rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'") + if rollsite_pid: + rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss + myfile = open(sys.path[1] + 
'/../../../conf/eggroll.properties') + properties = myfile.read() + jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties) + if len(jvm_options): + rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024 + else: + rollsite_total_memory = mem.total + myfile.close() + + mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4)) + else: + mem_info["RollsiteUsedPercent"] = 0 + + + return mem_info + + session = ErSession(options={"eggroll.session.processors.per.node": args.nodes}) + try: + ctx = RollPairContext(session) + rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) + result = rp.with_stores(func=getMemInfo) + print_green(str(datetime.datetime.now())) + #print(json.dumps(result, indent=1)) + for node in result: + print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") + print_yellow("[WARNING] MemTotal:" + node[1]["MemTotal"] + "G, MemUsed:" + node[1]["MemUsed"] + "G, MemUsedPCT:" + node[1]["MemUsedPCT"]) + if float(node[1]["SwapTotal"]) < 128: + print_red("[ERROR] The swap memory is:" + node[1]["SwapTotal"] + "G, no less than 128G.") + else: + print_yellow("[WARNING] SwapTotal:" + node[1]["SwapTotal"] + "G, SwapUsed:" + node[1]["SwapUsed"] + "G, SwapUsePCT:" + node[1]["SwapUsePCT"]) + print_yellow("[WARNING] DiskTotal:" + node[1]["DiskTotal"] + "G, DiskUsed:" + node[1]["DiskUsed"] + "G, DiskUsedPCT:" + node[1]["DiskUsedPCT"]) + print_green("--------------Max user processes and max file count----------------------------------------") + for key in ["/proc/sys/kernel/threads-max", "/etc/sysctl.conf", "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count", "/etc/security/limits.conf", "/etc/security/limits.d/80-nofile.conf", "/etc/sysctl.conf", "/proc/sys/fs/file-max"]: + if int(node[1][key]) > 65535: + print_green("[OK] " + key + " = " + node[1][key]) + else: + print_red("[ERROR] 
please check " + key + " = " + node[1][key] + ", no less than 65535.") + print_green("--------------Thread count check-----------------------------------------------------------") + if len(node[1]["PoolSize"]) == 0: + node[1]["PoolSize"] = 500 + if int(node[1]["CurrentUseProcesses"]) < int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]): + print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]))) + else: + print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]))) + if node[1]["RollsiteUsedPercent"] != 0: + print_green("----------Rollsite memory use percent--------------------------------------------------") + print_yellow("[WARNING] rollsite memory use: " + node[1]["RollsiteUsedPercent"]) + print("\n") + finally: + session.kill() + + +if __name__ == '__main__': + if args.time == 0: + check_actual_max_threads() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) diff --git a/tools/debug/env_check.sh b/tools/debug/env_check.sh new file mode 100644 index 0000000000..3a81cbc4d3 --- /dev/null +++ b/tools/debug/env_check.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +nodes=$1 + +LogLevel=$EGGROLL_LOG_LEVEL +export EGGROLL_LOG_LEVEL=INFO +python env_check.py -p $nodes > result_env.log +export EGGROLL_LOG_LEVEL=$LogLevel +echo "The check result has saved in $cwd/result_env.log, please check it." + diff --git a/tools/debug/grep_logs.sh b/tools/debug/grep_logs.sh new file mode 100644 index 0000000000..0351e36b4c --- /dev/null +++ b/tools/debug/grep_logs.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh +session=$1 + +for ip in ${iplist[@]};do + mkdir -p $session/$ip + scp -r $user@$ip:$EGGROLL_HOME/logs/*$session* $session/$ip + echo "The $session logs from $ip has saved in $cwd/$session/$ip, please check it." 
+done +cd $cwd diff --git a/tools/debug/server_check.py b/tools/debug/server_check.py new file mode 100644 index 0000000000..d39eda417d --- /dev/null +++ b/tools/debug/server_check.py @@ -0,0 +1,164 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import re +import os +import sys +import json +import time +import socket +import psutil +import datetime +import threading +import argparse +import subprocess +from eggroll.core.session import ErSession +from eggroll.roll_pair.roll_pair import RollPairContext +from eggroll.utils.log_utils import get_logger + +L = get_logger() + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) +arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +arg_parser.add_argument("-d","--partyid", type=int, help="host partyid", default=0) +args = arg_parser.parse_args() + +def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): + for i in range(row_limit): + if include_key: + yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size + else: + yield str(i) + "s"*value_suffix_size + +def round2(x): + return str(round(x / 1024 / 1024 / 1024, 2)) + +def print_red(str): + 
print("\033[1;31;40m\t" + str + "\033[0m") + +def print_green(str): + print("\033[1;32;40m\t" + str + "\033[0m") + +def print_yellow(str): + print("\033[1;33;40m\t" + str + "\033[0m") + +def check_actual_max_threads(): + def getMemInfo(fn): + def query_cmd(cmd): + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') + return p[0] + + def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" + mem_info = {} + mem_info["Ip"] = get_host_ip() + eggroll_home = query_cmd("echo $EGGROLL_HOME") + route_file = eggroll_home + "/conf/route_table.json" + f = open(route_file, encoding='utf-8') + mem_info["route_table"] = json.load(f) + mem_info["data_access"] = query_cmd("ps aux |grep data_access_server |grep -v grep |wc -l") + if args.partyid != 0: + mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": %s}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'" %(args.partyid)) + mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0] + mem_info["directory"] = query_cmd("if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi") + mem_info["services"] = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] + mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_wait"] = query_cmd("if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_thread"] = [] + mem_info["jobs"] = 
query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client)) + mem_info["job_mem"] = [] + for job_id in mem_info["jobs"]: + mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id)) + mem_info["job_mem"] = query_cmd("ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id)) + mem_info["server_mem"] = {} + mem_info["thread"] = {} + for service in mem_info["services"]: + mem_info["thread"][service] = query_cmd("ps -ef |grep %s |grep -v grep |wc -l" %(service)) + mem_info["server_mem"][service] = str(query_cmd("ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'" %(service))) + return mem_info + + session = ErSession(options={"eggroll.session.processors.per.node": args.nodes}) + try: + ctx = RollPairContext(session) + rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) + result = rp.with_stores(func=getMemInfo) + print_green(str(datetime.datetime.now())) + for node in result: + print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") + print_green("-------------default route check-------------------------------------------------------") + route_table_dict = node[1]["route_table"] + if 'default' not in route_table_dict['route_table']: + print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + else: + try: + ip = route_table_dict['route_table']['default']['default'][0]['ip'] + port = route_table_dict['route_table']['default']['default'][0]['port'] + print_green("[OK] eggroll route configured!") + print_green("exchange ip:{}, exchange port:{}".format(ip, port)) + except KeyError: + print_red("[ERROR] eggroll exchange route is not configured, please check 
data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + + print_green("--------------data_access service check-------------------------------------------------") + if int(node[1]["data_access"]) == 0: + if int(node[1]["directory"]) == 0: + print_red("[ERROR] data_access service and directory not found, please check if it is installed!") + else: + print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory") + else: + print_green("[OK] Installed and running data_access service!") + if args.partyid != 0: + if int(node[1]["data_num"]) == 0 or int(node[1]["data_num"]) == 201: + print_green("[OK] Route verification success!") + else: + print_yellow("[WARNING] data_access service not available, please check host and host route!") + + print_green("--------------fate service check-------------------------------------------------------") + for server in node[1]["services"]: + if int(node[1]["thread"][server]) > 0: + print_green("[OK] the " + server.ljust(23) + " service is running , number of processes is : " + str(node[1]["thread"][server]) + "; used memory : " + str(node[1]["server_mem"][server]) + "KB.") + else: + print_yellow("[WARNING] the " + server + " service not running, please check service status.") + + print_green("--------------fate_flow jobs process and mem info check--------------------------------------------------") + if int(node[1]["job_run"]) == -1: + print_red("[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!") + else: + print_green("[OK] Number of tasks running is " + node[1]["job_run"]) + print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"]) + if int(node[1]["job_run"]) > 0: + for job_id in node[1]["jobs"].split(" "): + print_green("[OK] running task job_id : " + job_id + ", number of egg_pair processes is : " + str(node[1]["job_thread"]) + "; used memory : " + str(node[1]["job_mem"]) + "KB.") + + print("\n") + finally: + 
session.kill() + + +if __name__ == '__main__': + if args.time == 0: + check_actual_max_threads() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) diff --git a/tools/debug/server_check.sh b/tools/debug/server_check.sh new file mode 100644 index 0000000000..56cc20416c --- /dev/null +++ b/tools/debug/server_check.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +if [ ! -f 'result_server.log' ];then +touch result_server.log +fi + +nodes=$1 +party=$2 +LogLevel=$EGGROLL_LOG_LEVEL +export EGGROLL_LOG_LEVEL=INFO +if [ -n "$party" ];then + python server_check.py -p $nodes -d $party >> result_server.log +else + python server_check.py -p $nodes >> result_server.log +fi +export EGGROLL_LOG_LEVEL=$LogLevel +echo "Check the result in the current directory, Please execute command: cat result_server.log" diff --git a/tools/debug/test_env.py b/tools/debug/test_env.py new file mode 100644 index 0000000000..73661f90c0 --- /dev/null +++ b/tools/debug/test_env.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import re +import subprocess + +def sub_dict(form_dict, sub_keys, default=None): + return dict([(k.strip(), form_dict.get(k.strip(), default)) for k in sub_keys.split(',')]) + + +def query_file(file_name, opts=''): + mem_info = {} + print(file_name) + with open(file_name, 'r') as f: + data = f.readlines() + for i in data: + if ':' in i or '=' in i: + i = i.replace(':', ',').replace('=', ',') + k, v = [x.strip() for x in i.split(',')] + mem_info[k] = int(v.split()[0]) + return sub_dict(mem_info, opts) + + +def query_cmd(cmd, opts=''): + if opts: + opts = " | grep -E '" + opts.replace(',', '|').replace(' ', '') + "'" + print(cmd + opts) + p = subprocess.Popen(cmd + opts, stdout=subprocess.PIPE, shell=True) + return p.communicate()[0] + +def query(cmd, opts='', flags=True): + if flags: + print(str(query_cmd(cmd, opts))) + else: + print(str(query_file(cmd, opts))) + +if __name__ == "__main__": + max_user_processes_params=[('cat /proc/sys/kernel/threads-max',),('/etc/sysctl.conf', 'kernel.pid_max', False),('cat /proc/sys/kernel/pid_max',),('cat /proc/sys/vm/max_map_count',)] + print('==============max user processes===============') + for p in max_user_processes_params: + s = query(*p) + + max_files_count_params=[('cat /etc/security/limits.conf', 'nofile'),('cat /etc/security/limits.d/80-nofile.conf',),('/etc/sysctl.conf','fs.file-max', False),('cat /proc/sys/fs/file-max',)] + print('===============max files count=================') + for i in max_files_count_params: + query(*i) + + memory_params=('/proc/meminfo', 'MemTotal, MemFree, MemAvailable, 
SwapTotal, SwapFree', False) + print('================memory info====================') + query(*memory_params) + + disk_params=('df -lh', '/dev/vdb,/dev/vda1') + print('================disk info====================') + query(*disk_params) + diff --git a/tools/debug/time_check.py b/tools/debug/time_check.py new file mode 100644 index 0000000000..51c5801879 --- /dev/null +++ b/tools/debug/time_check.py @@ -0,0 +1,29 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# +import os +import time +import argparse + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +args = arg_parser.parse_args() + +if args.time == 0: + os.system('sh ./cluster_env_check.sh') +else: + while 1: + os.system('sh ./cluster_env_check.sh') + time.sleep(args.time) diff --git "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000..2038034f8b --- /dev/null +++ "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,394 @@ +# 脚本使用说明 + +## 一 概述 + +本工具集提供4个工具,功能如下: + +| 工具名称 | 工具功能 | 使用场景 | +| ---------------- | ------------------------------------------ | -------------------- | +| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完成并启动服务后 | +| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 | +| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | +| 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 | + +名词解释: + +| 名词 | 解释 | +| ----------------------- | -------------------------------------------------------- | +| $FATE_HOME | 通常在/data/projects/fate | +| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | +| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | +| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 | +| ${需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | + + + +## 二 机器基础信息检测 + +### 2.1 使用场景 + +------ + +此脚本在完成部署并正常启动服务后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 + +### 2.2 工具功能 + +------ + +此检测检测提供两种版本: + +- 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; + +- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,**需支持节点间免密登录**。 + +1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 + +2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比 + +3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比 + +4)检查系统最大用户进程数 + +5)检查最大文件数 + +6)检查线程数设置:检查egg 
pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足 + +7)检查rollsite进程堆内存是否充足: + +### 2.3 使用方法 + +------ + +#### 2.3.1 单集群版 + +```shell +source $FATE_HOME/init_env.sh +cd $EGGROLL_HOME/bin/debug +sh env_check.sh ${集群节点个数} +cat result_env.log +``` + +若对几个$开头的变量有疑问,请参考概述中的名词解释。 + +#### 2.3.2 跨集群版 + +**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +3、执行检测脚本: + +```python +python time_check.py +//查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息 +cat ./$ip +``` + +//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 + +``` +python time_check.py -t {检测间隔秒数,不填只检测一次} +``` + +### 2.4 检测结果说明 + +------ + +返回示例信息如下: + +*说明:以下信息分为三种提示等级:* + +*[OK\] 表示该检查项正常;* + +*[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;* + +*[ERROR\]表示该项不符合预期结果,需要按提示修改。* + +```properties +//脚本执行时间 + 2020-09-02 15:00:41.424053 +//返回的节点ip + ==============This is node 0:127.0.0.1=========================================== +//系统内存总量、系统内存使用量、系统内存使用占比 + [WARNING] MemTotal:78.51G, MemUsed:11.5G, MemUsedPCT:15% +//虚拟内存总量、虚拟内存使用量、虚拟内存使用占比,若小于128G,则提示ERROR,如下所示: + [ERROR] The swap memory is:32.0G, no less than 128G. <虚拟内存不足 + [WARNING] MemTotal:16.51G, MemUsed:128G, MemUsedPCT:12.3% <虚拟内存正常 +//磁盘总量、磁盘使用量、磁盘使用占比 + [WARNING] DiskTotal:984.18G, DiskUsed:566.53G, DiskUsedPCT:61% + --------------Max user processes and max file count------------------------------ +//最大用户进程数与最大文件数各个文件设置值展示,其中不满足65535的项则报[ERROR提示]: + [OK] /proc/sys/kernel/threads-max = 642956 + [OK] /etc/sysctl.conf = 1048576 + [OK] /proc/sys/kernel/pid_max = 131072 + [ERROR] please check /proc/sys/vm/max_map_count = 65530, no less than 65535. 
+ [OK] /etc/security/limits.conf = 102401 + [OK] /etc/security/limits.d/80-nofile.conf = 131072 + [OK] /etc/sysctl.conf = 1048576 + [OK] /proc/sys/fs/file-max = 1048576 + --------------Thread count check------------------------------------------------- +//判断eggroll.properties中eggroll.rollpair.eggpair.server.executor.pool.max.size配置项设置的线程值是否充足,若不充足,则报[ERROR]提示需要调大线程值 + [OK] The thread count = 1406, the total processes = 16 * 500 = 8000 + ----------Rollsite memory use percent-------------------------------------------- +//展示rollsite进程占用堆内存与rollsite设置内存上限比值,以判断rollsite内存是否充足,若百分比偏大,则需考虑释放rollsite内存或调高rollsite内存上限 + [WARNING] rollsite memory use: 0.69% +``` + + + +## 三 fate运行信息检测 + +### 3.1 使用场景 + +------ + +跑fate任务前,检测fate运行信息。验证机器当前状态是否适合新建一个fate任务 + +### 3.2 工具功能 + +------ + +检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 + +### 3.3 使用方法 + +``` +source $FATE_HOME/init_env.sh //FATE_HOME为用户环境的fate目录 +cd $EGGROLL_HOME/bin/debug +sh server_check.sh ${集群内节点个数} ${host party_id(可选)} +例:sh server_check.sh 1 10000 +``` + +可选参数: + +​ {host party_id} //当需要检查data_assess的服务是否可用时使用。若不提供该参数时不检测。 + +结果保存在result_server.log文件中 + +### 3.4 检测结果说明 + +------ + +#### 3.4.1 default route check(eggroll路由是不是默认路由) + +- 检测通过提示: + + [OK] eggroll route configured! + + "port": 9370, "ip": "127.0.0.1" + +- 检测失败提示: + + [ERROR] eggroll route is not configured, please check /data/projects/fate/eggroll/conf/route_table.json file if it is existed! + +- 检查方法: + + 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。 + + + +#### 3.4.2 data_access service check(是否已安装data access) + +- 检测通过提示: + + [OK] Installed and running data_access service! + +- 检测失败提示: + + [ERROR] data_access service and directory not found, please check if it is installed! 
+ +- 检查方法: + + 先检查data_access 进程是否存在或者目录是否存在。若存在,会进一步检查data_access 服务是否可用。详细逻辑是: + + ``` + 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING; + + 若返回进程数大于0,判断路由验证返回码,如果返回 "status":0,或 "status":201,则说明 DataAccess 服务以及路由表配置没有问题,否则提示WARNING检查路由设置 + ``` + +#### 3.4.3 fate service check(fate服务状态、进程数及占用内存) + +- 检测通过提醒: + + [OK] the service is running , number of processes is :; used memory: + +- 检测失败提醒: + + [WARNING] the service not running, please check service status. + +- 检查方法: + + 检查服务列表: + + 'ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql' + + 检查进程数方法: + + ``` + thread = ps -ef |grep service |grep -v grep |wc -l + ``` + + 检查服务占用内存方法: + + ``` + server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' + ``` + + +#### 3.4.4 fate_flow jobs process and mem info check(job任务数检测、job任务进程及占用内存) +- 检测通过提醒: + + [OK] Number of tasks running is xxx + + [OK] Number of tasks waiting is xxx + + [OK] running task job_id :xxx ,number of egg_pair processes is :xxx; used memory:xxx + +- 检测失败提醒: + + [ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running! 
+ +- 检查方法: + + 通过FATE自带的fate_flow_client 命令查看任务相关信息,通过ps命令查看内存相关信息。 + + + +## 四 日志搜集 + +### 4.1 使用场景 + +------ + +适用于跑任务出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。 + +### 4.2 工具功能 + +------ + +拉取指定ip:$EGGROLL_HOME/logs目录下带传入关键字的目录到本机当前目录下 + +### 4.3 使用方法 + +**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +3、执行检测脚本: + +```shell +sh grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id,支持模糊查询> +``` + +执行后该session-id的各个ip的日志便会搜集到当前目录下的$session-id/$ip目录下 + +### 4.4 结果说明 + +------ + +执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。 + + + +## 五 集群配置检测 + +### 5.1 使用场景 + +------ + +适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。 + +### 5.2 工具功能 + +------ + +拉取指定ip的eggroll.properties、route_table.json配置到本机展示。 + +### 5.3 使用方法 + +**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以** + +------ + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +3、然后执行脚本: + +```shell +sh check_conf.sh +``` + +### 5.4 结果说明 + +------ + +该脚本展示配置所有ip与本机的配置对比,说明如下: + +```properties +//展示本机eggroll.properties配置信息 +----------------------$EGGROLL_HOME/conf/eggroll.properties-------------------- +//展示本机route_table.json配置信息 +-----------------------$EGGROLL_HOME/conf/route_table.json--------------------- +//展示ip列表中第一个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip1 with ./conf/eggroll.properties------------------------- +//展示ip列表中第二个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip2 with ./conf/eggroll.properties------------------------- +//展示ip列表中第三个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip3 with ./conf/eggroll.properties------------------------- +``` +