diff --git a/.gitmodules b/.gitmodules
index 7c9dcff1f0..f9182f0090 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -5,4 +5,4 @@
[submodule "eggroll"]
path = eggroll
url = https://github.com/WeBankFinTech/eggroll.git
- branch = v2.0.1
+ branch = release-2.0.2-build-6
diff --git a/.travis.yml b/.travis.yml
index 9054c89616..f2e6d4a696 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,9 +14,9 @@ matrix:
- env: OS='centos:7'
script:
- - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz
- - tar -xzf standalone-fate-master-1.4.4.tar.gz
- - cd standalone-fate-master-1.4.4
+ - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz
+ - tar -xzf standalone-fate-master-1.4.5.tar.gz
+ - cd standalone-fate-master-1.4.5
- sed -i.bak "s/sh service.sh/bash service.sh/g" init.sh
- source init.sh init
- ls -alh
diff --git a/arch/api/impl/based_2x/federation.py b/arch/api/impl/based_2x/federation.py
index b19bd6eed2..e4fcb99171 100644
--- a/arch/api/impl/based_2x/federation.py
+++ b/arch/api/impl/based_2x/federation.py
@@ -135,10 +135,18 @@ def done_callback(fut):
except Exception as e:
import os
import signal
+ import traceback
+ import logging
+ import sys
+ exc_info = sys.exc_info()
+ traceback.print_exception(*exc_info)
pid = os.getpid()
LOGGER.exception(f"remote fail, terminating process(pid={pid})")
- os.kill(pid, signal.SIGTERM)
- raise e
+ try:
+ logging.shutdown()
+ finally:
+ os.kill(pid, signal.SIGTERM)
+ raise e
for future in futures:
future.add_done_callback(done_callback)
diff --git a/cluster-deploy/README.md b/cluster-deploy/README.md
index 2aeca269f8..53fff6d7cb 100644
--- a/cluster-deploy/README.md
+++ b/cluster-deploy/README.md
@@ -199,10 +199,10 @@ Execute under the app user of the target server (192.168.0.1 has an external net
```
mkdir -p /data/projects/install
cd /data/projects/install
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
#Send to 192.168.0.2和192.168.0.3
scp *.tar.gz app@192.168.0.2:/data/projects/install
@@ -355,7 +355,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3
tar xvf pip-packages-fate-*.tar.gz
source /data/projects/fate/common/python/venv/bin/activate
pip install setuptools-42.0.2-py2.py3-none-any.whl
-pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index
+pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index
pip list | wc -l
#The result should be 161
```
diff --git a/cluster-deploy/README.rst b/cluster-deploy/README.rst
index f432d8fbba..40ec8c672f 100644
--- a/cluster-deploy/README.rst
+++ b/cluster-deploy/README.rst
@@ -239,10 +239,10 @@ external network environment):
mkdir -p /data/projects/install
cd /data/projects/install
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
#Send to 192.168.0.2和192.168.0.3
scp *.tar.gz app@192.168.0.2:/data/projects/install
@@ -400,7 +400,7 @@ external network environment):
tar xvf pip-packages-fate-*.tar.gz
source /data/projects/fate/common/python/venv/bin/activate
pip install setuptools-42.0.2-py2.py3-none-any.whl
- pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index
+ pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index
pip list | wc -l
#The result should be 161
diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md
index 676ce535ff..57c955ae62 100644
--- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md
+++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md
@@ -255,8 +255,8 @@ Swap: 131071 0 131071
```
cd /data/projects/
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.4-release-c7-u18.tar.gz
-tar xzf fate-cluster-install-1.4.4-release-c7-u18.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz
+tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz
```
## 5.2 部署前检查
@@ -292,7 +292,7 @@ vi fate-cluster-install/allInone/conf/setup.conf
| 配置项 | 配置项值 | 说明 |
| ---------------- | --------------------------------------------- | ------------------------------------------------------------ |
| roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 |
-| version | 默认:1.4.4 | Fate 版本号 |
+| version | 默认:1.4.5 | Fate 版本号 |
| pbase | 默认: /data/projects | 项目根目录 |
| lbase | 默认:/data/logs | 保持默认不要修改 |
| ssh_user | 默认:app | ssh连接目标机器的用户,也是部署后文件的属主 |
@@ -321,7 +321,7 @@ vi fate-cluster-install/allInone/conf/setup.conf
#to install role
roles=( "host" "guest" )
-version="1.4.4"
+version="1.4.5"
#project base
pbase="/data/projects"
@@ -377,7 +377,7 @@ basemodules=( "base" "java" "python" "eggroll" "fate" )
#to install role
roles=( "host" )
-version="1.4.4"
+version="1.4.5"
#project base
pbase="/data/projects"
diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst
index b21fd57593..fa739a5d1f 100644
--- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst
+++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst
@@ -275,8 +275,8 @@ ssh app@192.168.0.2
::
cd /data/projects/
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.4-release-c7-u18.tar.gz
- tar xzf fate-cluster-install-1.4.4-release-c7-u18.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz
+ tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz
5.2 部署前检查
--------------
@@ -314,7 +314,7 @@ ssh app@192.168.0.2
+======================+=================================================+==============================================================================+
| roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 |
+----------------------+-------------------------------------------------+------------------------------------------------------------------------------+
-| version | 默认:1.4.4 | Fate 版本号 |
+| version | 默认:1.4.5 | Fate 版本号 |
+----------------------+-------------------------------------------------+------------------------------------------------------------------------------+
| pbase | 默认: /data/projects | 项目根目录 |
+----------------------+-------------------------------------------------+------------------------------------------------------------------------------+
@@ -366,7 +366,7 @@ ssh app@192.168.0.2
#to install role
roles=( "host" "guest" )
- version="1.4.4"
+ version="1.4.5"
#project base
pbase="/data/projects"
@@ -421,7 +421,7 @@ ssh app@192.168.0.2
#to install role
roles=( "host" )
- version="1.4.4"
+ version="1.4.5"
#project base
pbase="/data/projects"
diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md
index 7d5678fa36..4980426d29 100644
--- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md
+++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md
@@ -146,7 +146,7 @@ fi
```
cd /data/projects/install
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
```
## 5.2 操作系统参数检查
diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst
index 47297f85f5..afc58587bc 100644
--- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst
+++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst
@@ -160,7 +160,7 @@ ufw status
cd /data/projects/install
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
5.2 操作系统参数检查
--------------------
diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md
index bb134c4b21..a716e6ca02 100644
--- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md
+++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md
@@ -40,10 +40,12 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构
本示例是每端只有一台主机,每端可以多台主机,目前只支持nodemanager多节点部署,其他组件都是单节点。
-| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 |
-| ----- | ------- | --------------------- | ----------------------- | -------- | ---- | ------------------------------------------------------------ |
-| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql |
-| guest | 9999 | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql |
+| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 |
+| ----- | ------- | --------------------- | ---------- | -------- | ---- | ------------------------------------------------------------ |
+| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql |
+| guest | 9999 | 192.168.0.2 | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql |
+
+备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。
## 2.2.主机资源和操作系统要求
@@ -51,7 +53,7 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构
| -------- | ------------------------------------------------------------ |
| 主机配置 | 不低于8C16G500G,千兆网卡 |
| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 |
-| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi |
+| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor |
| 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) |
| 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 |
| 虚拟内存 | 不低于128G |
@@ -100,8 +102,6 @@ vim /etc/hosts
centos系统执行:rpm -qa | grep selinux
-ubuntu系统执行:apt list --installed | grep selinux
-
如果已安装了selinux就执行:setenforce 0
3.3 修改Linux系统参数
@@ -109,22 +109,32 @@ ubuntu系统执行:apt list --installed | grep selinux
**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:**
-1)vim /etc/security/limits.conf
+1)清理20-nproc.conf文件
+
+cd /etc/security/limits.d
+
+ls -lrt 20-nproc.conf
+
+存在则:mv 20-nproc.conf 20-nproc.conf_bak
+
+2)vim /etc/security/limits.conf
\* soft nofile 65535
\* hard nofile 65535
-2)vim /etc/security/limits.d/20-nproc.conf
+\* soft nproc 65535
+
+\* hard nproc 65535
-\* soft nproc unlimited
+重新登陆,ulimit -a查看是否生效
3.4 关闭防火墙
--------------
**在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行**
-如果是Centos系统:
+Centos系统:
systemctl disable firewalld.service
@@ -132,12 +142,6 @@ systemctl stop firewalld.service
systemctl status firewalld.service
-如果是Ubuntu系统:
-
-ufw disable
-
-ufw status
-
3.5 软件环境初始化
------------------
@@ -165,6 +169,8 @@ Defaults !env_reset
**3)配置ssh无密登录**
+**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。**
+
**a. 在目标服务器(192.168.0.1 192.168.0.2)app用户下执行**
su app
@@ -206,6 +212,8 @@ ssh app\@192.168.0.2
生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。
+注意:dd执行时间较长,请耐心等待
+
```
cd /data
dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728
@@ -215,18 +223,21 @@ cat /proc/swaps
echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab
```
-## 3.7 安装ansible
+## 3.7 安装依赖包
-**目标服务器(192.168.0.1) root用户执行**
+**目标服务器(192.168.0.1 192.168.0.2)root用户执行**
```
-#判断是否已安装ansible
-ansible --version
-#没有则执行
-yum install -y ansible
-```
-
+#安装基础依赖包
+yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel
+#如果有报错,需要解决yum源问题。
+#安装ansible和进程管理依赖包
+yum install -y ansible jq supervisor
+#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行:
+yum install -y epel-release
+#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor
+```
4.项目部署
==========
@@ -242,16 +253,16 @@ yum install -y ansible
**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行**
```
-#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置
+#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置
cat /proc/swaps
Filename Type Size Used Priority
/data/swapfile128G file 134217724 384 -1
-#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置
+#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置
ulimit -n
65535
-#用户进程数,不低于64000,如不满足需参考4.3章节重新设置
+#用户进程数,不低于64000,如不满足需参考3.3章节重新设置
ulimit -u
65535
@@ -261,6 +272,7 @@ ps -ef| grep -i fate
netstat -tlnp | grep 4670
netstat -tlnp | grep 4671
netstat -tlnp | grep 9370
+netstat -tlnp | grep 9371
netstat -tlnp | grep 9360
netstat -tlnp | grep 8080
netstat -tlnp | grep 3306
@@ -284,8 +296,8 @@ ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf
```
#注意:URL链接有换行,拷贝的时候注意整理成一行
cd /data/projects/
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.4_release-1.0.0.tar.gz
-tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz
+tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz
```
### 4.4 配置文件修改和示例
@@ -295,9 +307,6 @@ tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz
```
cd ansible-nfate-*
#init.sh文件不需要修改,主要是辅助生成一些配置文件
-
-#测试环境加test参数执行
- sh ./tools/init.sh test
#生产环境加prod参数执行
sh ./tools/init.sh prod
@@ -309,6 +318,44 @@ init var_files/prod
init project_prod.yml
```
+### 4.4.2 证书制作配置(可选)
+
+1)证书制作
+
+```
+vi /data/projects/ansible-nfate-1.*/tools/make.sh
+
+#1、自定义安全证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。
+#2、安全证书支持如下部署方式:
+ 1)部署host+guest,host和guest使用安全证书通讯。
+ 2)部署host+exchange+guest,其中host和exchange使用安全证书通讯,guest和exchange普通通讯。
+ 3)部署host+exchange+guest,其中guest和exchange使用安全证书通讯,host和exchange普通通讯。
+
+guest_host="192.168.0.1" ---根据实际IP修改
+host_host="192.168.0.2" ---根据实际IP修改
+exchange_host="192.168.0.88" ---根据实际IP修改,本示例不部署无需修改
+```
+
+2)执行脚本制作证书
+
+```
+cd tools
+sh ./make.sh
+
+在keys/host,guest目录下会产生证书文件。
+```
+
+3)拷贝证书到部署目录
+
+```
+sh cp-keys.sh host guest
+
+证书文件会拷贝到roles/eggroll/files/keys目录
+
+特别说明:
+1、目前脚本部署只支持2方设置证书认证。(host&guest、host&exchange、guest&exchange)
+```
+
#### 4.4.2 修改配置文件
**1)修改初始化主机IP**
@@ -317,10 +364,13 @@ init project_prod.yml
vi /data/projects/ansible-nfate-1.*/environments/prod/hosts
#ansible格式配置文件
-[init] ---把需要部署的主机IP填入init组
+[fate] ---把需要部署的主机IP填入fate组
192.168.0.1
192.168.0.2
+[deploy_check] ---把执行ansible的本机IP填入deploy_check组
+192.168.0.1
+
[all:vars]
ansible_connection=ssh
ansible_ssh_port=22 ---根据实际情况修改
@@ -344,6 +394,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署
**3)修改host方参数**
+**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。
+
```
#不部署host方则不用修改
#除了nodemanger可以设置多个IP外,其他都是单个IP
@@ -355,13 +407,17 @@ host:
enable: True
ips: ---IP列表,目前rollsite只支持部署到一台服务器
- 192.168.0.1
- port: 9370
+ port: 9370 ---grpc端口
+ secure_port: 9371 ---grpcs端口
pool_size: 600 ---线程池大小
- max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G
- default_rules: ---本party指向exchange或者其他party的IP,端口路由配置
+ max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。
+ server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可
+ client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可
+ default_rules: ---本party指向exchange或者其他party的IP、端口路由配置
- name: default
- ip: 192.168.0.3 ---exchange或者对端party rollsite IP
- port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370
+ ip: 192.168.0.2 ---exchange或者对端party rollsite IP
+ port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;
+ is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。
rules: ---本party自身路由配置
- name: default
ip: 192.168.0.1
@@ -415,6 +471,8 @@ host:
**4)修改guest参数**
+**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。
+
```
#不部署guest方则不用修改
#除了nodemanger可以设置多个IP外,其他都是单个IP
@@ -426,13 +484,17 @@ guest:
enable: True
ips: ---IP列表,目前rollsite只支持部署到一台服务器
- 192.168.0.2
- port: 9370
+ port: 9370 ---grpc端口
+ secure_port: 9371 ---grpcs端口
pool_size: 600 ---线程池大小
- max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G
- default_rules: ---本party指向exchange或者其他party的IP,端口路由配置
+ max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。
+ server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可
+ client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可
+ default_rules: ---本party指向exchange或者其他party的IP、端口路由配置
- name: default
- ip: 192.168.0.3 ---exchange或者对端party rollsite IP
- port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370
+ ip: 192.168.0.1 ---exchange或者对端party rollsite IP
+ port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;
+ is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可
rules: ---本party自身路由配置
- name: default
ip: 192.168.0.2
@@ -486,33 +548,38 @@ guest:
**5)修改exchange参数**
+**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。
+
```
#不部署exchange则不需要修改
vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_exchange
exchange:
- enable: True
+ enable: False --- 部署exchange需修改为True
rollsite:
ips:
- - 192.168.0.3
+ - 192.168.0.88
port: 9370
+ secure_port: 9371 ---grpcs端口
pool_size: 600
- max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G
+ max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。
+ server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可
+ client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可
partys: ---指向各party的路由配置
- id: 10000
rules:
- name: default
ip: 192.168.0.1
- port: 9367
+ port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371;
+ is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可
- id: 9999
rules:
- name: default
ip: 192.168.0.2
- port: 9370
+ port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371;
+ is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可
```
-
-
### 4.5 部署
按照上述配置含义修改对应的配置项后,然后执行部署脚本:
@@ -520,11 +587,10 @@ exchange:
```
#相对ansible-nfate-*目录
cd /data/projects/ansible-nfate-1.*
-#测试环境加test参数执行
- nohup sh ./boot.sh test -D > logs/boot.log 2>&1 &
#生产环境加prod参数执行
nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 &
+
```
部署日志输出在logs目录下,实时查看是否有报错:
@@ -551,6 +617,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件
---/data/projects/common目录存在,需要mv。
```
+fateflow部署完重启:
+
+```
+#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下:
+netstat -tlnp | grep 9360
+如果没有端口则重启fateflow:
+sh service.sh stop fate-fateflow
+sh service.sh start fate-fateflow
+```
+
### 4.6 问题定位
1)eggroll日志
@@ -593,6 +669,8 @@ cd /data/projects/fate/python/examples/toy_example/
python run_toy_example.py 10000 10000 1
```
+注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。
+
类似如下结果表示成功:
"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998"
@@ -605,6 +683,8 @@ cd /data/projects/fate/python/examples/toy_example/
python run_toy_example.py 9999 9999 1
```
+注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。
+
类似如下结果表示成功:
"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998"
@@ -682,18 +762,27 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以
cd /data/projects/common/supervisord
```
-启动/关闭/查看所有:
+启动/关闭/重启/查看所有:
```
-sh service.sh start/stop/status all
+#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件
+sh service.sh start/stop/restart/status all
+
+#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下:
+netstat -tlnp | grep 9360
+如果没有端口则重启fateflow:
+sh service.sh stop fate-fateflow
+sh service.sh start fate-fateflow
```
-启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql):
+启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql):
```
-sh service.sh start/stop/status fate-clustermanager
+sh service.sh start/stop/restart/status fate-clustermanager
```
+
+
## 6.2 查看进程和端口
**在目标服务器(192.168.0.1 192.168.0.2 )app用户下执行**
diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md
index 7b2d6e0926..9312f7e0bc 100644
--- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md
+++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md
@@ -37,18 +37,20 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构
## 2.1.部署规划
-| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 |
-| ----- | ---------------------- | --------------------- | ----------------------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- |
-| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql |
-| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 16C32G | 2T | | | nodemanger |
+| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 |
+| ----- | ---------------------- | --------------------- | ---------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- |
+| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql |
+| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2 | 16C32G | 2T | | | nodemanger |
+
+备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。
## 2.2.主机资源和操作系统要求
| **类别** | **说明** |
| -------- | ------------------------------------------------------------ |
| 主机配置 | 不低于8C16G500G,千兆网卡 |
-| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 |
-| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi |
+| 操作系统 | CentOS linux 7.2及以上同时低于8 |
+| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor |
| 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) |
| 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 |
| 虚拟内存 | 不低于128G |
@@ -97,8 +99,6 @@ vim /etc/hosts
centos系统执行:rpm -qa | grep selinux
-ubuntu系统执行:apt list --installed | grep selinux
-
如果已安装了selinux就执行:setenforce 0
3.3 修改Linux系统参数
@@ -106,35 +106,37 @@ ubuntu系统执行:apt list --installed | grep selinux
**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:**
-1)vim /etc/security/limits.conf
+1)清理20-nproc.conf文件
+
+cd /etc/security/limits.d
+
+ls -lrt 20-nproc.conf
+
+存在则:mv 20-nproc.conf 20-nproc.conf_bak
+
+2)vim /etc/security/limits.conf
\* soft nofile 65535
\* hard nofile 65535
-2)vim /etc/security/limits.d/20-nproc.conf
+\* soft nproc 65535
-\* soft nproc unlimited
+\* hard nproc 65535
+
+重新登陆,ulimit -a查看是否生效
3.4 关闭防火墙
--------------
**在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行**
-如果是Centos系统:
-
systemctl disable firewalld.service
systemctl stop firewalld.service
systemctl status firewalld.service
-如果是Ubuntu系统:
-
-ufw disable
-
-ufw status
-
3.5 软件环境初始化
------------------
@@ -162,6 +164,8 @@ Defaults !env_reset
**3)配置ssh无密登录**
+**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。**
+
**a. 在目标服务器(192.168.0.1 192.168.0.2)app用户下执行**
su app
@@ -203,6 +207,8 @@ ssh app\@192.168.0.2
生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。
+注意:dd执行时间较长,请耐心等待
+
```
cd /data
dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728
@@ -212,15 +218,20 @@ cat /proc/swaps
echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab
```
-## 3.7 安装ansible
+## 3.7 安装依赖包
-**目标服务器(192.168.0.1) root用户执行**
+**目标服务器(192.168.0.1 192.168.0.2) root用户执行**
```
-#判断是否已安装ansible
-ansible --version
-#没有则执行
-yum install -y ansible
+#安装基础依赖包
+yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel
+#如果有报错,需要解决yum源问题。
+
+#安装ansible和进程管理依赖包
+yum install -y ansible jq supervisor
+#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行:
+yum install -y epel-release
+#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor
```
4 项目部署
@@ -238,16 +249,16 @@ yum install -y ansible
**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行**
```
-#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置
+#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置
cat /proc/swaps
Filename Type Size Used Priority
/data/swapfile128G file 134217724 384 -1
-#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置
+#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置
ulimit -n
65535
-#用户进程数,不低于64000,如不满足需参考4.3章节重新设置
+#用户进程数,不低于64000,如不满足需参考3.3章节重新设置
ulimit -u
65535
@@ -257,6 +268,7 @@ ps -ef| grep -i fate
netstat -tlnp | grep 4670
netstat -tlnp | grep 4671
netstat -tlnp | grep 9370
+netstat -tlnp | grep 9371
netstat -tlnp | grep 9360
netstat -tlnp | grep 8080
netstat -tlnp | grep 3306
@@ -281,8 +293,8 @@ ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf
```
#注意:URL链接有换行,拷贝的时候注意整理成一行
cd /data/projects/
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.4_release-1.0.0.tar.gz
-tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz
+tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz
```
4.4 配置文件修改和示例
@@ -293,9 +305,6 @@ tar xzf ansible_nfate_1.4.4_release-1.0.0.tar.gz
```
cd ansible-nfate-*
#init.sh文件不需要修改,主要是辅助生成一些配置文件
-
-#测试环境加test参数执行
- sh ./tools/init.sh test
#生产环境加prod参数执行
sh ./tools/init.sh prod
@@ -307,6 +316,25 @@ init var_files/prod
init project_prod.yml
```
+### 4.4.2 证书部署前配置(可选)
+
+1)联系webank获取guest端部署证书文件。
+
+2)放置到部署目录
+
+```
+cd /data/projects/ansible-nfate-*
+mkdir -p roles/eggroll/files/keys/guest
+cd roles/eggroll/files/keys/guest
+把获取到证书文件解压缩并放置到此目录下,如下:
+-rw-r--r-- 1 app apps 1371 Sep 4 18:07 guest-ca.pem
+-rw-r--r-- 1 app apps 241 Sep 4 18:07 guest-server.key
+-rw-r--r-- 1 app apps 1151 Sep 4 18:07 guest-server.pem
+-rw-r--r-- 1 app apps 1371 Sep 4 18:07 host-client-ca.pem
+-rw-r--r-- 1 app apps 241 Sep 4 18:07 host-client.key
+-rw-r--r-- 1 app apps 1143 Sep 4 18:07 host-client.pem
+```
+
### 4.4.2 修改配置文件
**1)修改初始化主机IP**
@@ -315,10 +343,13 @@ init project_prod.yml
vi /data/projects/ansible-nfate-1.*/environments/prod/hosts
#ansible格式配置文件
-[init] ---把需要部署的主机IP填入init组
+[fate] ---把需要部署的主机IP填入fate组
192.168.0.1
192.168.0.2
+[deploy_check] ---把执行ansible的本机IP填入deploy_check组
+192.168.0.1
+
[all:vars]
ansible_connection=ssh
ansible_ssh_port=22 ---根据实际主机ssh协议端口修改
@@ -342,6 +373,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署
**3)修改guest参数**
+**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。
+
```
#除了nodemanger可以设置多个IP外,其他都是单个IP
vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_guest
@@ -352,13 +385,17 @@ guest:
enable: True ---是否部署rollsite模块,True为部署,False为否
ips: ---IP列表,目前rollsite只支持部署到一台服务器
- 192.168.0.1
- port: 9370 ---rollsite端口
+ port: 9370 ---rollsite grpc端口
+ secure_port: 9371 ---rollsite grpcs端口
pool_size: 600 ---线程池大小
- max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G
+ max_memory: 12G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。
+ server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可
+ client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可
default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口
- name: default ---名称,默认即可
- ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改
- port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,和webank确认后修改
+ ip: 192.168.0.88 ---exchange或者对端party rollsite IP,和webank确认后修改。
+ port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;和webank确认后修改。
+ is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。
rules: ---本party自身路由配置
- name: default ---本party rollsite所在主机IP和端口
ip: 192.168.0.1
@@ -419,8 +456,6 @@ guest:
```
#相对ansible-nfate-*目录
cd /data/projects/ansible-nfate-1.*
-#测试环境加test参数执行
- nohup sh ./boot.sh test -D > logs/boot.log 2>&1 &
#生产环境加prod参数执行
nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 &
@@ -450,6 +485,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件
---/data/projects/common目录存在,需要mv。
```
+fateflow部署完重启:
+
+```
+#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下:
+netstat -tlnp | grep 9360
+如果没有端口则重启fateflow:
+sh service.sh stop fate-fateflow
+sh service.sh start fate-fateflow
+```
+
## 4.6 问题定位
1)eggroll日志
@@ -492,6 +537,8 @@ cd /data/projects/fate/python/examples/toy_example/
python run_toy_example.py 9999 9999 1
```
+注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。
+
类似如下结果表示成功:
"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998"
@@ -569,16 +616,23 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以
cd /data/projects/common/supervisord
```
-启动/关闭/查看所有:
+启动/关闭/重启/查看所有:
```
-sh service.sh start/stop/status all
+#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件
+sh service.sh start/stop/restart/status all
+
+#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下:
+netstat -tlnp | grep 9360
+如果没有端口则重启fateflow:
+sh service.sh stop fate-fateflow
+sh service.sh start fate-fateflow
```
-启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql):
+启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql):
```
-sh service.sh start/stop/status fate-clustermanager
+sh service.sh start/stop/restart/status fate-clustermanager
```
## 6.2 查看进程和端口
diff --git a/cluster-deploy/doc/Fate_step_by_step_install_zh.md b/cluster-deploy/doc/Fate_step_by_step_install_zh.md
index d5d1b0efe5..fa1a0e8821 100644
--- a/cluster-deploy/doc/Fate_step_by_step_install_zh.md
+++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.md
@@ -180,10 +180,10 @@ echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab
```
mkdir -p /data/projects/install
cd /data/projects/install
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.4-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.4-release.tar.gz
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
#传输到192.168.0.2和192.168.0.3
scp *.tar.gz app@192.168.0.2:/data/projects/install
@@ -336,7 +336,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3
tar xvf pip-packages-fate-*.tar.gz
source /data/projects/fate/common/python/venv/bin/activate
pip install setuptools-42.0.2-py2.py3-none-any.whl
-pip install -r pip-packages-fate-1.4.4/requirements.txt -f ./pip-packages-fate-1.4.4 --no-index
+pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index
pip list | wc -l
#结果应为161
```
diff --git a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst
index 8d6b638b3a..f8f5c342fb 100644
--- a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst
+++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst
@@ -202,10 +202,10 @@ ufw status
mkdir -p /data/projects/install
cd /data/projects/install
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.4-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz
wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.4-release.tar.gz
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.4-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz
#传输到192.168.0.2和192.168.0.3
scp *.tar.gz app@192.168.0.2:/data/projects/install
@@ -359,7 +359,7 @@ ufw status
tar xvf pip-packages-fate-*.tar.gz
source /data/projects/fate/common/python/venv/bin/activate
pip install setuptools-42.0.2-py2.py3-none-any.whl
- pip install -r pip-packages-fate-1.4.4/requirements.txt -f ./pip-packages-fate-1.4.4 --no-index
+ pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index
pip list | wc -l
#结果应为161
diff --git a/fate.env b/fate.env
index e46d4e96e7..0f3970743b 100755
--- a/fate.env
+++ b/fate.env
@@ -1,4 +1,4 @@
-FATE=1.4.4
+FATE=1.4.5
CENTOS=7.2
UBUNTU=16.04
PYTHON=3.6.5
diff --git a/fate_flow/db/db_models.py b/fate_flow/db/db_models.py
index 46ecd4a515..abb8ffb488 100644
--- a/fate_flow/db/db_models.py
+++ b/fate_flow/db/db_models.py
@@ -186,8 +186,8 @@ class DataView(DataBaseModel):
f_task_id = CharField(max_length=100)
f_type = CharField(max_length=50, null=True)
f_ttl = IntegerField(default=0)
- f_party_model_id = CharField(max_length=100, null=True)
- f_model_version = CharField(max_length=100, null=True)
+ f_party_model_id = CharField(max_length=200, null=True)
+ f_model_version = CharField(max_length=200, null=True)
f_size = BigIntegerField(default=0)
f_description = TextField(null=True, default='')
f_tag = CharField(max_length=50, null=True, index=True, default='')
diff --git a/standalone-deploy/README.md b/standalone-deploy/README.md
index abd2990070..d7d6face52 100644
--- a/standalone-deploy/README.md
+++ b/standalone-deploy/README.md
@@ -22,11 +22,11 @@ It is strongly recommended to use docker, which greatly reduces the possibility
```
#Get code
-wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz
-tar -xzvf docker_standalone-fate-1.4.4.tar.gz
+wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz
+tar -xzvf docker_standalone-fate-1.4.5.tar.gz
#Execute the command
-cd docker_standalone-fate-1.4.4
+cd docker_standalone-fate-1.4.5
bash install_standalone_docker.sh
```
@@ -82,14 +82,14 @@ Http://hostip:8080.
2. Download the compressed package of stand-alone version and decompress it.
```
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz
- tar -xzvf standalone-fate-master-1.4.4.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz
+ tar -xzvf standalone-fate-master-1.4.5.tar.gz
```
3. Enter FATE directory and execute the init.sh.
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
source init.sh init
```
@@ -98,7 +98,7 @@ Http://hostip:8080.
- Unit Test
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
bash ./federatedml/test/run_test.sh
```
@@ -111,7 +111,7 @@ Http://hostip:8080.
- Toy_example Test
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
python ./examples/toy_example/run_toy_example.py 10000 10000 0
```
diff --git a/standalone-deploy/README.rst b/standalone-deploy/README.rst
index c664644200..aa76567a07 100644
--- a/standalone-deploy/README.rst
+++ b/standalone-deploy/README.rst
@@ -36,11 +36,11 @@ possibility of encountering problems.
::
#Get code
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz
- tar -xzvf docker_standalone-fate-1.4.4.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz
+ tar -xzvf docker_standalone-fate-1.4.5.tar.gz
#Execute the command
- cd docker_standalone-fate-1.4.4
+ cd docker_standalone-fate-1.4.5
bash install_standalone_docker.sh
@@ -98,14 +98,14 @@ Http://hostip:8080.
::
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz
- tar -xzvf standalone-fate-master-1.4.4.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz
+ tar -xzvf standalone-fate-master-1.4.5.tar.gz
3. Enter FATE directory and execute the init.sh.
::
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
source init.sh init
4. Test
@@ -114,7 +114,7 @@ Http://hostip:8080.
::
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
bash ./federatedml/test/run_test.sh
If success, the screen shows like blow:
@@ -127,7 +127,7 @@ Http://hostip:8080.
::
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
python ./examples/toy_example/run_toy_example.py 10000 10000 0
If success, the screen shows like blow:
diff --git a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md
index e79046e157..c6fa12ff0a 100644
--- a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md
+++ b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md
@@ -21,11 +21,11 @@
```
#获取安装包
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz
- tar -xzvf docker_standalone-fate-1.4.4.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz
+ tar -xzvf docker_standalone-fate-1.4.5.tar.gz
#执行部署
- cd docker_standalone-fate-1.4.4
+ cd docker_standalone-fate-1.4.5
bash install_standalone_docker.sh
```
@@ -80,14 +80,14 @@
2. 下载独立版本的压缩包并解压缩。
```
- wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz
- tar -xzvf standalone-fate-master-1.4.4.tar.gz
+ wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz
+ tar -xzvf standalone-fate-master-1.4.5.tar.gz
```
3. 进入FATE目录并执行init.sh。
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
source init.sh init
```
@@ -96,7 +96,7 @@
- 单元测试
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
bash ./federatedml/test/run_test.sh
```
@@ -109,7 +109,7 @@
- Toy测试
```
- cd standalone-fate-master-1.4.4
+ cd standalone-fate-master-1.4.5
python ./examples/toy_example/run_toy_example.py 10000 10000 0
```
diff --git a/tools/debug/check_conf.sh b/tools/debug/check_conf.sh
new file mode 100644
index 0000000000..ecc8ff3f36
--- /dev/null
+++ b/tools/debug/check_conf.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+cwd=$(cd `dirname $0`; pwd)
+source ./check_iplist.sh
+
+cd $EGGROLL_HOME
+
+echo "----------------------$EGGROLL_HOME/conf/eggroll.properties--------------------"
+cat $EGGROLL_HOME/conf/eggroll.properties | grep -v ^# | grep -v ^$
+echo ""
+echo "-----------------------$EGGROLL_HOME/conf/route_table.json---------------------"
+cat $EGGROLL_HOME/conf/route_table.json | grep -v ^# | grep -v ^$
+
+for ip in ${iplist[@]};do
+ echo "------------------diff $ip with ./conf/eggroll.properties-------------------------"
+ ssh $user@$ip "cat $EGGROLL_HOME/conf/eggroll.properties" | diff - conf/eggroll.properties
+ echo ""
+done
+
+cd $cwd
diff --git a/tools/debug/check_env.sh b/tools/debug/check_env.sh
new file mode 100644
index 0000000000..c81f0b44d4
--- /dev/null
+++ b/tools/debug/check_env.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+cwd=$(cd `dirname $0`; pwd)
+
+get_property() {
+ property_value=`grep $1 $2 | cut -d '=' -f 2-`
+ test_value $1 $2 ${property_value}
+}
+
+echo_red() {
+ echo -e "\e[1;31m $1\e[0m"
+}
+
+echo_green() {
+ echo -e "\e[1;32m $1\e[0m"
+}
+
+echo_yellow() {
+ echo -e "\e[1;33m $1\e[0m"
+}
+
+check_max_count() {
+ value=`cat $1`
+ if [ $value -ge 65535 ];then
+ echo_green "[OK] $1 is ok."
+ else
+ echo_red "[ERROR] please check $1, no less than 65535."
+ fi
+}
+
+check_file_count() {
+ value=`cat $1 | grep $2 | awk '{print $4}'`
+ for v in ${value[@]};do
+ test_value $1 $2 $v
+ done
+}
+
+test_value() {
+ if [ $3 -ge 65535 ];then
+ echo_green "[OK] $1 in $2 is ok."
+ else
+ echo_red "[ERROR] please check $1 in $2, no less than 65535."
+ fi
+}
+
+echo_green `date +"%Y-%m-%d_%H:%M:%S"`
+
+echo_green "=============check max user processes============"
+check_max_count "/proc/sys/kernel/threads-max"
+get_property "kernel.pid_max" "/etc/sysctl.conf"
+check_max_count "/proc/sys/kernel/pid_max"
+check_max_count "/proc/sys/vm/max_map_count"
+
+echo_green "=============check max files count=============="
+check_file_count "/etc/security/limits.conf" "nofile"
+check_file_count "/etc/security/limits.d/80-nofile.conf" "nofile"
+get_property "fs.file-max" "/etc/sysctl.conf"
+check_max_count "/proc/sys/fs/file-max"
+
+mem_total=`free -m | grep Mem | awk '{print $2}' | tr -cd "[0-9,.]"`
+mem_used=`free -m | grep Mem | awk '{print $3}' | tr -cd "[0-9],."`
+swap_total=`free -m | grep Swap | awk '{print $2}' | tr -cd "[0-9,.]"`
+swap_used=`free -m | grep Swap | awk '{print $3}' | tr -cd "[0-9,.]"`
+
+echo_green "=============Memory used and total==============="
+echo_yellow "[WARNING] MemTotal:`awk 'BEGIN{printf "%.2f%%\n",('$mem_total'/1024)}'`G, MemUsed:`awk 'BEGIN{printf "%.2f%%\n",('$mem_used'/1024)}'`G, MemUsed%:`awk 'BEGIN{printf "%.2f%%\n",('$mem_used'/'$mem_total')*100}'`"
+echo_green "=============SwapMem used and total==============="
+echo_yellow "[WARNING] SwapTotal:`awk 'BEGIN{printf "%.2f%%\n",('$swap_total'/1024)}'`G, SwapUsed:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/1024)}'`G, SwapUsed%:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/'$swap_total')*100}'`"
+echo_green "=============Disk use and total=================="
+echo_yellow "[WARNING] `df -lh | grep /data`"
+
+
diff --git a/tools/debug/check_iplist.sh b/tools/debug/check_iplist.sh
new file mode 100644
index 0000000000..b753a6382c
--- /dev/null
+++ b/tools/debug/check_iplist.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+user=app
+iplist=(xxx xxx)
diff --git a/tools/debug/cluster_env_check.sh b/tools/debug/cluster_env_check.sh
new file mode 100644
index 0000000000..9848f69c0a
--- /dev/null
+++ b/tools/debug/cluster_env_check.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+cwd=$(cd `dirname $0`; pwd)
+source ./check_iplist.sh
+
+for ip in ${iplist[@]};do
+ if ! ssh -tt $user@$ip test -d "${EGGROLL_HOME}/bin/debug"; then
+ echo "${EGGROLL_HOME}/bin/debug in $ip is not exist, mkdir -p ${EGGROLL_HOME}/bin/debug."
+ ssh -tt $user@$ip "mkdir -p ${EGGROLL_HOME}/bin/debug"
+ fi
+
+ if ! ssh -tt $user@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then
+ echo "${EGGROLL_HOME}/bin/debug/check_env.sh in $ip is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug"
+ scp ./check_env.sh $user@$ip:${EGGROLL_HOME}/bin/debug
+ fi
+ ssh $user@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip
+ echo "The check result from $ip has saved in $cwd/$ip, please check it."
+done
diff --git a/tools/debug/env_check.py b/tools/debug/env_check.py
new file mode 100644
index 0000000000..815ba7cb55
--- /dev/null
+++ b/tools/debug/env_check.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+import re
+import sys
+import json
+import time
+import socket
+import psutil
+import datetime
+import argparse
+import subprocess
+from eggroll.core.session import ErSession
+from eggroll.roll_pair.roll_pair import RollPairContext
+from eggroll.utils.log_utils import get_logger
+
+L = get_logger()
+
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0)
+arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1)
+arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1)
+args = arg_parser.parse_args()
+
+def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0):
+ for i in range(row_limit):
+ if include_key:
+ yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size
+ else:
+ yield str(i) + "s"*value_suffix_size
+
+def round2(x):
+ return str(round(x / 1024 / 1024 / 1024, 2))
+
+def print_red(str):
+ print("\033[1;31;40m\t" + str + "\033[0m")
+
+def print_green(str):
+ print("\033[1;32;40m\t" + str + "\033[0m")
+
+def print_yellow(str):
+ print("\033[1;33;40m\t" + str + "\033[0m")
+
+def check_actual_max_threads():
+ def getMemInfo(fn):
+ def query_cmd(cmd):
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n')
+ return p[0]
+
+ def get_host_ip():
+ try:
+ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ s.connect(('8.8.8.8', 80))
+ ip = s.getsockname()[0]
+ finally:
+ s.close()
+ return ip
+
+ mem = psutil.virtual_memory()
+ mem_total = round2(mem.total)
+ mem_used = round2(mem.used)
+ mem_used_per = str(round(mem.percent)) + '%'
+
+ swap_mem = psutil.swap_memory()
+ swap_total = round2(swap_mem.total)
+ swap_used = round2(swap_mem.used)
+ swap_use_per = str(round(swap_mem.percent)) + '%'
+
+ data_disk = psutil.disk_usage('/data')
+ disk_total = round2(data_disk.total)
+ disk_used = round2(data_disk.used)
+ disk_per = str(round(data_disk.percent)) + '%'
+
+ mem_info = {}
+ mem_info["Ip"] = get_host_ip()
+ mem_info["MemTotal"] = mem_total
+ mem_info["MemUsed"] = mem_used
+ mem_info["MemUsedPCT"] = mem_used_per
+
+ mem_info["SwapTotal"] = swap_total
+ mem_info["SwapUsed"] = swap_used
+ mem_info["SwapUsePCT"] = swap_use_per
+
+ mem_info["DiskTotal"] = disk_total
+ mem_info["DiskUsed"] = disk_used
+ mem_info["DiskUsedPCT"] = disk_per
+
+ mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max")
+ mem_info["/etc/sysctl.conf"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
+ mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max")
+ mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count")
+
+ mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
+ mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
+ mem_info["/etc/sysctl.conf"] = query_cmd("grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'")
+ mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max")
+
+ mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
+ mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
+ mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
+
+ rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
+ if rollsite_pid:
+ rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss
+ myfile = open(sys.path[1] + '/../../../conf/eggroll.properties')
+ properties = myfile.read()
+ jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
+ if len(jvm_options):
+ rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024
+ else:
+ rollsite_total_memory = mem.total
+ myfile.close()
+
+ mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4))
+ else:
+ mem_info["RollsiteUsedPercent"] = 0
+
+
+ return mem_info
+
+ session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
+ try:
+ ctx = RollPairContext(session)
+ rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
+ result = rp.with_stores(func=getMemInfo)
+ print_green(str(datetime.datetime.now()))
+ #print(json.dumps(result, indent=1))
+ for node in result:
+ print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================")
+ print_yellow("[WARNING] MemTotal:" + node[1]["MemTotal"] + "G, MemUsed:" + node[1]["MemUsed"] + "G, MemUsedPCT:" + node[1]["MemUsedPCT"])
+ if float(node[1]["SwapTotal"]) < 128:
+ print_red("[ERROR] The swap memory is:" + node[1]["SwapTotal"] + "G, no less than 128G.")
+ else:
+ print_yellow("[WARNING] SwapTotal:" + node[1]["SwapTotal"] + "G, SwapUsed:" + node[1]["SwapUsed"] + "G, SwapUsePCT:" + node[1]["SwapUsePCT"])
+ print_yellow("[WARNING] DiskTotal:" + node[1]["DiskTotal"] + "G, DiskUsed:" + node[1]["DiskUsed"] + "G, DiskUsedPCT:" + node[1]["DiskUsedPCT"])
+ print_green("--------------Max user processes and max file count----------------------------------------")
+ for key in ["/proc/sys/kernel/threads-max", "/etc/sysctl.conf", "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count", "/etc/security/limits.conf", "/etc/security/limits.d/80-nofile.conf", "/etc/sysctl.conf", "/proc/sys/fs/file-max"]:
+ if int(node[1][key]) > 65535:
+ print_green("[OK] " + key + " = " + node[1][key])
+ else:
+ print_red("[ERROR] please check " + key + " = " + node[1][key] + ", no less than 65535.")
+ print_green("--------------Thread count check-----------------------------------------------------------")
+ if len(node[1]["PoolSize"]) == 0:
+ node[1]["PoolSize"] = 500
+ if int(node[1]["CurrentUseProcesses"]) < int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]):
+ print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"])))
+ else:
+ print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"])))
+ if node[1]["RollsiteUsedPercent"] != 0:
+ print_green("----------Rollsite memory use percent--------------------------------------------------")
+ print_yellow("[WARNING] rollsite memory use: " + node[1]["RollsiteUsedPercent"])
+ print("\n")
+ finally:
+ session.kill()
+
+
+if __name__ == '__main__':
+ if args.time == 0:
+ check_actual_max_threads()
+ else:
+ while 1:
+ check_actual_max_threads()
+ time.sleep(args.time)
diff --git a/tools/debug/env_check.sh b/tools/debug/env_check.sh
new file mode 100644
index 0000000000..3a81cbc4d3
--- /dev/null
+++ b/tools/debug/env_check.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+cwd=$(cd `dirname $0`; pwd)
+nodes=$1
+
+LogLevel=$EGGROLL_LOG_LEVEL
+export EGGROLL_LOG_LEVEL=INFO
+python env_check.py -p $nodes > result_env.log
+export EGGROLL_LOG_LEVEL=$LogLevel
+echo "The check result has saved in $cwd/result_env.log, please check it."
+
diff --git a/tools/debug/grep_logs.sh b/tools/debug/grep_logs.sh
new file mode 100644
index 0000000000..0351e36b4c
--- /dev/null
+++ b/tools/debug/grep_logs.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+cwd=$(cd `dirname $0`; pwd)
+source ./check_iplist.sh
+session=$1
+
+for ip in ${iplist[@]};do
+ mkdir -p $session/$ip
+ scp -r $user@$ip:$EGGROLL_HOME/logs/*$session* $session/$ip
+ echo "The $session logs from $ip has saved in $cwd/$session/$ip, please check it."
+done
+cd $cwd
diff --git a/tools/debug/server_check.py b/tools/debug/server_check.py
new file mode 100644
index 0000000000..d39eda417d
--- /dev/null
+++ b/tools/debug/server_check.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+import re
+import os
+import sys
+import json
+import time
+import socket
+import psutil
+import datetime
+import threading
+import argparse
+import subprocess
+from eggroll.core.session import ErSession
+from eggroll.roll_pair.roll_pair import RollPairContext
+from eggroll.utils.log_utils import get_logger
+
+L = get_logger()
+
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0)
+arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1)
+arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1)
+arg_parser.add_argument("-d","--partyid", type=int, help="host partyid", default=0)
+args = arg_parser.parse_args()
+
+def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0):
+ for i in range(row_limit):
+ if include_key:
+ yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size
+ else:
+ yield str(i) + "s"*value_suffix_size
+
+def round2(x):
+ return str(round(x / 1024 / 1024 / 1024, 2))
+
+def print_red(str):
+ print("\033[1;31;40m\t" + str + "\033[0m")
+
+def print_green(str):
+ print("\033[1;32;40m\t" + str + "\033[0m")
+
+def print_yellow(str):
+ print("\033[1;33;40m\t" + str + "\033[0m")
+
+def check_actual_max_threads():
+ def getMemInfo(fn):
+ def query_cmd(cmd):
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n')
+ return p[0]
+
+ def get_host_ip():
+ try:
+ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ s.connect(('8.8.8.8', 80))
+ ip = s.getsockname()[0]
+ finally:
+ s.close()
+ return ip
+ fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
+ mem_info = {}
+ mem_info["Ip"] = get_host_ip()
+ eggroll_home = query_cmd("echo $EGGROLL_HOME")
+ route_file = eggroll_home + "/conf/route_table.json"
+ f = open(route_file, encoding='utf-8')
+ mem_info["route_table"] = json.load(f)
+ mem_info["data_access"] = query_cmd("ps aux |grep data_access_server |grep -v grep |wc -l")
+ if args.partyid != 0:
+ mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": %s}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'" %(args.partyid))
+ mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0]
+ mem_info["directory"] = query_cmd("if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi")
+ mem_info["services"] = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql']
+ mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client))
+ mem_info["job_wait"] = query_cmd("if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client))
+ mem_info["job_thread"] = []
+ mem_info["jobs"] = query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client))
+ mem_info["job_mem"] = []
+ for job_id in mem_info["jobs"]:
+ mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id))
+ mem_info["job_mem"] = query_cmd("ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id))
+ mem_info["server_mem"] = {}
+ mem_info["thread"] = {}
+ for service in mem_info["services"]:
+ mem_info["thread"][service] = query_cmd("ps -ef |grep %s |grep -v grep |wc -l" %(service))
+ mem_info["server_mem"][service] = str(query_cmd("ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'" %(service)))
+ return mem_info
+
+ session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
+ try:
+ ctx = RollPairContext(session)
+ rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
+ result = rp.with_stores(func=getMemInfo)
+ print_green(str(datetime.datetime.now()))
+ for node in result:
+ print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================")
+ print_green("-------------default route check-------------------------------------------------------")
+ route_table_dict = node[1]["route_table"]
+ if 'default' not in route_table_dict['route_table']:
+ print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!")
+ else:
+ try:
+ ip = route_table_dict['route_table']['default']['default'][0]['ip']
+ port = route_table_dict['route_table']['default']['default'][0]['port']
+ print_green("[OK] eggroll route configured!")
+ print_green("exchange ip:{}, exchange port:{}".format(ip, port))
+ except KeyError:
+ print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!")
+
+ print_green("--------------data_access service check-------------------------------------------------")
+ if int(node[1]["data_access"]) == 0:
+ if int(node[1]["directory"]) == 0:
+ print_red("[ERROR] data_access service and directory not found, please check if it is installed!")
+ else:
+ print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory")
+ else:
+ print_green("[OK] Installed and running data_access service!")
+ if args.partyid != 0:
+ if int(node[1]["data_num"]) == 0 or int(node[1]["data_num"]) == 201:
+ print_green("[OK] Route verification success!")
+ else:
+ print_yellow("[WARNING] data_access service not available, please check host and host route!")
+
+ print_green("--------------fate service check-------------------------------------------------------")
+ for server in node[1]["services"]:
+ if int(node[1]["thread"][server]) > 0:
+ print_green("[OK] the " + server.ljust(23) + " service is running , number of processes is : " + str(node[1]["thread"][server]) + "; used memory : " + str(node[1]["server_mem"][server]) + "KB.")
+ else:
+ print_yellow("[WARNING] the " + server + " service not running, please check service status.")
+
+ print_green("--------------fate_flow jobs process and mem info check--------------------------------------------------")
+ if int(node[1]["job_run"]) == -1:
+ print_red("[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!")
+ else:
+ print_green("[OK] Number of tasks running is " + node[1]["job_run"])
+ print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"])
+ if int(node[1]["job_run"]) > 0:
+ for job_id in node[1]["jobs"].split(" "):
+ print_green("[OK] running task job_id : " + job_id + ", number of egg_pair processes is : " + str(node[1]["job_thread"]) + "; used memory : " + str(node[1]["job_mem"]) + "KB.")
+
+ print("\n")
+ finally:
+ session.kill()
+
+
+if __name__ == '__main__':
+ if args.time == 0:
+ check_actual_max_threads()
+ else:
+ while 1:
+ check_actual_max_threads()
+ time.sleep(args.time)
diff --git a/tools/debug/server_check.sh b/tools/debug/server_check.sh
new file mode 100644
index 0000000000..56cc20416c
--- /dev/null
+++ b/tools/debug/server_check.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+cwd=$(cd `dirname $0`; pwd)
+if [ ! -f 'result_server.log' ];then
+touch result_server.log
+fi
+
+nodes=$1
+party=$2
+LogLevel=$EGGROLL_LOG_LEVEL
+export EGGROLL_LOG_LEVEL=INFO
+if [ -n "$party" ];then
+ python server_check.py -p $nodes -d $party >> result_server.log
+else
+ python server_check.py -p $nodes >> result_server.log
+fi
+export EGGROLL_LOG_LEVEL=$LogLevel
+echo "Check the result in the current directory, Please execute command: cat result_server.log"
diff --git a/tools/debug/test_env.py b/tools/debug/test_env.py
new file mode 100644
index 0000000000..73661f90c0
--- /dev/null
+++ b/tools/debug/test_env.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+import re
+import subprocess
+
+def sub_dict(form_dict, sub_keys, default=None):
+ return dict([(k.strip(), form_dict.get(k.strip(), default)) for k in sub_keys.split(',')])
+
+
+def query_file(file_name, opts=''):
+ mem_info = {}
+ print(file_name)
+ with open(file_name, 'r') as f:
+ data = f.readlines()
+ for i in data:
+ if ':' in i or '=' in i:
+ i = i.replace(':', ',').replace('=', ',')
+ k, v = [x.strip() for x in i.split(',')]
+ mem_info[k] = int(v.split()[0])
+ return sub_dict(mem_info, opts)
+
+
+def query_cmd(cmd, opts=''):
+ if opts:
+ opts = " | grep -E '" + opts.replace(',', '|').replace(' ', '') + "'"
+ print(cmd + opts)
+ p = subprocess.Popen(cmd + opts, stdout=subprocess.PIPE, shell=True)
+ return p.communicate()[0]
+
+def query(cmd, opts='', flags=True):
+ if flags:
+ print(str(query_cmd(cmd, opts)))
+ else:
+ print(str(query_file(cmd, opts)))
+
+if __name__ == "__main__":
+ max_user_processes_params=[('cat /proc/sys/kernel/threads-max',),('/etc/sysctl.conf', 'kernel.pid_max', False),('cat /proc/sys/kernel/pid_max',),('cat /proc/sys/vm/max_map_count',)]
+ print('==============max user processes===============')
+ for p in max_user_processes_params:
+ s = query(*p)
+
+ max_files_count_params=[('cat /etc/security/limits.conf', 'nofile'),('cat /etc/security/limits.d/80-nofile.conf',),('/etc/sysctl.conf','fs.file-max', False),('cat /proc/sys/fs/file-max',)]
+ print('===============max files count=================')
+ for i in max_files_count_params:
+ query(*i)
+
+ memory_params=('/proc/meminfo', 'MemTotal, MemFree, MemAvailable, SwapTotal, SwapFree', False)
+ print('================memory info====================')
+ query(*memory_params)
+
+ disk_params=('df -lh', '/dev/vdb,/dev/vda1')
+ print('================disk info====================')
+ query(*disk_params)
+
diff --git a/tools/debug/time_check.py b/tools/debug/time_check.py
new file mode 100644
index 0000000000..51c5801879
--- /dev/null
+++ b/tools/debug/time_check.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+import os
+import time
+import argparse
+
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0)
+args = arg_parser.parse_args()
+
+if args.time == 0:
+ os.system('sh ./cluster_env_check.sh')
+else:
+ while 1:
+ os.system('sh ./cluster_env_check.sh')
+ time.sleep(args.time)
diff --git "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md"
new file mode 100644
index 0000000000..2038034f8b
--- /dev/null
+++ "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md"
@@ -0,0 +1,394 @@
+# 脚本使用说明
+
+## 一 概述
+
+本工具集提供4个工具,功能如下:
+
+| 工具名称 | 工具功能 | 使用场景 |
+| ---------------- | ------------------------------------------ | -------------------- |
+| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完成并启动服务后 |
+| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 |
+| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 |
+| 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 |
+
+名词解释:
+
+| 名词 | 解释 |
+| ----------------------- | -------------------------------------------------------- |
+| $FATE_HOME | 通常在/data/projects/fate |
+| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll |
+| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 |
+| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 |
+| ${需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 |
+
+
+
+## 二 机器基础信息检测
+
+### 2.1 使用场景
+
+------
+
+此脚本在完成部署并正常启动服务后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。
+
+### 2.2 工具功能
+
+------
+
+此检测工具提供两种版本:
+
+- 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项;
+
+- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,**需支持节点间免密登录**。
+
+1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比
+
+2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比
+
+3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比
+
+4)检查系统最大用户进程数
+
+5)检查最大文件数
+
+6)检查线程数设置:检查egg pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足
+
+7)检查rollsite进程堆内存是否充足:
+
+### 2.3 使用方法
+
+------
+
+#### 2.3.1 单集群版
+
+```shell
+source $FATE_HOME/init_env.sh
+cd $EGGROLL_HOME/bin/debug
+sh env_check.sh ${集群节点个数}
+cat result_env.log
+```
+
+若对几个$开头的变量有疑问,请参考概述中的名词解释。
+
+#### 2.3.2 跨集群版
+
+**需支持节点间免密scp、ssh操作,也可以手动输入密码执行**
+
+1、设置环境变量:
+
+```shell
+source $FATE_HOME/init_env.sh
+```
+
+2、编辑配置文件:
+
+```
+cd $EGGROLL_HOME/bin/debug
+vi check_iplist.sh
+```
+
+参数说明:
+
+```shell
+user=app <远程登录用户名>
+iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表>
+```
+
+3、执行检测脚本:
+
+```python
+python time_check.py
+//查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息
+cat ./$ip
+```
+
+//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出
+
+```
+python time_check.py -t {检测间隔秒数,不填只检测一次}
+```
+
+### 2.4 检测结果说明
+
+------
+
+返回示例信息如下:
+
+*说明:以下信息分为三种提示等级:*
+
+*[OK\] 表示该检查项正常;*
+
+*[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;*
+
+*[ERROR\]表示该项不符合预期结果,需要按提示修改。*
+
+```properties
+//脚本执行时间
+ 2020-09-02 15:00:41.424053
+//返回的节点ip
+ ==============This is node 0:127.0.0.1===========================================
+//系统内存总量、系统内存使用量、系统内存使用占比
+ [WARNING] MemTotal:78.51G, MemUsed:11.5G, MemUsedPCT:15%
+//虚拟内存总量、虚拟内存使用量、虚拟内存使用占比,若小于128G,则提示ERROR,如下所示:
+ [ERROR] The swap memory is:32.0G, no less than 128G. <虚拟内存不足
+ [WARNING] SwapTotal:128G, SwapUsed:16.51G, SwapUsedPCT:12.3% <虚拟内存正常
+//磁盘总量、磁盘使用量、磁盘使用占比
+ [WARNING] DiskTotal:984.18G, DiskUsed:566.53G, DiskUsedPCT:61%
+ --------------Max user processes and max file count------------------------------
+//最大用户进程数与最大文件数各个文件设置值展示,其中不满足65535的项则报[ERROR]提示:
+ [OK] /proc/sys/kernel/threads-max = 642956
+ [OK] /etc/sysctl.conf = 1048576
+ [OK] /proc/sys/kernel/pid_max = 131072
+ [ERROR] please check /proc/sys/vm/max_map_count = 65530, no less than 65535.
+ [OK] /etc/security/limits.conf = 102401
+ [OK] /etc/security/limits.d/80-nofile.conf = 131072
+ [OK] /etc/sysctl.conf = 1048576
+ [OK] /proc/sys/fs/file-max = 1048576
+ --------------Thread count check-------------------------------------------------
+//判断eggroll.properties中eggroll.rollpair.eggpair.server.executor.pool.max.size配置项设置的线程值是否充足,若不充足,则报[ERROR]提示需要调大线程值
+ [OK] The thread count = 1406, the total processes = 16 * 500 = 8000
+ ----------Rollsite memory use percent--------------------------------------------
+//展示rollsite进程占用堆内存与rollsite设置内存上限比值,以判断rollsite内存是否充足,若百分比偏大,则需考虑释放rollsite内存或调高rollsite内存上限
+ [WARNING] rollsite memory use: 0.69%
+```
+
+
+
+## 三 fate运行信息检测
+
+### 3.1 使用场景
+
+------
+
+跑fate任务前,检测fate运行信息。验证机器当前状态是否适合新建一个fate任务
+
+### 3.2 工具功能
+
+------
+
+检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。
+
+### 3.3 使用方法
+
+```
+source $FATE_HOME/init_env.sh //FATE_HOME为用户环境的fate目录
+cd $EGGROLL_HOME/bin/debug
+sh server_check.sh ${集群内节点个数} ${host party_id(可选)}
+例:sh server_check.sh 1 10000
+```
+
+可选参数:
+
+    {host party_id} //当需要检查data_access的服务是否可用时使用。若不提供该参数则不检测。
+
+结果保存在result_server.log文件中
+
+### 3.4 检测结果说明
+
+------
+
+#### 3.4.1 default route check(eggroll路由是不是默认路由)
+
+- 检测通过提示:
+
+ [OK] eggroll route configured!
+
+ "port": 9370, "ip": "127.0.0.1"
+
+- 检测失败提示:
+
+ [ERROR] eggroll route is not configured, please check /data/projects/fate/eggroll/conf/route_table.json file if it is existed!
+
+- 检查方法:
+
+ 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。
+
+
+
+#### 3.4.2 data_access service check(是否已安装data access)
+
+- 检测通过提示:
+
+ [OK] Installed and running data_access service!
+
+- 检测失败提示:
+
+ [ERROR] data_access service and directory not found, please check if it is installed!
+
+- 检查方法:
+
+ 先检查data_access 进程是否存在或者目录是否存在。若存在,会进一步检查data_access 服务是否可用。详细逻辑是:
+
+ ```
+ 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING;
+
+ 若返回进程数大于0,判断路由验证返回码,如果返回 "status":0,或 "status":201,则说明 DataAccess 服务以及路由表配置没有问题,否则提示WARNING检查路由设置
+ ```
+
+#### 3.4.3 fate service check(fate服务状态、进程数及占用内存)
+
+- 检测通过提醒:
+
+ [OK] the service is running , number of processes is :; used memory:
+
+- 检测失败提醒:
+
+ [WARNING] the service not running, please check service status.
+
+- 检查方法:
+
+ 检查服务列表:
+
+ 'ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'
+
+ 检查进程数方法:
+
+ ```
+ thread = ps -ef |grep service |grep -v grep |wc -l
+ ```
+
+ 检查服务占用内存方法:
+
+ ```
+ server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'
+ ```
+
+
+#### 3.4.4 fate_flow jobs process and mem info check(job任务数检测、job任务进程及占用内存)
+- 检测通过提醒:
+
+ [OK] Number of tasks running is xxx
+
+ [OK] Number of tasks waiting is xxx
+
+ [OK] running task job_id :xxx ,number of egg_pair processes is :xxx; used memory:xxx
+
+- 检测失败提醒:
+
+ [ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!
+
+- 检查方法:
+
+ 通过FATE自带的fate_flow_client 命令查看任务相关信息,通过ps命令查看内存相关信息。
+
+
+
+## 四 日志搜集
+
+### 4.1 使用场景
+
+------
+
+适用于跑任务出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。
+
+### 4.2 工具功能
+
+------
+
+拉取指定ip:$EGGROLL_HOME/logs目录下带传入关键字的目录到本机当前目录下
+
+### 4.3 使用方法
+
+**需支持节点间免密scp、ssh操作,也可以手动输入密码执行**
+
+1、设置环境变量:
+
+```shell
+source $FATE_HOME/init_env.sh
+```
+
+2、编辑配置文件:
+
+```
+cd $EGGROLL_HOME/bin/debug
+vi check_iplist.sh
+```
+
+参数说明:
+
+```shell
+user=app <远程登录用户名>
+iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表>
+```
+
+3、执行检测脚本:
+
+```shell
+sh grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id,支持模糊查询>
+```
+
+执行后该session-id的各个ip的日志便会搜集到当前目录下的$session-id/$ip目录下
+
+### 4.4 结果说明
+
+------
+
+执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。
+
+
+
+## 五 集群配置检测
+
+### 5.1 使用场景
+
+------
+
+适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。
+
+### 5.2 工具功能
+
+------
+
+拉取指定ip的eggroll.properties、route_table.json配置到本机展示。
+
+### 5.3 使用方法
+
+**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以**
+
+------
+
+1、设置环境变量:
+
+```shell
+source $FATE_HOME/init_env.sh
+```
+
+2、编辑配置文件:
+
+```
+cd $EGGROLL_HOME/bin/debug
+vi check_iplist.sh
+```
+
+参数说明:
+
+```shell
+user=app <远程登录用户名>
+iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表>
+```
+
+3、然后执行脚本:
+
+```shell
+sh check_conf.sh
+```
+
+### 5.4 结果说明
+
+------
+
+该脚本展示配置所有ip与本机的配置对比,说明如下:
+
+```properties
+//展示本机eggroll.properties配置信息
+----------------------$EGGROLL_HOME/conf/eggroll.properties--------------------
+//展示本机route_table.json配置信息
+-----------------------$EGGROLL_HOME/conf/route_table.json---------------------
+//展示ip列表中第一个ip配置与本机的diff结果,若为空则完全相同
+------------------diff $ip1 with ./conf/eggroll.properties-------------------------
+//展示ip列表中第二个ip配置与本机的diff结果,若为空则完全相同
+------------------diff $ip2 with ./conf/eggroll.properties-------------------------
+//展示ip列表中第三个ip配置与本机的diff结果,若为空则完全相同
+------------------diff $ip3 with ./conf/eggroll.properties-------------------------
+```
+