From e88bc8dec12086f4ed23c66f0480b3ada4fabaae Mon Sep 17 00:00:00 2001 From: weiwee Date: Fri, 7 Aug 2020 15:47:55 +0800 Subject: [PATCH 01/35] shutdown log before kill pid Signed-off-by: weiwee --- arch/api/impl/based_2x/federation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/api/impl/based_2x/federation.py b/arch/api/impl/based_2x/federation.py index b19bd6eed2..e4fcb99171 100644 --- a/arch/api/impl/based_2x/federation.py +++ b/arch/api/impl/based_2x/federation.py @@ -135,10 +135,18 @@ def done_callback(fut): except Exception as e: import os import signal + import traceback + import logging + import sys + exc_info = sys.exc_info() + traceback.print_exception(*exc_info) pid = os.getpid() LOGGER.exception(f"remote fail, terminating process(pid={pid})") - os.kill(pid, signal.SIGTERM) - raise e + try: + logging.shutdown() + finally: + os.kill(pid, signal.SIGTERM) + raise e for future in futures: future.add_done_callback(done_callback) From e7e0b2288b00da61134a366a3b65bea7b43f358d Mon Sep 17 00:00:00 2001 From: zengjice Date: Wed, 2 Sep 2020 16:12:47 +0800 Subject: [PATCH 02/35] add debug tools Signed-off-by: zengjice --- ...77\347\224\250\350\257\264\346\230\216.md" | 321 ++++++++++++++++++ 1 file changed, 321 insertions(+) create mode 100644 "bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" new file mode 100644 index 0000000000..d8e7225564 --- /dev/null +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -0,0 +1,321 @@ +# 脚本使用说明 + +## 一 概述 + +本工具集提供4个工具,功能如下: + +| 工具名称 | 工具功能 | 使用场景 | +| ---------------- | ------------------------------------------ | ---------------- | +| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完fate后 | +| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 
启动fate任务前 | +| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | +| 集群配置检测 | 搜集展示集群的配置文件信息 | 跑任务出现错误后 | + + + +## 二 机器基础信息检测 + +### 2.1 使用场景 + +此脚本在部署完fate后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 + +### 2.2 工具功能 + +1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 + +2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比 + +3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比 + +4)检查系统最大用户进程数:检查内容包括以下文件 + +```shell +cat /proc/sys/kernel/threads-max +cat /etc/sysctl.conf +cat /proc/sys/kernel/pid_max +cat /proc/sys/vm/max_map_count +``` + +5)检查最大文件数:检查内容包括以下文件 + +```shell +cat /etc/security/limits.conf +cat /etc/security/limits.d/80-nofile.conf +cat /etc/sysctl.conf +cat /proc/sys/fs/file-max +``` + +6)检查线程数设置:检查egg pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足 + +检查方法: + +```properties +P = pstree -p ps -e |grep egg_pair |awk '{print $1}' |wc -l(统计服务器所有egg pair进程所用线程数) + +n = eggroll.session.processors.per.node(默认16) + +r = eggroll.rollpair.eggpair.server.executor.pool.max.size(默认值500) + +M = n * r + +若所得线程数P +vi check_iplist.sh +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +然后执行拉取脚本: + +```shell +sh grep_logs.sh ${需要查询的session_id} <带上需要搜集的session_id,支持模糊查询> + +<执行后该session_id的各个ip的日志便会搜集到当前目录下的$session_id/$ip目录下> +``` + +### 4.4 结果说明 + +执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。 + +## 五 集群配置检测 + +### 5.1 使用场景 + +适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。 + +### 5.2 工具功能 + +拉取指定ip的eggroll.properties、route_table.json配置到本机展示。 + +### 5.3 使用方法 + +先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: + +```shell +cd $EGGROLL_HOME/bin/debug + +<配置iplist以及EGGROLL_HOME> +vi check_iplist.sh +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +然后执行脚本: + +```shell +sh check_conf.sh +``` + +### 5.4 结果说明 + +该脚本展示配置所有ip与本机的配置对比,说明如下: + +```properties +//展示本机eggroll.properties配置信息 
+----------------------$EGGROLL_HOME/conf/eggroll.properties-------------------- +//展示本机route_table.json配置信息 +-----------------------$EGGROLL_HOME/conf/route_table.json--------------------- +//展示ip列表中第一个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip1 with ./conf/eggroll.properties------------------------- +//展示ip列表中第二个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip2 with ./conf/eggroll.properties------------------------- +//展示ip列表中第三个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip3 with ./conf/eggroll.properties------------------------- +``` + From 185dafe569a3cd4b987b5e418ee27297b412571d Mon Sep 17 00:00:00 2001 From: zengjice Date: Wed, 2 Sep 2020 16:29:38 +0800 Subject: [PATCH 03/35] upgrade eggroll version Signed-off-by: zengjice --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 7c9dcff1f0..8d32dbed61 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,4 +5,4 @@ [submodule "eggroll"] path = eggroll url = https://github.com/WeBankFinTech/eggroll.git - branch = v2.0.1 + branch = release-2.0.2-build-5 From 60027d10954534a3d0985d096088f38f98b0f4ea Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 10:44:35 +0800 Subject: [PATCH 04/35] add check scripts --- bin/debug/check_conf.sh | 35 ++++ bin/debug/check_env.sh | 86 +++++++++ bin/debug/check_iplist.sh | 19 ++ bin/debug/cluster_env_check.sh | 31 ++++ bin/debug/env_check.py | 175 ++++++++++++++++++ bin/debug/env_check.sh | 25 +++ bin/debug/grep_logs.sh | 24 +++ bin/debug/server_check.py | 157 ++++++++++++++++ bin/debug/test_env.py | 67 +++++++ bin/debug/time_check.py | 29 +++ ...77\347\224\250\350\257\264\346\230\216.md" | 144 ++++++++++---- 11 files changed, 756 insertions(+), 36 deletions(-) create mode 100644 bin/debug/check_conf.sh create mode 100644 bin/debug/check_env.sh create mode 100644 bin/debug/check_iplist.sh create mode 100644 bin/debug/cluster_env_check.sh create mode 100644 bin/debug/env_check.py 
create mode 100644 bin/debug/env_check.sh create mode 100644 bin/debug/grep_logs.sh create mode 100644 bin/debug/server_check.py create mode 100644 bin/debug/test_env.py create mode 100644 bin/debug/time_check.py diff --git a/bin/debug/check_conf.sh b/bin/debug/check_conf.sh new file mode 100644 index 0000000000..ecc8ff3f36 --- /dev/null +++ b/bin/debug/check_conf.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh + +cd $EGGROLL_HOME + +echo "----------------------$EGGROLL_HOME/conf/eggroll.properties--------------------" +cat $EGGROLL_HOME/conf/eggroll.properties | grep -v ^# | grep -v ^$ +echo "" +echo "-----------------------$EGGROLL_HOME/conf/route_table.json---------------------" +cat $EGGROLL_HOME/conf/route_table.json | grep -v ^# | grep -v ^$ + +for ip in ${iplist[@]};do + echo "------------------diff $ip with ./conf/eggroll.properties-------------------------" + ssh $user@$ip "cat $EGGROLL_HOME/conf/eggroll.properties" | diff - conf/eggroll.properties + echo "" +done + +cd $cwd diff --git a/bin/debug/check_env.sh b/bin/debug/check_env.sh new file mode 100644 index 0000000000..c81f0b44d4 --- /dev/null +++ b/bin/debug/check_env.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) + +get_property() { + property_value=`grep $1 $2 | cut -d '=' -f 2-` + test_value $1 $2 ${property_value} +} + +echo_red() { + echo -e "\e[1;31m $1\e[0m" +} + +echo_green() { + echo -e "\e[1;32m $1\e[0m" +} + +echo_yellow() { + echo -e "\e[1;33m $1\e[0m" +} + +check_max_count() { + value=`cat $1` + if [ $value -ge 65535 ];then + echo_green "[OK] $1 is ok." + else + echo_red "[ERROR] please check $1, no less than 65535." + fi +} + +check_file_count() { + value=`cat $1 | grep $2 | awk '{print $4}'` + for v in ${value[@]};do + test_value $1 $2 $v + done +} + +test_value() { + if [ $3 -ge 65535 ];then + echo_green "[OK] $1 in $2 is ok." + else + echo_red "[ERROR] please check $1 in $2, no less than 65535." 
+ fi +} + +echo_green `date +"%Y-%m-%d_%H:%M:%S"` + +echo_green "=============check max user processes============" +check_max_count "/proc/sys/kernel/threads-max" +get_property "kernel.pid_max" "/etc/sysctl.conf" +check_max_count "/proc/sys/kernel/pid_max" +check_max_count "/proc/sys/vm/max_map_count" + +echo_green "=============check max files count==============" +check_file_count "/etc/security/limits.conf" "nofile" +check_file_count "/etc/security/limits.d/80-nofile.conf" "nofile" +get_property "fs.file-max" "/etc/sysctl.conf" +check_max_count "/proc/sys/fs/file-max" + +mem_total=`free -m | grep Mem | awk '{print $2}' | tr -cd "[0-9,.]"` +mem_used=`free -m | grep Mem | awk '{print $3}' | tr -cd "[0-9],."` +swap_total=`free -m | grep Swap | awk '{print $2}' | tr -cd "[0-9,.]"` +swap_used=`free -m | grep Swap | awk '{print $3}' | tr -cd "[0-9,.]"` + +echo_green "=============Memory used and total===============" +echo_yellow "[WARNING] MemTotal:`awk 'BEGIN{printf "%.2f%%\n",('$mem_total'/1024)}'`G, MemUsed:`awk 'BEGIN{printf "%.2f%%\n",('$mem_used'/1024)}'`G, MemUsed%:`awk 'BEGIN{printf "%.2f%%\n",('$mem_used'/'$mem_total')*100}'`" +echo_green "=============SwapMem used and total===============" +echo_yellow "[WARNING] SwapTotal:`awk 'BEGIN{printf "%.2f%%\n",('$swap_total'/1024)}'`G, SwapUsed:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/1024)}'`G, SwapUsed%:`awk 'BEGIN{printf "%.2f%%\n",('$swap_used'/'$swap_total')*100}'`" +echo_green "=============Disk use and total==================" +echo_yellow "[WARNING] `df -lh | grep /data`" + + diff --git a/bin/debug/check_iplist.sh b/bin/debug/check_iplist.sh new file mode 100644 index 0000000000..b753a6382c --- /dev/null +++ b/bin/debug/check_iplist.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +user=app +iplist=(xxx xxx) diff --git a/bin/debug/cluster_env_check.sh b/bin/debug/cluster_env_check.sh new file mode 100644 index 0000000000..196cadbda0 --- /dev/null +++ b/bin/debug/cluster_env_check.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh + +for ip in ${iplist[@]};do + if [[ ! -d "${EGGROLL_HOME}/bin/debug" ]]; then + echo "${EGGROLL_HOME}/bin/debug in $ip is not exist, mkdir -p ${EGGROLL_HOME}/bin/debug." + mkdir -p ${EGGROLL_HOME}/bin/debug + fi + + if ! 
ssh -tt app@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then + echo "check_env.sh in $ip:${EGGROLL_HOME}/bin/debug is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" + scp ./check_env.sh $user@$ip:${EGGROLL_HOME}/bin/debug + fi + ssh app@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip +done diff --git a/bin/debug/env_check.py b/bin/debug/env_check.py new file mode 100644 index 0000000000..815ba7cb55 --- /dev/null +++ b/bin/debug/env_check.py @@ -0,0 +1,175 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# +import re +import sys +import json +import time +import socket +import psutil +import datetime +import argparse +import subprocess +from eggroll.core.session import ErSession +from eggroll.roll_pair.roll_pair import RollPairContext +from eggroll.utils.log_utils import get_logger + +L = get_logger() + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) +arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +args = arg_parser.parse_args() + +def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): + for i in range(row_limit): + if include_key: + yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size + else: + yield str(i) + "s"*value_suffix_size + +def round2(x): + return str(round(x / 1024 / 1024 / 1024, 2)) + +def print_red(str): + print("\033[1;31;40m\t" + str + "\033[0m") + +def print_green(str): + print("\033[1;32;40m\t" + str + "\033[0m") + +def print_yellow(str): + print("\033[1;33;40m\t" + str + "\033[0m") + +def check_actual_max_threads(): + def getMemInfo(fn): + def query_cmd(cmd): + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') + return p[0] + + def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + + mem = psutil.virtual_memory() + mem_total = round2(mem.total) + mem_used = round2(mem.used) + mem_used_per = str(round(mem.percent)) + '%' + + swap_mem = psutil.swap_memory() + swap_total = round2(swap_mem.total) + swap_used = round2(swap_mem.used) + swap_use_per = str(round(swap_mem.percent)) + '%' + + data_disk = psutil.disk_usage('/data') + disk_total = 
round2(data_disk.total) + disk_used = round2(data_disk.used) + disk_per = str(round(data_disk.percent)) + '%' + + mem_info = {} + mem_info["Ip"] = get_host_ip() + mem_info["MemTotal"] = mem_total + mem_info["MemUsed"] = mem_used + mem_info["MemUsedPCT"] = mem_used_per + + mem_info["SwapTotal"] = swap_total + mem_info["SwapUsed"] = swap_used + mem_info["SwapUsePCT"] = swap_use_per + + mem_info["DiskTotal"] = disk_total + mem_info["DiskUsed"] = disk_used + mem_info["DiskUsedPCT"] = disk_per + + mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max") + mem_info["/etc/sysctl.conf"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'") + mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max") + mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count") + + mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'") + mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'") + mem_info["/etc/sysctl.conf"] = query_cmd("grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'") + mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max") + + mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l") + mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'") + mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'") + + rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'") + if rollsite_pid: + rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss + myfile = open(sys.path[1] + 
'/../../../conf/eggroll.properties') + properties = myfile.read() + jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties) + if len(jvm_options): + rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024 + else: + rollsite_total_memory = mem.total + myfile.close() + + mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4)) + else: + mem_info["RollsiteUsedPercent"] = 0 + + + return mem_info + + session = ErSession(options={"eggroll.session.processors.per.node": args.nodes}) + try: + ctx = RollPairContext(session) + rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) + result = rp.with_stores(func=getMemInfo) + print_green(str(datetime.datetime.now())) + #print(json.dumps(result, indent=1)) + for node in result: + print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") + print_yellow("[WARNING] MemTotal:" + node[1]["MemTotal"] + "G, MemUsed:" + node[1]["MemUsed"] + "G, MemUsedPCT:" + node[1]["MemUsedPCT"]) + if float(node[1]["SwapTotal"]) < 128: + print_red("[ERROR] The swap memory is:" + node[1]["SwapTotal"] + "G, no less than 128G.") + else: + print_yellow("[WARNING] SwapTotal:" + node[1]["SwapTotal"] + "G, SwapUsed:" + node[1]["SwapUsed"] + "G, SwapUsePCT:" + node[1]["SwapUsePCT"]) + print_yellow("[WARNING] DiskTotal:" + node[1]["DiskTotal"] + "G, DiskUsed:" + node[1]["DiskUsed"] + "G, DiskUsedPCT:" + node[1]["DiskUsedPCT"]) + print_green("--------------Max user processes and max file count----------------------------------------") + for key in ["/proc/sys/kernel/threads-max", "/etc/sysctl.conf", "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count", "/etc/security/limits.conf", "/etc/security/limits.d/80-nofile.conf", "/etc/sysctl.conf", "/proc/sys/fs/file-max"]: + if int(node[1][key]) > 65535: + print_green("[OK] " + key + " = " + node[1][key]) + else: + print_red("[ERROR] 
please check " + key + " = " + node[1][key] + ", no less than 65535.") + print_green("--------------Thread count check-----------------------------------------------------------") + if len(node[1]["PoolSize"]) == 0: + node[1]["PoolSize"] = 500 + if int(node[1]["CurrentUseProcesses"]) < int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]): + print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]))) + else: + print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (node[1]["CurrentUseProcesses"], node[1]["NodeProcessors"] ,node[1]["PoolSize"], int(node[1]["NodeProcessors"]) * int(node[1]["PoolSize"]))) + if node[1]["RollsiteUsedPercent"] != 0: + print_green("----------Rollsite memory use percent--------------------------------------------------") + print_yellow("[WARNING] rollsite memory use: " + node[1]["RollsiteUsedPercent"]) + print("\n") + finally: + session.kill() + + +if __name__ == '__main__': + if args.time == 0: + check_actual_max_threads() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) diff --git a/bin/debug/env_check.sh b/bin/debug/env_check.sh new file mode 100644 index 0000000000..8ceec7a935 --- /dev/null +++ b/bin/debug/env_check.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +nodes=$1 + +LogLevel=$EGGROLL_LOG_LEVEL +export EGGROLL_LOG_LEVEL=INFO +python env_check.py -p $nodes > result_env.log +export EGGROLL_LOG_LEVEL=$LogLevel + + diff --git a/bin/debug/grep_logs.sh b/bin/debug/grep_logs.sh new file mode 100644 index 0000000000..09835b7874 --- /dev/null +++ b/bin/debug/grep_logs.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +source ./check_iplist.sh + +for ip in ${iplist[@]};do + mkdir -p $1/$ip + scp -r $user@$ip:$EGGROLL_HOME/logs/*$1* $1/$ip +done +cd $cwd diff --git a/bin/debug/server_check.py b/bin/debug/server_check.py new file mode 100644 index 0000000000..80d2f432e5 --- /dev/null +++ b/bin/debug/server_check.py @@ -0,0 +1,157 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import re +import sys +import json +import time +import socket +import psutil +import datetime +import threading +import argparse +import subprocess +from eggroll.core.session import ErSession +from eggroll.roll_pair.roll_pair import RollPairContext +from eggroll.utils.log_utils import get_logger + +L = get_logger() + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) +arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +args = arg_parser.parse_args() + +def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): + for i in range(row_limit): + if include_key: + yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size + else: + yield str(i) + "s"*value_suffix_size + +def round2(x): + return str(round(x / 1024 / 1024 / 1024, 2)) + +def print_red(str): + print("\033[1;31;40m\t" + str + "\033[0m") + +def print_green(str): + print("\033[1;32;40m\t" + str + "\033[0m") + +def print_yellow(str): + print("\033[1;33;40m\t" + str + "\033[0m") + +def check_actual_max_threads(): + def getMemInfo(fn): + def query_cmd(cmd): + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') + return p[0] + + def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + 
ip = s.getsockname()[0] + finally: + s.close() + return ip + fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" + mem_info = {} + mem_info["Ip"] = get_host_ip() + mem_info["route_table"] = query_cmd("if [ -f $EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/conf/route_table.json |grep -E 'ip|port'`); echo ${array[@]}; else echo 0; fi") + mem_info["data_access"] = query_cmd("ps aux |grep data_access_server |grep -v grep |wc -l") + mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": 10000}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'") + mem_info["directory"] = query_cmd("if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi") + mem_info["services"] = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] + mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_wait"] = query_cmd("if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_thread"] = [] + mem_info["jobs"] = query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client)) + mem_info["job_mem"] = [] + mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0] + for job_id in mem_info["jobs"]: + mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id)) + mem_info["job_mem"] = query_cmd("ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id)) + mem_info["server_mem"] = {} + mem_info["thread"] = {} + for service in mem_info["services"]: + 
mem_info["thread"][service] = query_cmd("ps -ef |grep %s |grep -v grep |wc -l" %(service)) + mem_info["server_mem"][service] = str(query_cmd("ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'" %(service))) + return mem_info + + session = ErSession(options={"eggroll.session.processors.per.node": args.nodes}) + try: + ctx = RollPairContext(session) + rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) + result = rp.with_stores(func=getMemInfo) + print_green(str(datetime.datetime.now())) + for node in result: + print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") + print_green("-------------default route check-------------------------------------------------------") + if node[1]["route_table"] == 0: + print_red("[ERROR] eggroll route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + else: + print_green("[OK] eggroll route configured!") + print_green(node[1]["route_table"]) + + print_green("--------------data_access service check-------------------------------------------------") + if int(node[1]["data_access"]) == 0: + if int(node[1]["directory"]) == 0: + print_red("[ERROR] data_access service and directory not found, please check if it is installed!") + else: + print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory") + else: + if int(node[1]["data_num"]) == 0 or int(node[1]["data_num"]) == 201: + print_green(node[1]["data_test"]) + print_green("[OK] Installed and running data_access service!") + else: + print_yellow(node[1]["data_test"]) + print_yellow("[WARNING] data_access service not available, please check host and host route!") + + print_green("--------------fate service check-------------------------------------------------------") + for server in node[1]["services"]: + if int(node[1]["thread"][server]) > 0: + print_green("[OK] 
the " + server.ljust(23) + " service is running , number of processes is : " + str(node[1]["thread"][server]) + "; used memory : " + str(node[1]["server_mem"][server]) + "KB.") + else: + print_yellow("[WARNING] the " + server + " service not running, please check service status.") + + print_green("--------------fate_flow jobs process and mem info check--------------------------------------------------") + if int(node[1]["job_run"]) == -1: + print_red("[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!") + else: + print_green("[OK] Number of tasks running is " + node[1]["job_run"]) + print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"]) + if int(node[1]["job_run"]) > 0: + for job_id in node[1]["jobs"].split(" "): + print_green("[OK] running task job_id : " + job_id + " run " + str(node[1]["job_thread"]) + " processes; used memory : " + str(node[1]["job_mem"]) + "KB.") + + print("\n") + finally: + session.kill() + + +if __name__ == '__main__': + if args.time == 0: + check_actual_max_threads() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) +s() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) diff --git a/bin/debug/test_env.py b/bin/debug/test_env.py new file mode 100644 index 0000000000..73661f90c0 --- /dev/null +++ b/bin/debug/test_env.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import re +import subprocess + +def sub_dict(form_dict, sub_keys, default=None): + return dict([(k.strip(), form_dict.get(k.strip(), default)) for k in sub_keys.split(',')]) + + +def query_file(file_name, opts=''): + mem_info = {} + print(file_name) + with open(file_name, 'r') as f: + data = f.readlines() + for i in data: + if ':' in i or '=' in i: + i = i.replace(':', ',').replace('=', ',') + k, v = [x.strip() for x in i.split(',')] + mem_info[k] = int(v.split()[0]) + return sub_dict(mem_info, opts) + + +def query_cmd(cmd, opts=''): + if opts: + opts = " | grep -E '" + opts.replace(',', '|').replace(' ', '') + "'" + print(cmd + opts) + p = subprocess.Popen(cmd + opts, stdout=subprocess.PIPE, shell=True) + return p.communicate()[0] + +def query(cmd, opts='', flags=True): + if flags: + print(str(query_cmd(cmd, opts))) + else: + print(str(query_file(cmd, opts))) + +if __name__ == "__main__": + max_user_processes_params=[('cat /proc/sys/kernel/threads-max',),('/etc/sysctl.conf', 'kernel.pid_max', False),('cat /proc/sys/kernel/pid_max',),('cat /proc/sys/vm/max_map_count',)] + print('==============max user processes===============') + for p in max_user_processes_params: + s = query(*p) + + max_files_count_params=[('cat /etc/security/limits.conf', 'nofile'),('cat /etc/security/limits.d/80-nofile.conf',),('/etc/sysctl.conf','fs.file-max', False),('cat /proc/sys/fs/file-max',)] + print('===============max files count=================') + for i in max_files_count_params: + query(*i) + + memory_params=('/proc/meminfo', 'MemTotal, MemFree, MemAvailable, SwapTotal, SwapFree', False) + print('================memory info====================') + query(*memory_params) + + disk_params=('df -lh', '/dev/vdb,/dev/vda1') + print('================disk info====================') + query(*disk_params) + diff --git a/bin/debug/time_check.py b/bin/debug/time_check.py new file mode 
100644 index 0000000000..51c5801879 --- /dev/null +++ b/bin/debug/time_check.py @@ -0,0 +1,29 @@ +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import os +import time +import argparse + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +args = arg_parser.parse_args() + +if args.time == 0: + os.system('sh ./cluster_env_check.sh') +else: + while 1: + os.system('sh ./cluster_env_check.sh') + time.sleep(args.time) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index d8e7225564..83502acf8c 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -4,12 +4,12 @@ 本工具集提供4个工具,功能如下: -| 工具名称 | 工具功能 | 使用场景 | -| ---------------- | ------------------------------------------ | ---------------- | -| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完fate后 | -| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 | -| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | -| 集群配置检测 | 搜集展示集群的配置文件信息 | 跑任务出现错误后 | +| 工具名称 | 工具功能 | 使用场景 | +| ---------------- | ------------------------------------------ | -------------------- | +| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完成并启动服务后 | 
+| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 | +| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | +| 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 | @@ -17,10 +17,20 @@ ### 2.1 使用场景 -此脚本在部署完fate后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 +------ + +此脚本在完成部署并正常启动服务后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 ### 2.2 工具功能 +------ + +此检测检测提供两种版本: + +- 完整版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; + +- 简约版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,需支持节点间免密登录。 + 1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比 @@ -67,13 +77,17 @@ M = n * r ```shell 通过 ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}' 得到rollsite进程的pid -若pid不存在,则不返回信息; +若pid不存在,即该节点不存在rollsite服务,则不返回信息; 若pid存在,则将其进程占用内存输出,并与rollsite设置最大使用内存按占用内存/设置最大内存作百分比。(最大内存默认值为系统内存的1/4) 最终rollsite内存占用百分比返回,作黄色提醒信息 ``` ### 2.3 使用方法 +------ + +#### 2.3.1 完整版 + 先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,然后执行: ```shell @@ -83,14 +97,43 @@ sh env_check.sh ${集群节点个数} cat result_env.log ``` +#### 2.3.2 简约版 + +先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: + +```shell +cd $EGGROLL_HOME/bin/debug + +<配置iplist以及EGGROLL_HOME> +vi check_iplist.sh +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` + +然后执行检测脚本: + +```python +python time_check.py +//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 +python time_check.py -t {检测间隔秒数,不填只检测一次} + +//查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息 +cat ./$ip +``` + ### 2.4 检测结果说明 +------ + 返回示例信息如下: -[^说明:以下信息分为三种提示等级:]: -[^[OK\] 表示该检查项正常;]: -[^[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;]: -[^[ERROR\]表示该项不符合预期结果,需要按提示修改。]: +***说明:以下信息分为三种提示等级:*** + +***[OK\] 表示该检查项正常;*** + +***[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;*** + +***[ERROR\]表示该项不符合预期结果,需要按提示修改。*** ```properties //脚本执行时间 @@ -128,25 +171,29 @@ cat result_env.log ### 3.1 使用场景 +------ + 
跑fate任务前,检测fate运行信息。验证机器当前状态是否适合新建一个fate任务 ### 3.2 工具功能 -1)检测机器基础信息:系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数/线程数设置/rollsite进程堆内存 +------ -2)检测fate运行信息:eggroll路由是不是默认路由/是否已安装data access/fate服务状态、进程数及占用内存/当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存 。 +检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 ### 3.3 使用方法 -source $EGGROLL_HOME/init.sh 导入环境变量 +source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息 -export EGGROLL_LOG_LEVEL=INFO 修改日志输出INFO模式 +(可选)export EGGROLL_LOG_LEVEL=INFO 修改日志输出模式,只查看结果检测信息 python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不填只检测一次} ### 3.4 检测结果说明 -![1599015506806](C:\Users\v_wbxzheng\AppData\Roaming\Typora\typora-user-images\1599015506806.png) +------ + +![1599048793905](C:\Users\v_wbxzheng\AppData\Roaming\Typora\typora-user-images\1599048793905.png) #### 3.4.1 default route check(eggroll路由是不是默认路由) @@ -162,8 +209,10 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 - 检查方法: - if [ -f $EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/conf/route_table.json |grep -E 'ip|port'`); echo ${array[@]}; else echo 0; fi - + ``` +if [ -f $EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/conf/route_table.json |grep -E 'ip|port'`); echo ${array[@]}; else echo 0; fi + ``` + 若返回值为0,则视为未配置eggroll route,提示ERROR:没有设置eggroll route。 #### 3.4.2 data_access service check(是否已安装data access) @@ -178,12 +227,17 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 - 检查方法: - ps aux |grep data_access_server |grep -v grep |wc -l //获取data_access进程数 - - if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi //检查服务目录 - - 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING。 + ``` +ps aux |grep data_access_server |grep -v grep |wc -l //获取data_access进程数 +if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi //检查服务目录 + + curl -X POST --header 'Content-Type: application/json' -d '{"local": {"role": "host", "party_id": 10000}, "id_type":"phone", 
"encrypt_type":"md5"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info' //路由验证 + ``` + + 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING; + + 若返回进程数大于0,判断路由验证返回码,如果返回 "status":0,或 "status":201,则说明 DataAccess 服务以及路由表配置没有问题,否则提示WARNING检查路由设置 #### 3.4.3 fate service check(fate服务状态、进程数及占用内存) - 检测通过提醒: @@ -196,13 +250,13 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 - 检查方法: - services = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] //fate服务列表 - - for service in services: - - ​ thread = ps -ef |grep service |grep -v grep |wc -l //获取服务进程数 - - ​ server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' //获取服务占用内存 + ``` +services = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] //fate服务列表 + +for service in services: + thread = ps -ef |grep service |grep -v grep |wc -l //获取服务进程数 + server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' //获取服务占用内存 + ``` 判断thread值,若大于0,则服务正在运行,展示进程数及占用内存;否则,视为服务未启动。 #### 3.4.4 fate_flow jobs process and mem info check(job任务数检测、job任务进程及占用内存) @@ -220,6 +274,7 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 - 检查方法: + ``` fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" job_run = if [ -f $fate_flow_client ];then python $fate_flow_client -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi //获取running job任务数 @@ -229,10 +284,9 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 jobs = python $fate_flow_client -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]} //获取running状态job任务的job_id for job_id in jobs: - - ​ job_thread = ps -ef |grep egg_pair |grep -v grep |grep job_id |wc -l //获取job任务的进程数 - - ​ job_mem = ps aux |grep egg_pair |grep job_id |awk '{sum+=$6};END {print sum}' //获取job任务的占用内存 + job_thread = ps -ef |grep egg_pair |grep -v grep |grep job_id 
|wc -l //获取job任务的进程数 + job_mem = ps aux |grep egg_pair |grep job_id |awk '{sum+=$6};END {print sum}' //获取job任务的占用内存 + ``` 判断:若job_run为-1,则提示ERROR,找不到fate_flow客户端文件,确认fate_flow是否运行;否则,依此展示job任务数以及job任务的进程数和占用内存。 @@ -242,14 +296,20 @@ python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不 ### 4.1 使用场景 -适用于出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。 +------ + +适用于跑任务出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。 ### 4.2 工具功能 +------ + 拉取指定ip:$EGGROLL_HOME/logs目录下带传入关键字的目录到本机当前目录下 ### 4.3 使用方法 +------ + 先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: ```shell @@ -271,20 +331,30 @@ sh grep_logs.sh ${需要查询的session_id} <带上需要搜集的session_id, ### 4.4 结果说明 +------ + 执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。 + + ## 五 集群配置检测 ### 5.1 使用场景 +------ + 适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。 ### 5.2 工具功能 +------ + 拉取指定ip的eggroll.properties、route_table.json配置到本机展示。 ### 5.3 使用方法 +------ + 先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: ```shell @@ -304,6 +374,8 @@ sh check_conf.sh ### 5.4 结果说明 +------ + 该脚本展示配置所有ip与本机的配置对比,说明如下: ```properties From ded924e3744218d539f0ac797d8f3d6140deb442 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 10:46:35 +0800 Subject: [PATCH 05/35] update check scripts --- ...77\347\224\250\350\257\264\346\230\216.md" | 225 +++++++++--------- 1 file changed, 106 insertions(+), 119 deletions(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 83502acf8c..9fbba5ad9f 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -11,6 +11,16 @@ | 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | | 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 | +名词解释: + +| 名词 | 解释 | +| ---------------------- | 
-------------------------------------------------------- | +| $FATE_HOME | 通常在/data/projects/fate | +| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | +| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | +| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 | +| {需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | + ## 二 机器基础信息检测 @@ -27,9 +37,9 @@ 此检测检测提供两种版本: -- 完整版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; +- 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; -- 简约版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,需支持节点间免密登录。 +- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,需支持节点间免密登录。 1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 @@ -37,103 +47,76 @@ 3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比 -4)检查系统最大用户进程数:检查内容包括以下文件 - -```shell -cat /proc/sys/kernel/threads-max -cat /etc/sysctl.conf -cat /proc/sys/kernel/pid_max -cat /proc/sys/vm/max_map_count -``` - -5)检查最大文件数:检查内容包括以下文件 +4)检查系统最大用户进程数 -```shell -cat /etc/security/limits.conf -cat /etc/security/limits.d/80-nofile.conf -cat /etc/sysctl.conf -cat /proc/sys/fs/file-max -``` +5)检查最大文件数 6)检查线程数设置:检查egg pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足 -检查方法: - -```properties -P = pstree -p ps -e |grep egg_pair |awk '{print $1}' |wc -l(统计服务器所有egg pair进程所用线程数) - -n = eggroll.session.processors.per.node(默认16) - -r = eggroll.rollpair.eggpair.server.executor.pool.max.size(默认值500) - -M = n * r - -若所得线程数P +``` +cd $EGGROLL_HOME/bin/debug vi check_iplist.sh -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> ``` -然后执行检测脚本: +参数说明: + +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、执行检测脚本: ```python python time_check.py -//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 -python time_check.py -t {检测间隔秒数,不填只检测一次} - //查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息 cat ./$ip ``` +//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 + +``` +python time_check.py -t {检测间隔秒数,不填只检测一次} +``` + ### 2.4 检测结果说明 ------ 返回示例信息如下: 
-***说明:以下信息分为三种提示等级:*** +*说明:以下信息分为三种提示等级:* -***[OK\] 表示该检查项正常;*** +*[OK\] 表示该检查项正常;* -***[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;*** +*[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;* -***[ERROR\]表示该项不符合预期结果,需要按提示修改。*** +*[ERROR\]表示该项不符合预期结果,需要按提示修改。* ```properties //脚本执行时间 @@ -183,37 +166,38 @@ cat ./$ip ### 3.3 使用方法 -source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息 +``` +source $FATE_HOME/init_env.sh //FATE_HOME为用户环境的fate目录 +cd $EGGROLL_HOME/bin/debug +sh server_check.sh ${集群内节点个数} ${host party_id(可选)} +例:sh server_check.sh 1 10000 +``` + +可选参数: -(可选)export EGGROLL_LOG_LEVEL=INFO 修改日志输出模式,只查看结果检测信息 +​ {host party_id} //当需要检查data_assess的服务是否可用时使用 -python server_ckeck.py -p {集群内节点个数} -t {检测间隔秒数,不填只检测一次} +结果保存在result_server.log文件中 ### 3.4 检测结果说明 ------ -![1599048793905](C:\Users\v_wbxzheng\AppData\Roaming\Typora\typora-user-images\1599048793905.png) - #### 3.4.1 default route check(eggroll路由是不是默认路由) - 检测通过提示: [OK] eggroll route configured! - "port": 9801, "ip": "172.16.153.25" "port": 9810, "ip": "172.16.153.25" "port": 9801, "ip": "172.16.153.95" + "port": 9801, "ip": "172.16.153.25" - 检测失败提示: - [ERROR] eggroll route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed! + [ERROR] eggroll route is not configured, please check /data/projects/fate/eggroll/conf/route_table.json file if it is existed! 
- 检查方法: - ``` -if [ -f $EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/conf/route_table.json |grep -E 'ip|port'`); echo ${array[@]}; else echo 0; fi - ``` - - 若返回值为0,则视为未配置eggroll route,提示ERROR:没有设置eggroll route。 + 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无提示ERROR。 #### 3.4.2 data_access service check(是否已安装data access) @@ -227,17 +211,14 @@ if [ -f $EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/con - 检查方法: + 先检查data_access 进程是否存在或者目录是否存在。若存在,会进一步检查data_access 服务是否可用。详细逻辑是: + ``` -ps aux |grep data_access_server |grep -v grep |wc -l //获取data_access进程数 - -if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi //检查服务目录 - - curl -X POST --header 'Content-Type: application/json' -d '{"local": {"role": "host", "party_id": 10000}, "id_type":"phone", "encrypt_type":"md5"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info' //路由验证 - ``` - 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING; 若返回进程数大于0,判断路由验证返回码,如果返回 "status":0,或 "status":201,则说明 DataAccess 服务以及路由表配置没有问题,否则提示WARNING检查路由设置 + ``` + #### 3.4.3 fate service check(fate服务状态、进程数及占用内存) - 检测通过提醒: @@ -250,23 +231,31 @@ if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi //检 - 检查方法: + 检查服务列表: + + 'ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql' + + 检查进程数方法: + ``` -services = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] //fate服务列表 - -for service in services: - thread = ps -ef |grep service |grep -v grep |wc -l //获取服务进程数 - server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' //获取服务占用内存 + thread = ps -ef |grep service |grep -v grep |wc -l ``` - 判断thread值,若大于0,则服务正在运行,展示进程数及占用内存;否则,视为服务未启动。 + 检查服务占用内存方法: + + ``` + server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' + ``` + + #### 3.4.4 fate_flow jobs process and 
mem info check(job任务数检测、job任务进程及占用内存) - 检测通过提醒: - [OK] Number of tasks running is + [OK] Number of tasks running is xxx - [OK] Number of tasks waiting is + [OK] Number of tasks waiting is xxx - [OK] running task job_id : ,number of processes is :; used memory: + [OK] running task job_id :xxx ,number of egg_pair processes is :xxx; used memory:xxx - 检测失败提醒: @@ -274,23 +263,9 @@ for service in services: - 检查方法: - ``` - fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" - - job_run = if [ -f $fate_flow_client ];then python $fate_flow_client -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi //获取running job任务数 - - job_wait = if [ -f $fate_flow_client ];then python $fate_flow_client -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi //获取waiting job任务数 - - jobs = python $fate_flow_client -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]} //获取running状态job任务的job_id - - for job_id in jobs: - job_thread = ps -ef |grep egg_pair |grep -v grep |grep job_id |wc -l //获取job任务的进程数 - job_mem = ps aux |grep egg_pair |grep job_id |awk '{sum+=$6};END {print sum}' //获取job任务的占用内存 - ``` - - 判断:若job_run为-1,则提示ERROR,找不到fate_flow客户端文件,确认fate_flow是否运行;否则,依此展示job任务数以及job任务的进程数和占用内存。 - - + 通过FATE自带的fate_flow_client 命令查看任务相关信息,通过ps命令查看内存相关信息。 + + ## 四 日志搜集 @@ -308,27 +283,32 @@ for service in services: ### 4.3 使用方法 ------- - -先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: +1、设置环境变量: ```shell -cd $EGGROLL_HOME/bin/debug +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: -<配置iplist以及EGGROLL_HOME> +``` +cd $EGGROLL_HOME/bin/debug vi check_iplist.sh -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> ``` -然后执行拉取脚本: +参数说明: -```shell -sh grep_logs.sh ${需要查询的session_id} <带上需要搜集的session_id,支持模糊查询> +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、执行检测脚本: -<执行后该session_id的各个ip的日志便会搜集到当前目录下的$session_id/$ip目录下> +```shell +sh 
grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id,支持模糊查询> ``` +执行后该session-id的各个ip的日志便会搜集到当前目录下的$session-id/$ip目录下 + ### 4.4 结果说明 ------ @@ -355,18 +335,25 @@ sh grep_logs.sh ${需要查询的session_id} <带上需要搜集的session_id, ------ -先source init_env.sh ,init_env.sh位于部署目录下,设置环境变量等关键信息,配置需要拉取的ip信息: +1、设置环境变量: ```shell -cd $EGGROLL_HOME/bin/debug +source $FATE_HOME/init_env.sh +``` -<配置iplist以及EGGROLL_HOME> +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug vi check_iplist.sh -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> ``` -然后执行脚本: +参数说明: + +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、然后执行脚本: ```shell sh check_conf.sh From 0b2b524a54e234fdf0050f0bc510caa03b0d85cb Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 11:04:41 +0800 Subject: [PATCH 06/35] update check scripts --- bin/debug/cluster_env_check.sh | 3 ++- bin/debug/env_check.sh | 2 +- bin/debug/grep_logs.sh | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/debug/cluster_env_check.sh b/bin/debug/cluster_env_check.sh index 196cadbda0..038e0e5c84 100644 --- a/bin/debug/cluster_env_check.sh +++ b/bin/debug/cluster_env_check.sh @@ -24,8 +24,9 @@ for ip in ${iplist[@]};do fi if ! ssh -tt app@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then - echo "check_env.sh in $ip:${EGGROLL_HOME}/bin/debug is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" + echo "${EGGROLL_HOME}/bin/debug/check_env.sh in $ip is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" scp ./check_env.sh $user@$ip:${EGGROLL_HOME}/bin/debug fi ssh app@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip + echo "The check result from $ip has saved in $cwd/$ip, please check it." 
done diff --git a/bin/debug/env_check.sh b/bin/debug/env_check.sh index 8ceec7a935..3a81cbc4d3 100644 --- a/bin/debug/env_check.sh +++ b/bin/debug/env_check.sh @@ -21,5 +21,5 @@ LogLevel=$EGGROLL_LOG_LEVEL export EGGROLL_LOG_LEVEL=INFO python env_check.py -p $nodes > result_env.log export EGGROLL_LOG_LEVEL=$LogLevel - +echo "The check result has saved in $cwd/result_env.log, please check it." diff --git a/bin/debug/grep_logs.sh b/bin/debug/grep_logs.sh index 09835b7874..0351e36b4c 100644 --- a/bin/debug/grep_logs.sh +++ b/bin/debug/grep_logs.sh @@ -16,9 +16,11 @@ # cwd=$(cd `dirname $0`; pwd) source ./check_iplist.sh +session=$1 for ip in ${iplist[@]};do - mkdir -p $1/$ip - scp -r $user@$ip:$EGGROLL_HOME/logs/*$1* $1/$ip + mkdir -p $session/$ip + scp -r $user@$ip:$EGGROLL_HOME/logs/*$session* $session/$ip + echo "The $session logs from $ip has saved in $cwd/$session/$ip, please check it." done cd $cwd From 82c060698e7f3270a60df86c9cbcaba950b408a4 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 11:07:47 +0800 Subject: [PATCH 07/35] update check scripts --- ...\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 9fbba5ad9f..7bfd218598 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -189,7 +189,7 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} [OK] eggroll route configured! 
- "port": 9801, "ip": "172.16.153.25" + "port": 9370, "ip": "127.0.0.1" - 检测失败提示: From d326c813b896e417673d6e5ce1c54a41969cd4b8 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 11:15:20 +0800 Subject: [PATCH 08/35] update check scripts --- ...34\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 7bfd218598..056150687e 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -59,7 +59,7 @@ ------ -#### 2.3.1 完整版 +#### 2.3.1 单集群版 ```shell source $FATE_HOME/init_env.sh @@ -70,7 +70,7 @@ cat result_env.log 若对几个$开头的变量有疑问,请参考概述中的名词解释。 -#### 2.3.2 简约版 +#### 2.3.2 跨集群版 1、设置环境变量: From 52f221ead93218beec3c4ac6b06344189e995d68 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 11:22:12 +0800 Subject: [PATCH 09/35] update check scripts --- bin/debug/cluster_env_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/debug/cluster_env_check.sh b/bin/debug/cluster_env_check.sh index 038e0e5c84..e709ce81ca 100644 --- a/bin/debug/cluster_env_check.sh +++ b/bin/debug/cluster_env_check.sh @@ -23,10 +23,10 @@ for ip in ${iplist[@]};do mkdir -p ${EGGROLL_HOME}/bin/debug fi - if ! ssh -tt app@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then + if ! 
ssh -tt $user@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then echo "${EGGROLL_HOME}/bin/debug/check_env.sh in $ip is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" scp ./check_env.sh $user@$ip:${EGGROLL_HOME}/bin/debug fi - ssh app@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip + ssh $user@$ip "sh ${EGGROLL_HOME}/bin/debug/check_env.sh" >> $ip echo "The check result from $ip has saved in $cwd/$ip, please check it." done From 6615a6f36bfeedd2e1cd86fd800a5c50353160c7 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 11:37:40 +0800 Subject: [PATCH 10/35] update check scripts --- ...\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 056150687e..f5886fc7ca 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -39,7 +39,7 @@ - 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; -- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,需支持节点间免密登录。 +- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,**需支持节点间免密登录**。 1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 @@ -283,6 +283,8 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} ### 4.3 使用方法 +**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以** + 1、设置环境变量: ```shell @@ -333,6 +335,8 @@ sh grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id, ### 5.3 使用方法 +**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以** + ------ 1、设置环境变量: From 3d41bc36153b86fc78e16263b9aa5e0eec890213 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Fri, 4 Sep 2020 15:04:43 +0800 Subject: [PATCH 11/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...te-allinone_deployment_guide_install_zh.md | 10 +- ...e-allinone_deployment_guide_install_zh.rst | 10 +- .../doc/Fate-exchange_deployment_guide_zh.md | 2 +- .../doc/Fate-exchange_deployment_guide_zh.rst | 2 +- .../doc/Fate_cluster_install_guide_ansible.md | 760 ++++++++++++++++++ .../doc/Fate_guest_install_guide_ansible.md | 647 +++++++++++++++ .../doc/Fate_step_by_step_install_zh.md | 8 +- .../doc/Fate_step_by_step_install_zh.rst | 8 +- cluster-deploy/images/deploy_cluster.png | Bin 0 -> 25518 bytes cluster-deploy/images/deploy_guest.png | Bin 0 -> 13503 bytes 10 files changed, 1427 insertions(+), 20 deletions(-) create mode 100644 cluster-deploy/doc/Fate_cluster_install_guide_ansible.md create mode 100644 cluster-deploy/doc/Fate_guest_install_guide_ansible.md create mode 100644 cluster-deploy/images/deploy_cluster.png create mode 100644 cluster-deploy/images/deploy_guest.png diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md index 5205491b0e..57c955ae62 100644 --- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md +++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.md @@ -255,8 +255,8 @@ Swap: 131071 0 131071 ``` cd /data/projects/ -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.3-release-c7-u18.tar.gz -tar xzf fate-cluster-install-1.4.3-release-c7-u18.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz +tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz ``` ## 5.2 部署前检查 @@ -292,7 +292,7 @@ vi fate-cluster-install/allInone/conf/setup.conf | 配置项 | 配置项值 | 说明 | | ---------------- | --------------------------------------------- | ------------------------------------------------------------ | | roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 | -| 
version | 默认:1.4.3 | Fate 版本号 | +| version | 默认:1.4.5 | Fate 版本号 | | pbase | 默认: /data/projects | 项目根目录 | | lbase | 默认:/data/logs | 保持默认不要修改 | | ssh_user | 默认:app | ssh连接目标机器的用户,也是部署后文件的属主 | @@ -321,7 +321,7 @@ vi fate-cluster-install/allInone/conf/setup.conf #to install role roles=( "host" "guest" ) -version="1.4.3" +version="1.4.5" #project base pbase="/data/projects" @@ -377,7 +377,7 @@ basemodules=( "base" "java" "python" "eggroll" "fate" ) #to install role roles=( "host" ) -version="1.4.3" +version="1.4.5" #project base pbase="/data/projects" diff --git a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst index 6020aada93..fa739a5d1f 100644 --- a/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst +++ b/cluster-deploy/doc/Fate-allinone_deployment_guide_install_zh.rst @@ -275,8 +275,8 @@ ssh app@192.168.0.2 :: cd /data/projects/ - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.3-release-c7-u18.tar.gz - tar xzf fate-cluster-install-1.4.3-release-c7-u18.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate-cluster-install-1.4.5-release-c7-u18.tar.gz + tar xzf fate-cluster-install-1.4.5-release-c7-u18.tar.gz 5.2 部署前检查 -------------- @@ -314,7 +314,7 @@ ssh app@192.168.0.2 +======================+=================================================+==============================================================================+ | roles | 默认:"host" "guest" | 部署的角色,有HOST端、GUEST端 | +----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ -| version | 默认:1.4.3 | Fate 版本号 | +| version | 默认:1.4.5 | Fate 版本号 | +----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ | pbase | 默认: /data/projects | 项目根目录 | 
+----------------------+-------------------------------------------------+------------------------------------------------------------------------------+ @@ -366,7 +366,7 @@ ssh app@192.168.0.2 #to install role roles=( "host" "guest" ) - version="1.4.3" + version="1.4.5" #project base pbase="/data/projects" @@ -421,7 +421,7 @@ ssh app@192.168.0.2 #to install role roles=( "host" ) - version="1.4.3" + version="1.4.5" #project base pbase="/data/projects" diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md index d598a19492..4980426d29 100644 --- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md +++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.md @@ -146,7 +146,7 @@ fi ``` cd /data/projects/install wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz ``` ## 5.2 操作系统参数检查 diff --git a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst index 5fe7c60c57..afc58587bc 100644 --- a/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst +++ b/cluster-deploy/doc/Fate-exchange_deployment_guide_zh.rst @@ -160,7 +160,7 @@ ufw status cd /data/projects/install wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz 5.2 操作系统参数检查 -------------------- diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md new file mode 100644 index 0000000000..97c40312d7 --- /dev/null +++ 
b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -0,0 +1,760 @@ +# Fate cluster部署指南 + +# 1.总体介绍 + +### 1.1.系统介绍 + +1)FATE + +FATE (Federated AI Technology Enabler) 是微众银行AI部门发起的开源项目,提供了一种基于数据隐私保护的安全计算框架,为机器学习、深度学习、迁移学习算法提供强有力的安全计算支持。 安全底层支持同态加密、秘密共享、哈希散列等多种多方安全计算机制,算法层支持多方安全计算模式下的逻辑回归、Boosting、联邦迁移学习等。 + +2)EggRoll + +Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构,包括了计算、存储和通信等模块。为FATE框架提供底层支撑。 + +3)FATE官方网站:https://fate.fedai.org/ + +本文将介绍使用ansible部署脚本进行FATE集群的部署。 + +### 1.2.组件说明 + +| 软件产品 | 组件 | 端口 | 说明 | +| -------- | -------------- | --------- | -------------------------------------- | +| fate | fate_flow | 9360;9380 | 联合学习任务流水线管理模块 | +| fate | fateboard | 8080 | 联合学习过程可视化模块 | +| fate | FederatedML | | 算法代码包 | +| eggroll | clustermanager | 4670 | cluster manager管理集群 | +| eggroll | nodemanger | 4671 | node manager管理每台机器资源 | +| eggroll | rollsite | 9370 | 跨站点或者跨party通讯组件 | +| mysql | mysql | 3306 | 数据存储,clustermanager和fateflow依赖 | + +### 1.3.系统架构 + +
+ +
+ +# 2.详细设计 + +## 2.1.部署规划 + + 本示例是每端只有一台主机,每端可以多台主机,目前只支持nodemanager多节点部署,其他组件都是单节点。 + +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 | +| ----- | ------- | --------------------- | ----------------------- | -------- | ---- | ------------------------------------------------------------ | +| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | +| guest | 9999 | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | + +## 2.2.主机资源和操作系统要求 + +| **类别** | **说明** | +| -------- | ------------------------------------------------------------ | +| 主机配置 | 不低于8C16G500G,千兆网卡 | +| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | +| 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | +| 虚拟内存 | 不低于128G | +| 系统参数 | 1、文件句柄数不低于65535。
2、用户进程数不低于65535。 | + +## 2.3.网络要求 + +| 类别 | 说明 | +| ------------ | ------------------------------------------------------------ | +| 防火墙策略 | 1、如果通过公网互通,两端rollsite所部署主机需互相开通出和入防火墙策略。
2、防火墙设备需要支持长连接且对连接数无限制。 | +| 外网带宽 | 如通过公网互通,公网带宽不低于20Mb | +| 负载均衡设备 | 1、如果出或入口有负载均衡设备则此设备需要支持grpc或者支持透明转发。
2、fate为双向通讯,需支持出和入方向主动请求。 | + +3.基础环境配置 +============== + +3.1 hostname配置(可选) +---------------- + +**1)修改主机名** + +**在192.168.0.1 root用户下执行:** + +hostnamectl set-hostname VM_0_1_centos + +**在192.168.0.2 root用户下执行:** + +hostnamectl set-hostname VM_0_2_centos + +**2)加入主机映射** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +vim /etc/hosts + +192.168.0.1 VM_0_1_centos + +192.168.0.2 VM_0_2_centos + +3.2 关闭selinux(可选) +--------------- + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +确认是否已安装selinux + +centos系统执行:rpm -qa | grep selinux + +ubuntu系统执行:apt list --installed | grep selinux + +如果已安装了selinux就执行:setenforce 0 + +3.3 修改Linux系统参数 +--------------------------- + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +1)vim /etc/security/limits.conf + +\* soft nofile 65535 + +\* hard nofile 65535 + +2)vim /etc/security/limits.d/20-nproc.conf + +\* soft nproc unlimited + +3.4 关闭防火墙 +-------------- + +**在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** + +如果是Centos系统: + +systemctl disable firewalld.service + +systemctl stop firewalld.service + +systemctl status firewalld.service + +如果是Ubuntu系统: + +ufw disable + +ufw status + +3.5 软件环境初始化 +------------------ + +**1)创建用户** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行** + +``` +groupadd -g 6000 apps +useradd -s /bin/bash -g apps -d /home/app app +passwd app +``` + +**2)配置sudo** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行** + +vim /etc/sudoers.d/app + +app ALL=(ALL) ALL + +app ALL=(ALL) NOPASSWD: ALL + +Defaults !env_reset + +**3)配置ssh无密登录** + +**a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +su app + +ssh-keygen -t rsa + +cat \~/.ssh/id_rsa.pub \>\> /home/app/.ssh/authorized_keys + +chmod 600 \~/.ssh/authorized_keys + +**b.合并id_rsa_pub文件** + +拷贝192.168.0.1的authorized_keys 到192.168.0.2 +\~/.ssh目录下,追加到192.168.0.2的id_rsa.pub到authorized_keys,然后再拷贝到192.168.0.1 + +**在192.168.0.1 app用户下执行** + +scp \~/.ssh/authorized_keys app\@192.168.0.2:/home/app/.ssh + +输入密码 + +**在192.168.0.2 app用户下执行** + +cat \~/.ssh/id_rsa.pub \>\> /home/app/.ssh/authorized_keys + +scp \~/.ssh/authorized_keys app\@192.168.0.1:/home/app/.ssh + +覆盖之前的文件 + +**c. 在目标服务器(192.168.0.1 192.168.0.2)app用户下执行ssh 测试** + +ssh app\@192.168.0.1 + +ssh app\@192.168.0.2 + +## 3.6 增加虚拟内存 + +**目标服务器(192.168.0.1 192.168.0.2)root用户执行** + +生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 + +``` +cd /data +dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 +mkswap /data/swapfile128G +swapon /data/swapfile128G +cat /proc/swaps +echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab +``` + +## 3.7 安装ansible + +**目标服务器(192.168.0.1) root用户执行** + +``` +#判断是否已安装ansible +ansible --version +#没有则执行 +yum install -y ansible +``` + + + +4.项目部署 +========== + +### 4.1 部署示意图 + +
+ +
+ +### 4.2 系统检查 + +**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +``` +#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +cat /proc/swaps +Filename Type Size Used Priority +/data/swapfile128G file 134217724 384 -1 + +#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +ulimit -n +65535 + +#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +ulimit -u +65535 + +#检查进程是否有fate进程残留,如有则需要停止服务 +ps -ef| grep -i fate + +netstat -tlnp | grep 4670 +netstat -tlnp | grep 4671 +netstat -tlnp | grep 9370 +netstat -tlnp | grep 9360 +netstat -tlnp | grep 8080 +netstat -tlnp | grep 3306 + +#检查部署目录,如有需先进行mv +ls -ld /data/projects/fate +ls -ld /data/projects/data +ls -ld /data/projects/snmp + +#检查supervisord配置文件,如有则需要mv或者删除掉 +ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf + +``` + +### 4.3 获取项目 + +**在目标服务器(192.168.0.1 具备外网环境)app用户下执行** + +进入执行节点的/data/projects/目录,执行: + +``` +#注意:URL链接有换行,拷贝的时候注意整理成一行 +cd /data/projects/ +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz +tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz +``` + +### 4.4 配置文件修改和示例 + +#### 4.4.1 初始化配置文件 + +``` +cd ansible-nfate-* +#init.sh文件不需要修改,主要是辅助生成一些配置文件 + +#测试环境加test参数执行 + sh ./tools/init.sh test + +#生产环境加prod参数执行 + sh ./tools/init.sh prod + +>sh ./tools/init.sh prod +clean old config +init environments/prod +init var_files/prod +init project_prod.yml +``` + +#### 4.4.2 修改配置文件 + +**1)修改初始化主机IP** + +``` +vi /data/projects/ansible-nfate-1.*/environments/prod/hosts + +#ansible格式配置文件 +[init] ---把需要部署的主机IP填入init组 +192.168.0.1 +192.168.0.2 + +[all:vars] +ansible_connection=ssh +ansible_ssh_port=22 ---根据实际情况修改 +ansible_ssh_user=app +#ansible_ssh_pass=test ---如果未做免密登陆需提供密码 +##method: sudo or su +ansible_become_method=sudo +ansible_become_user=root +ansible_become_pass= ---各个主机未做免密sudo需填写root密码 + +``` + +**2)修改部署模式** + +``` +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_init + +#只修改如下参数,其他参数默认不变 +deploy_mode: "install" ---默认为空,修改为install,表示新部署 +``` + +**3)修改host方参数** + +``` +#不部署host方则不用修改 
+#除了nodemanger可以设置多个IP外,其他都是单个IP +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_host + +host: + partyid: 10000 ---host端partyid,根据实际规划修改 + rollsite: + enable: True + ips: ---IP列表,目前rollsite只支持部署到一台服务器 + - 192.168.0.1 + port: 9370 + pool_size: 600 ---线程池大小 + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + - name: default + ip: 192.168.0.3 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + rules: ---本party自身路由配置 + - name: default + ip: 192.168.0.1 + port: 9370 + - name: fateflow + ip: 192.168.0.1 + port: 9360 + clustermanager: + enable: True + ips: + - 192.168.0.1 ---只支持部署一台主机 + port: 4670 + nodemanager: + enable: True + ips: ---支持部署多台 + - 192.168.0.1 + - 192.168.0.x + port: 4671 + eggroll: + dbname: "eggroll_meta" + egg: 2 + fate_flow: + enable: True + ips: + - 192.168.0.1 ---只支持部署一台主机 + grpcPort: 9360 + httpPort: 9380 + dbname: "fate_flow" + fateboard: + enable: True + ips: + - 192.168.0.1 ---只支持部署一台主机 + port: 8080 + dbname: "fate_flow" + mysql: + enable: True + ips: + - 192.168.0.1 ---只支持部署一台主机 + port: 3306 + dbuser: "fate" + dbpasswd: "fate_deV2999" + zk: + enable: False + lists: + - ip: 192.168.0.1 + port: 2181 + use_acl: false + user: "fate" + passwd: "fate" +``` + +**4)修改guest参数** + +``` +#不部署guest方则不用修改 +#除了nodemanger可以设置多个IP外,其他都是单个IP +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_guest + +guest: + partyid: 9999 ---根据实际规划修改 + rollsite: + enable: True + ips: ---IP列表,目前rollsite只支持部署到一台服务器 + - 192.168.0.2 + port: 9370 + pool_size: 600 ---线程池大小 + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + - name: default + ip: 192.168.0.3 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + rules: ---本party自身路由配置 + - name: default + ip: 192.168.0.2 + port: 9370 + - name: fateflow + ip: 192.168.0.2 + port: 9360 + clustermanager: + enable: True + 
ips: ---只支持部署一台主机 + - 192.168.0.2 + port: 4670 + nodemanager: + enable: True + ips: ---支持部署多台主机 + - 192.168.0.2 + - 192.168.0.x + port: 4671 + eggroll: + dbname: "eggroll_meta" + egg: 2 + fate_flow: + enable: True + ips: ---只支持部署一台主机 + - 192.168.0.2 + grpcPort: 9360 + httpPort: 9380 + dbname: "fate_flow" + fateboard: + enable: True + ips: ---只支持部署一台主机 + - 192.168.0.2 + port: 8080 + dbname: "fate_flow" + mysql: + enable: True + ips: ---只支持部署一台主机 + - 192.168.0.2 + port: 3306 + dbuser: "fate" + dbpasswd: "fate_deV2999" + zk: + enable: False + lists: + - ip: 192.168.0.2 + port: 2181 + use_acl: false + user: "fate" + passwd: "fate" +``` + +**5)修改exchange参数** + +``` +#不部署exchange则不需要修改 +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_exchange + +exchange: + enable: True + rollsite: + ips: + - 192.168.0.3 + port: 9370 + pool_size: 600 + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + partys: ---指向各party的路由配置 + - id: 10000 + rules: + - name: default + ip: 192.168.0.1 + port: 9367 + - id: 9999 + rules: + - name: default + ip: 192.168.0.2 + port: 9370 +``` + + + +### 4.5 部署 + +按照上述配置含义修改对应的配置项后,然后执行部署脚本: + +``` +#相对ansible-nfate-*目录 +cd /data/projects/ansible-nfate-1.* +#测试环境加test参数执行 + nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & + +#生产环境加prod参数执行 +nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & +``` + +部署日志输出在logs目录下,实时查看是否有报错: + +``` +#相对ansible-nfate-*目录 +cd logs +tail -f ansible.log (实时查看部署情况,如果没有这个日志文件,需要查看是否有安装ansible) + +检查项不通过提示列表: +1、"Warning: now swap is 0, need to turn up" + ---没有设置虚拟内存,请参考前面章节进行设置,不低于128G。 +2、"Warning: key fate process exists, please has a check and clean" + ---环境没有清理干净,需要把以前部署的fate进程停掉。 +3、"Warning: these ports: 4670 4671 9360 9370 9380 have been used" + ---环境没有清理干净,需要把以前部署的fate进程停掉。 +4、"Warning: if reinstall mysql, please stop mysql, and rename /etc/my.cnf" + ---mysql没有停止,需要停止。如果有/etc/my.cnf文件,需要mv改名。 +5、"Waring: please rename /data/projects/fate" + ---fate目录存在,需要先mv。 +6、"Warning: please rename 
/data/projects/data/fate/mysql" + ---/data/projects/data存在,需要mv。 +7、"Warning: supervisor_fate_conf exists, please remove ls /data/projects/common/supervisord/supervisord.d/fate-*.conf" + ---/data/projects/common目录存在,需要mv。 +``` + +### 4.6 问题定位 + +1)eggroll日志 + + /data/logs/fate/eggroll/bootstrap.clustermanager.err + +/data/logs/fate/eggroll/logs/eggroll/clustermanager.jvm.err.log + +/data/logs/fate/eggroll/logs/eggroll/nodemanager.jvm.err.log + +/data/logs/fate/eggroll/logs/eggroll/bootstrap.nodemanager.err + +/data/logs/fate/eggroll/logs/eggroll/bootstrap.rollsite.err + +/data/logs/fate/eggroll/logs/eggroll/rollsite.jvm.err.log + +2)fateflow日志 + +/data/logs/fate/python/logs/fate_flow/ + +3)fateboard日志 + +/data/logs/fate/fate/fateboard/logs + +5.测试 +====== + +5.1 Toy_example部署验证 +----------------------- + +此测试您需要设置3个参数:guest_partyid,host_partyid,work_mode。 + +### 5.1.1 单边测试 + +1)192.168.0.1上执行,guest_partyid和host_partyid都设为10000: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/toy_example/ +python run_toy_example.py 10000 10000 1 +``` + +类似如下结果表示成功: + +"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" + +2)192.168.0.2上执行,guest_partyid和host_partyid都设为9999: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/toy_example/ +python run_toy_example.py 9999 9999 1 +``` + +类似如下结果表示成功: + +"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" + +### 5.1.2 双边测试 + +选定9999为guest方,在192.168.0.2上执行: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/toy_example/ +python run_toy_example.py 9999 10000 1 +``` + +类似如下结果表示成功: + +"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" + +5.2 最小化测试 +-------------- + +### **5.2.1 上传预设数据:** + +分别在192.168.0.1和192.168.0.2上执行: + 
+``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/scripts/ +python upload_default_data.py -m 1 +``` + +更多细节信息,敬请参考[脚本README](../../examples/scripts/README.rst) + +### **5.2.2 快速模式:** + +请确保guest和host两方均已分别通过给定脚本上传了预设数据。 + +快速模式下,最小化测试脚本将使用一个相对较小的数据集,即包含了569条数据的breast数据集。 + +选定9999为guest方,在192.168.0.2上执行: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/min_test_task/ +python run_task.py -m 1 -gid 9999 -hid 10000 -aid 10000 -f fast +``` + +其他一些可能有用的参数包括: + +1. -f: 使用的文件类型. "fast" 代表 breast数据集, "normal" 代表 default credit 数据集. +2. --add_sbt: 如果被设置为1, 将在运行完lr以后,启动secureboost任务,设置为0则不启动secureboost任务,不设置此参数系统默认为1。 + +若数分钟后在结果中显示了“success”字样则表明该操作已经运行成功了。若出现“FAILED”或者程序卡住,则意味着测试失败。 + +### **5.2.3 正常模式**: + +只需在命令中将“fast”替换为“normal”,其余部分与快速模式相同。 + +5.3 Fateboard testing +---------------------- + +Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以通过访问 http://192.168.0.1:8080 和 http://192.168.0.2:8080 来查看任务信息,如果本地办公电脑和服务器之间有防火墙则需开通。 + +6.系统运维 +================ + +6.1 服务管理 +------------ + +**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +### 6.1.1 服务管理 + +``` +cd /data/projects/common/supervisord +``` + +启动/关闭/查看所有: + +``` +sh service.sh start/stop/status all +``` + +启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): + +``` +sh service.sh start/stop/status fate-clustermanager +``` + +## 6.2 查看进程和端口 + +**在目标服务器(192.168.0.1 192.168.0.2 )app用户下执行** + +### 6.2.1 查看进程 + +``` +#根据部署规划查看进程是否启动 +ps -ef | grep -i clustermanager +ps -ef | grep -i nodemanager +ps -ef | grep -i rollsite +ps -ef | grep -i fate_flow_server.py +ps -ef | grep -i fateboard +``` + +### 6.2.2 查看进程端口 + +``` +#根据部署规划查看进程端口是否存在 +#clustermanager +netstat -tlnp | grep 4670 +#nodemanager +netstat -tlnp | grep 4671 +#rollsite +netstat -tlnp | grep 9370 +#fate_flow_server +netstat -tlnp | grep 9360 +#fateboard +netstat -tlnp | grep 8080 +``` + + + +## 6.2.3 服务日志 + +| 服务 | 日志路径 | +| -------------------------------- | 
------------------------------ | +| eggroll | /data/logs/fate/eggroll/logs | +| fate_flow&任务日志(fateflow节点) | /data/logs/fate/python/logs | +| fateboard | /data/logs/fate/fateboard/logs | +| mysql | /data/logs/fate/mysql/ | + +### 6.2.4 文件目录说明 + +| 文件路径 | 说明 | +| --------------------------------- | ------------------------------ | +| /data/projects/fate | 软件部署路径 | +| /data/projects/data | mysql数据存放路径 | +| /data/logs | 日志路径 | +| /data/projects/common/supervisord | 进程管理工具supervisor安装路径 | + +# 7. 附录 + +## 7.1 Eggroll参数调优 + +配置文件路径:/data/projects/fate/eggroll/conf/eggroll.properties + +配置参数:eggroll.session.processors.per.node + +假定 CPU核数(cpu cores)为 c, Nodemanager的数量为 n,需要同时运行的任务数为 p,则: + +egg_num=eggroll.session.processors.per.node = c * 0.8 / p + +partitions (roll pair分区数)= egg_num * n \ No newline at end of file diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md new file mode 100644 index 0000000000..6267c7de70 --- /dev/null +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -0,0 +1,647 @@ +# Fate guest端单party部署指南 + +# 1.总体介绍 + +### 1.1.系统介绍 + +1)FATE + +FATE (Federated AI Technology Enabler) 是微众银行AI部门发起的开源项目,提供了一种基于数据隐私保护的安全计算框架,为机器学习、深度学习、迁移学习算法提供强有力的安全计算支持。 安全底层支持同态加密、秘密共享、哈希散列等多种多方安全计算机制,算法层支持多方安全计算模式下的逻辑回归、Boosting、联邦迁移学习等。 + +2)EggRoll + +Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构,包括了计算、存储和通信等模块。为FATE框架提供底层支撑。 + +3)FATE官方网站:https://fate.fedai.org/ + +本文将介绍使用ansible部署脚本进行FATE guest端单party的部署。 + +### 1.2.组件说明 + +| 软件产品 | 组件 | 端口 | 说明 | +| -------- | -------------- | --------- | -------------------------------------- | +| fate | fate_flow | 9360;9380 | 联合学习任务流水线管理模块 | +| fate | fateboard | 8080 | 联合学习过程可视化模块 | +| fate | FederatedML | | 算法代码包 | +| eggroll | clustermanager | 4670 | cluster manager管理集群 | +| eggroll | nodemanger | 4671 | node manager管理每台机器资源 | +| eggroll | rollsite | 9370 | 跨站点或者跨party通讯组件 | +| mysql | mysql | 3306 | 数据存储,clustermanager和fateflow依赖 | + +### 1.3.系统架构 + +
+ +
+# 2.详细设计 + +## 2.1.部署规划 + +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 | +| ----- | ---------------------- | --------------------- | ----------------------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- | +| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | +| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 16C32G | 2T | | | nodemanger | + +## 2.2.主机资源和操作系统要求 + +| **类别** | **说明** | +| -------- | ------------------------------------------------------------ | +| 主机配置 | 不低于8C16G500G,千兆网卡 | +| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | +| 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | +| 虚拟内存 | 不低于128G | +| 系统参数 | 1、文件句柄数不低于65535。
2、用户进程数不低于65535。 | + +## 2.3.网络要求 + +| 类别 | 说明 | +| ------------ | ------------------------------------------------------------ | +| 防火墙策略 | 1、公网主机需要和host端互相开通出和入防火墙策略。
2、防火墙设备需要支持长连接且对连接数无限制。 | +| 外网带宽 | 不低于20Mb | +| 负载均衡设备 | 1、如果出或入口有负载均衡设备则此设备需要支持grpc或者支持透明转发。
2、fate为双向通讯,需支持出和入方向主动请求。 | + +3.基础环境配置 +============== + +3.1 hostname配置(可选) +---------------- + +**1)修改主机名** + +**在192.168.0.1 root用户下执行:** + +hostnamectl set-hostname VM_0_1_centos + +**在192.168.0.2 root用户下执行:** + +hostnamectl set-hostname VM_0_2_centos + +**2)加入主机映射** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +vim /etc/hosts + +192.168.0.1 VM_0_1_centos + +192.168.0.2 VM_0_2_centos + +3.2 关闭selinux(可选) +--------------- + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +确认是否已安装selinux + +centos系统执行:rpm -qa | grep selinux + +ubuntu系统执行:apt list --installed | grep selinux + +如果已安装了selinux就执行:setenforce 0 + +3.3 修改Linux系统参数 +--------------------------- + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** + +1)vim /etc/security/limits.conf + +\* soft nofile 65535 + +\* hard nofile 65535 + +2)vim /etc/security/limits.d/20-nproc.conf + +\* soft nproc unlimited + +3.4 关闭防火墙 +-------------- + +**在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** + +如果是Centos系统: + +systemctl disable firewalld.service + +systemctl stop firewalld.service + +systemctl status firewalld.service + +如果是Ubuntu系统: + +ufw disable + +ufw status + +3.5 软件环境初始化 +------------------ + +**1)创建用户** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行** + +``` +groupadd -g 6000 apps +useradd -s /bin/bash -g apps -d /home/app app +passwd app +``` + +**2)配置sudo** + +**在目标服务器(192.168.0.1 192.168.0.2)root用户下执行** + +vim /etc/sudoers.d/app + +app ALL=(ALL) ALL + +app ALL=(ALL) NOPASSWD: ALL + +Defaults !env_reset + +**3)配置ssh无密登录** + +**a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +su app + +ssh-keygen -t rsa + +cat \~/.ssh/id_rsa.pub \>\> /home/app/.ssh/authorized_keys + +chmod 600 \~/.ssh/authorized_keys + +**b.合并id_rsa_pub文件** + +拷贝192.168.0.1的authorized_keys 到192.168.0.2 +\~/.ssh目录下,追加到192.168.0.2的id_rsa.pub到authorized_keys,然后再拷贝到192.168.0.1 + +**在192.168.0.1 app用户下执行** + +scp \~/.ssh/authorized_keys app\@192.168.0.2:/home/app/.ssh + +输入密码 + +**在192.168.0.2 app用户下执行** + +cat \~/.ssh/id_rsa.pub \>\> /home/app/.ssh/authorized_keys + +scp \~/.ssh/authorized_keys app\@192.168.0.1:/home/app/.ssh + +覆盖之前的文件 + +**c. 在目标服务器(192.168.0.1 192.168.0.2)app用户下执行ssh 测试** + +ssh app\@192.168.0.1 + +ssh app\@192.168.0.2 + +## 3.6 增加虚拟内存 + +**目标服务器(192.168.0.1 192.168.0.2)root用户执行** + +生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 + +``` +cd /data +dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 +mkswap /data/swapfile128G +swapon /data/swapfile128G +cat /proc/swaps +echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab +``` + +## 3.7 安装ansible + +**目标服务器(192.168.0.1) root用户执行** + +``` +#判断是否已安装ansible +ansible --version +#没有则执行 +yum install -y ansible +``` + +4 项目部署 +========== + +### 4.1 部署示意图 + +
+ +
+ + +### 4.2 系统检查 + +**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +``` +#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +cat /proc/swaps +Filename Type Size Used Priority +/data/swapfile128G file 134217724 384 -1 + +#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +ulimit -n +65535 + +#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +ulimit -u +65535 + +#检查进程是否有fate进程残留,如有则需要停止服务,supervisor管理的进程需用supervisor停止 +ps -ef| grep -i fate + +netstat -tlnp | grep 4670 +netstat -tlnp | grep 4671 +netstat -tlnp | grep 9370 +netstat -tlnp | grep 9360 +netstat -tlnp | grep 8080 +netstat -tlnp | grep 3306 + +#检查部署目录,如有需先进行mv +ls -ld /data/projects/fate +ls -ld /data/projects/data +ls -ld /data/projects/snmp + +#检查supervisord配置文件,如有则需要mv或者删除掉 +ls -lrt /data/projects/common/supervisord/supervisord.d/fate-*.conf + +``` + +4.3 获取项目 +------------ + +**在目标服务器(192.168.0.1 具备外网环境)app用户下执行** + +进入执行节点的/data/projects/目录,执行: + +``` +#注意:URL链接有换行,拷贝的时候注意整理成一行 +cd /data/projects/ +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/ansible_nfate_1.4.5_release-1.0.0.tar.gz +tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz +``` + +4.4 配置文件修改和示例 +---------------- + +### 4.4.1 初始化配置文件 + +``` +cd ansible-nfate-* +#init.sh文件不需要修改,主要是辅助生成一些配置文件 + +#测试环境加test参数执行 + sh ./tools/init.sh test + +#生产环境加prod参数执行 + sh ./tools/init.sh prod + +>sh ./tools/init.sh prod +clean old config +init environments/prod +init var_files/prod +init project_prod.yml +``` + +### 4.4.2 修改配置文件 + +**1)修改初始化主机IP** + +``` +vi /data/projects/ansible-nfate-1.*/environments/prod/hosts + +#ansible格式配置文件 +[init] ---把需要部署的主机IP填入init组 +192.168.0.1 +192.168.0.2 + +[all:vars] +ansible_connection=ssh +ansible_ssh_port=22 ---根据实际主机ssh协议端口修改 +ansible_ssh_user=app +#ansible_ssh_pass=test ---如果未做免密登陆需提供密码 +##method: sudo or su +ansible_become_method=sudo +ansible_become_user=root +ansible_become_pass= ---各个主机未做免密sudo需填写root密码 + +``` + +**2)修改部署模式** + +``` +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_init + +#只修改如下参数,其他参数默认不变 +deploy_mode: "install" 
---默认为空,修改为install,表示新部署 +``` + +**3)修改guest参数** + +``` +#除了nodemanger可以设置多个IP外,其他都是单个IP +vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_guest + +guest: + partyid: 9999 ---根据实际规划修改,这个参数务必需要和webank确认后修改 + rollsite: + enable: True ---是否部署rollsite模块,True为部署,False为否 + ips: ---IP列表,目前rollsite只支持部署到一台服务器 + - 192.168.0.1 + port: 9370 ---rollsite端口 + pool_size: 600 ---线程池大小 + max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口 + - name: default ---名称,默认即可 + ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改 + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,和webank确认后修改 + rules: ---本party自身路由配置 + - name: default ---本party rollsite所在主机IP和端口 + ip: 192.168.0.1 + port: 9370 + - name: fateflow ---本party fateflow所在主机IP和端口 + ip: 192.168.0.1 + port: 9360 + clustermanager: + enable: True ---是否部署clustermanager模块,True为部署,False为否 + ips: ---只支持部署一台主机 + - 192.168.0.1 + port: 4670 ---服务端口 + nodemanager: + enable: True ---是否部署nodemanager模块,True为部署,False为否 + ips: ---支持部署多台主机 + - 192.168.0.2 + port: 4671 ---服务端口 + eggroll: + dbname: "eggroll_meta" ---eggroll服务使用的数据库名称,默认即可 + egg: 2 ---任务运行时默认每个nodemanager节点启动的并发计算进程数,默认即可。 + fate_flow: + enable: True ---是否部署fate_flow模块,True为部署,False为否 + type: install ---install是新安装;update则是升级,从低版本升级到当前版本; + ips: ---只支持部署一台主机 + - 192.168.0.1 + grpcPort: 9360 ---服务grpc端口 + httpPort: 9380 ---服务http端口 + dbname: "fate_flow" ---fate_flow服务使用的数据库名称,默认即可 + fateboard: + enable: True ---是否部署fateboard模块,True为部署,False为否 + type: install ---install是新安装;update则是升级,从低版本升级到当前版本; + ips: ---只支持部署一台主机 + - 192.168.0.1 + port: 8080 ---服务端口 + dbname: "fate_flow" ---fateboard服务使用的数据库名称,默认即可 + mysql: + enable: True ---是否部署mysql模块,True为部署,False为否 + ips: ---只支持部署一台主机 + - 192.168.0.1 + port: 3306 ---服务端口 + dbuser: "fate" ---mysql数据库业务用户名 + dbpasswd: "fate_deV2999" ---mysql数据库业务用户密码 + zk: + enable: False ---是否部署zk模块,True为部署,False为否,默认即可 + lists: + - ip: 192.168.0.1 + port: 2181 + use_acl: false + 
user: "fate" + passwd: "fate" +``` + +4.5 部署 +-------- + +按照上述配置含义修改对应的配置项后,然后执行部署脚本: + +``` +#相对ansible-nfate-*目录 +cd /data/projects/ansible-nfate-1.* +#测试环境加test参数执行 + nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & + +#生产环境加prod参数执行 +nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & +``` + +部署日志输出在logs目录下,实时查看是否有报错: + +``` +#相对ansible-nfate-*目录 +cd logs +tail -f ansible.log (实时查看部署情况,如果没有这个日志文件,需要查看是否有安装ansible) + +检查项不通过提示列表: +1、"Warning: now swap is 0, need to turn up" + ---没有设置虚拟内存,请参考前面章节进行设置,不低于128G。 +2、"Warning: key fate process exists, please has a check and clean" + ---环境没有清理干净,需要把以前部署的fate进程停掉。 +3、"Warning: these ports: 4670 4671 9360 9370 9380 have been used" + ---环境没有清理干净,需要把以前部署的fate进程停掉。 +4、"Warning: if reinstall mysql, please stop mysql, and rename /etc/my.cnf" + ---mysql没有停止,需要停止。如果有/etc/my.cnf文件,需要mv改名。 +5、"Waring: please rename /data/projects/fate" + ---fate目录存在,需要先mv。 +6、"Warning: please rename /data/projects/data/fate/mysql" + ---/data/projects/data存在,需要mv。 +7、"Warning: supervisor_fate_conf exists, please remove ls /data/projects/common/supervisord/supervisord.d/fate-*.conf" + ---/data/projects/common目录存在,需要mv。 +``` + +## 4.6 问题定位 + +1)eggroll日志 + + /data/logs/fate/eggroll/bootstrap.clustermanager.err + +/data/logs/fate/eggroll/logs/eggroll/clustermanager.jvm.err.log + +/data/logs/fate/eggroll/logs/eggroll/nodemanager.jvm.err.log + +/data/logs/fate/eggroll/logs/eggroll/bootstrap.nodemanager.err + +/data/logs/fate/eggroll/logs/eggroll/bootstrap.rollsite.err + +/data/logs/fate/eggroll/logs/eggroll/rollsite.jvm.err.log + +2)fateflow日志 + +/data/logs/fate/python/logs/fate_flow/ + +3)fateboard日志 + +/data/logs/fate/fate/fateboard/logs + +5.测试 +====== + +5.1 Toy_example部署验证 +----------------------- + +此测试您需要设置3个参数:guest_partyid,host_partyid,work_mode。 + +### 5.1.1 单边测试 + +1)192.168.0.1上执行,guest_partyid和host_partyid都设为9999: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/toy_example/ +python run_toy_example.py 
9999 9999 1 +``` + +类似如下结果表示成功: + +"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" + +### 5.1.2 双边联调测试 + +需要和webank沟通获取host端partyid,选定本端9999为guest方,在192.168.0.1上执行: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/toy_example/ +python run_toy_example.py 9999 ${host_partyid} 1 +``` + +类似如下结果表示成功: + +"2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" + +5.2 最小化测试 +-------------- + +### **5.2.1 上传预设数据:** + +分别在192.168.0.1上执行: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/scripts/ +python upload_default_data.py -m 1 +``` + +更多细节信息,敬请参考[脚本README](../../examples/scripts/README.rst) + +### **5.2.2 快速模式:** + +需要和webank沟通获取host端partyid,并确保guest和host两方均已分别通过给定脚本上传了预设数据。 + +快速模式下,最小化测试脚本将使用一个相对较小的数据集,即包含了569条数据的breast数据集。 + +选定本端9999为guest方,在192.168.0.1上执行: + +``` +source /data/projects/fate/init_env.sh +cd /data/projects/fate/python/examples/min_test_task/ +python run_task.py -m 1 -gid 9999 -hid ${host_partyid} -aid ${host_partyid} -f fast +``` + +其他一些可能有用的参数包括: + +1. -f: 使用的文件类型. "fast" 代表 breast数据集, "normal" 代表 default credit 数据集. +2. 
--add_sbt: 如果被设置为1, 将在运行完lr以后,启动secureboost任务,设置为0则不启动secureboost任务,不设置此参数系统默认为1。 + +若数分钟后在结果中显示了“success”字样则表明该操作已经运行成功了。若出现“FAILED”或者程序卡住,则意味着测试失败。 + +### **5.2.3 正常模式**: + +只需在命令中将“fast”替换为“normal”,其余部分与快速模式相同。 + +5.3 Fateboard testing +---------------------- + +Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以通过访问 http://192.168.0.1:8080 来查看任务信息,如果本地办公电脑和服务器之间有防火墙则需开通。 + +6.系统运维 +================ + +6.1 服务管理 +------------ + +**在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** + +### 6.1.1 服务管理 + +``` +cd /data/projects/common/supervisord +``` + +启动/关闭/查看所有: + +``` +sh service.sh start/stop/status all +``` + +启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): + +``` +sh service.sh start/stop/status fate-clustermanager +``` + +## 6.2 查看进程和端口 + +**在目标服务器(192.168.0.1 192.168.0.2 )app用户下执行** + +### 6.2.1 查看进程 + +``` +#根据部署规划查看进程是否启动 +ps -ef | grep -i clustermanager +ps -ef | grep -i nodemanager +ps -ef | grep -i rollsite +ps -ef | grep -i fate_flow_server.py +ps -ef | grep -i fateboard +``` + +### 6.2.2 查看进程端口 + +``` +#根据部署规划查看进程端口是否存在 +#clustermanager +netstat -tlnp | grep 4670 +#nodemanager +netstat -tlnp | grep 4671 +#rollsite +netstat -tlnp | grep 9370 +#fate_flow_server +netstat -tlnp | grep 9360 +#fateboard +netstat -tlnp | grep 8080 +``` + + + +## 6.2.3 服务日志 + +| 服务 | 日志路径 | +| -------------------------------- | ------------------------------ | +| eggroll | /data/logs/fate/eggroll/logs | +| fate_flow&任务日志(fateflow节点) | /data/logs/fate/python/logs | +| fateboard | /data/logs/fate/fateboard/logs | +| mysql | /data/logs/fate/mysql/ | + +### 6.2.4 文件目录说明 + +| 文件路径 | 说明 | +| --------------------------------- | ------------------------------ | +| /data/projects/fate | 软件部署路径 | +| /data/projects/data | mysql数据存放路径 | +| /data/logs | 日志路径 | +| /data/projects/common/supervisord | 进程管理工具supervisor安装路径 | + +# 7. 
附录 + +## 7.1 Eggroll参数调优 + +配置文件路径:/data/projects/fate/eggroll/conf/eggroll.properties + +配置参数:eggroll.session.processors.per.node + +假定 CPU核数(cpu cores)为 c, Nodemanager的数量为 n,需要同时运行的任务数为 p,则: + +egg_num=eggroll.session.processors.per.node = c * 0.8 / p + +partitions (roll pair分区数)= egg_num * n \ No newline at end of file diff --git a/cluster-deploy/doc/Fate_step_by_step_install_zh.md b/cluster-deploy/doc/Fate_step_by_step_install_zh.md index b55677da99..fa1a0e8821 100644 --- a/cluster-deploy/doc/Fate_step_by_step_install_zh.md +++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.md @@ -180,10 +180,10 @@ echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` mkdir -p /data/projects/install cd /data/projects/install -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #传输到192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -336,7 +336,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3 tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl -pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index +pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #结果应为161 ``` diff --git 
a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst index 57371ab610..f8f5c342fb 100644 --- a/cluster-deploy/doc/Fate_step_by_step_install_zh.rst +++ b/cluster-deploy/doc/Fate_step_by_step_install_zh.rst @@ -202,10 +202,10 @@ ufw status mkdir -p /data/projects/install cd /data/projects/install - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #传输到192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -359,7 +359,7 @@ ufw status tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl - pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index + pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #结果应为161 diff --git a/cluster-deploy/images/deploy_cluster.png b/cluster-deploy/images/deploy_cluster.png new file mode 100644 index 0000000000000000000000000000000000000000..b42664ca13b54be50f83cd1d7c3f83a6e9eb76f3 GIT binary patch literal 25518 zcmc$GcRbba-@hVKp%SMuQ%G`D_9n?58CeIV?0M`>RL03($tu}GnFn!F4l=W{M~7pN z9D5zV>!8oLzxRDV9`|3rKR%z+dtLAAdR?#A^Ywg-S`CbzYoVpY+OPo@-9jG|3bR~~G^pIqg;3_}A|k1{{M6e|`@x_#wJx|F<)TgZ=ZK|I_gQ z@qVOWzW84=Ivaa1!~Z-(^k!;4o&Z?9D{PY@59=Bk^Bl`hsc1#GT#P(j#Ziyb! 
zaG;uyLer@27v-g;SSPqqrb%&e&L<<=%*Vk@bkMM)mF__ob?j8ge~yeGw8i+=n|t&1 zvs8cf_KstYwuEm2lc1DYP@v-2gDcL<Urhy@#4pXa{Y8z)_wUI%ngiu@k{m!Wa#Jk`Fm5mn)`9(-_US^-F!jk#VN6M z{rJH=z*RH&^XKGT>9D~){*wwHZF7>FPdmaFIG|)tCL!@=PE9iX`Nc(4e@%TkTiPlx z;*l?0*_loj66kVG$LtpG;Dt#{rAmYX3iUJGZsd*l6ziHpO93#RE` z44f84CVYnN>%T@cH}@7AaYGBd8@wTfd3nW-E44O65Hc`3*k^Teg)(^w0Re%$>+H(q zQ4{?f<=>y)sh{+2r&TVW;%@kB6+>v}O#3Xdii;JZ2@YW~U(4X4`i?6Ow~&HIYM6hZ zX*|YvLggXD=zsy6l$bqUUS7dW$|FhB%wr-iJ@{E@RQTOt!Mr97n&Nb^=-JSX*WcQ< z!Vc7b3{$cl`wL_Fg@S^jmP@7 zWbE%5841eA&8(6K)7f}HRNuGF^%jPwmjk1kf_@bmNMdM(!(sH`O^~b}?oSBW6xR>E zGsviTU=VweZzyX^sznp7Ee|->`x@R94SjlK-2jYH9#8`E8M*Yp#-TMn!Mt`1PH^Sw z#Evvm58ACR0iOph3(_oP54E)+mx6M{P~T*@UTpvC@x^eb7V##iEDSdFKtB3{ssmh% z(c6_r457j?y&4)h)d~yopTE(G9=fzWgLKTr$I^S7SF~WW^OblU~ZV|GjV{ zL+A)Vy0m!-4Zk2+*yoy>ktJ1Q_K(r9&-wXW9Na0qJRZNR?f>4C48%?V0~((Es((|+dtj(k@G!`~vpFpGPvpBvcF^O;R$%lVLU zgY>4#&344c*^(PIJ1G>8*T+wmi`tcS=<2qlma)#ssV)0>sz?vE(OtgmG%M}NNMlBq zHCjO_+LP1N%3c)SdTx>TZ=IBo&=d|39QknmW{@MIS9s-2fqeaZ102pb$*l5Ix30GZ z!xH#wuCTGBKT)R?6)Hih8D_LO z?vw1V8~)t*=fWTVdco;aB5>Wkc}{2rvlC3-hAtP4=<(M02~W2p)G@qM7X^G8#1%9D zer?v#cJk^rv~7XgB1ciBUfaVcTB1!jp3->M;mHrMF-#RdX%phxIU!$l-7g$DfEeq( zk5WV34_6-5=$4)9qKOQhNDpw~#G5UR!k6i_8}&=Yr_A6m7=GyEbKN<>E7mF#yyEmY zmvNC;ht1IkM_3wvzwMs7jgW>6^b8#4yN~nwINCUiL=G`F^4Qqe6ftehpq^|ehS1z? 
z57>I$mdatI^*bMOzWPD!y{>{G?QI>0`KtM3CbI-!#U7Tk}gY&C&F4%L{oo0e&z(To-UZJGnS{ zIdZ@owx@5?sba@o;pSp-GSk-L{NNj>UUF*e$$-mTs3}n#FSKCE_N7lqm&l+1FK=UD z6dj$IHcrE*+>U3$96Kqs@isuW)i-(@ae9&!x9D)jr`%_%5v$o)+s93w(sN@j)1N?a z#xSv4TaL;c=J!Ou&qfs$Z{K|&KjogR^us4VJAYT{va-2LL9#lDXR5GlNb+!aR!L7* z^a#s$G`7!)(P=Q;hM|}7j{mM=d50&F=e4}grPgnYe>4`%SD5%i3+yaw5D`WbstbY9 zPLs)n+^ekeqn;O%G>)hj_i>>rtGqk8CX-kUrWJSI-G(h1URm3?GO6lFB&?sd+q_h0 zyac0OaA7ejGpc(2y#|1xQNbH4)B_dB21?A{quuJpK*h2eL%D3ufV;U^`8mjjU3AI& zunIDJ!;BnT_Qy<@;QGo2^#?o(_Bj_e&LPa zx967nM6@qCpRb->7NXJUn!VBL5%72<+YgVK?GHnU7>KWa8+fl7?NLu=Ao{vJ4q>c5 z-j+AdO>fPsAAZCYFF^~M@`SUu%`2IwdW?o1;ra)Xg=dIL^by5(v#K54D_qP!T&B#?fO(tc&79XtlvWWYoGj7Btq6U?ueyx)}y`cINyec1PEsH#Jd~L*iF^JyWoE;9E+nQVPVhH1?MKUoh zp6xbWb#~|%bTV~Xl|hI24f#G|!=vF8CEskJi<=kU8&i$m{DPLb)RVOUv7B-!R*iK| zHW(Dwd|=qcEsDVuB_zv~O@)#R7o6I5fDPg8wm&5bXDR<~nWGP~h*N95gF1|)u#!#A z0;gTz52Sm3j#Z_*!auqDwlWLd7-S2Wvz@(NH+uW^jDQF^lp&X)p{ok%o3g-RqTXgP zJ<%NCczHY3fU*TOY&@)dz|0lH3v?2A67>8Kh|Bx{V$9LkhTHQ{ZVH&;x2LDyr*Yp# zY)%I{czmC7f+wTUs};RUs039p4(`RmwI`|a`z=;fv@1}SA=5b5{hO>@1134QE3vmc@?GP0 z`TUodMBmh-WjK3Y_Q0-|)6Qgt(VIn;8{2VdRO>GQFFrmrx4W=MM7s(fdwz2nO+Z{dqiqjo$tB-S%NFU-266BslRB+$4Lxho_h;cfY71G1CNOF!e8 zk>jbM*3=$a64tf~zf8y#*~1$XQw?|CEnq^Tnd3(#-t6YJJe>+o4E2OsuV?Ij!Fv1| zZC7x$L$L;N+L3C}`Ap5@(;GR@hKpCaRsebD@O9l}m+q=g(jmMUMt{l)Zt{Ck#xpGz z%x+z^I&1$t2MRtF&_%3lw{E?h0V@;b>GqEyBv!^S5rgRH&;648Sh)6*anAh(`bfc+?KdfYDd!nk1-*~Vn;dS|N?~qUMrm#9;Q~eOq zKF}29NapfOZx%xp-S@ja!Q_&B2_wKhy9nXUgHw~B1?ln;BI-W2>1$wZMok;AJ}->v z6VoDUj)U-;-trtzqMHJweKLVr;8oiym`0M}dJ7FfRxomE zR9wZ96{4mUx8{c18QbQkFG(aAE+6hmHVvd5%G&5FRAThG;v-4Luf?Ih!1uBd4&d=G z=yz<>^cMOx2+pgi2Hrck6&L^ArhBrFtJFClTV;DAqW6AcQAHH-RN!WN_!l(O9n`I< zYmMQn)kX9cDHReuJU-`-7Mt6Tu96UX_`zG499IoV)vuY(i`l=q;-Im?X4EuTU@PI9 z=k4I~`ZOV7iWJmLK{RvA4B2#0PrR%k+bB?SqF)#dKG;SmCDM(BGm?2D!>j#8YfVh^ zP70j`L-Ne{`NQm)Nz<180_PN_KUOJX&@M7zUKCz@`#P2k+I#g#WI~-HJ{~d#Fx?Ik zcMlE;p6)xm!_6{Bd(wz5&WmCe5Y>6jgzR$mcstXY!c`NqR8>NAw2-DovAUNxny~6e?0@HWZ=c{ 
zM<7G!*p7snvb@0sk}dS7@w}CvaE|zuh=lnVI?^UmFxnXAV-YOXtNa=?#b8#lQ>fFF zy^+d$wX=JwTzi4Y*4JfkNN(;~^BS05#mB9>*82zcV-MGnB3&}P8H?ZqDtrwY#OaqQ z@e&zhlra6;(>}Zt%BLtl7ZhRvsy6leYK!XLNg9se| zP~^Z`k%HGMH}NAjCY}-0#Q_Vwh9AxDxcjc!Unil4cy! zpFvWwS-yZYo-&y$y@748`&>5l)zqko3RWN?&?zk3CA&<}H4p?B5<#?Yuu9p!HhSHx zOT>pVh zuLv_A!>F;oE;tv9m+~jaeex9G+?GaX>`v`TRNm>hy)nj!z6jn92@I)Q4ayD+=K**# z{XYKH$C{Bn>nS;X8mF?jc4vSD^u1DVEt8&Png0Oh1~XJjGD0D0C-9M zS*M~r4SQpkV#Tg@*}Lq+xx2iEKhv7Q^j`)#D_*s4(Y`4kX}9&NCLZzOK%7Mr7Suwp|-4Ax;7=-&a-)rXbjNRERGu0vM5<9t;@h{kI$E*2AGC*!N zVXigm7w-OQG}E@48gJ)rF<}!KS2xsP2V4n=_5APRmgZ70f1rO`n{Sx+u|IqwyWarB zbl=Gu5fS^bYT8Z<9ErhzhiSHnd1bsDm^1+C&7{kH2F1{V)v(kN&t;(}sdh#_HT)~J zf4R3)@}W+X2lqbKGT5z8rm|eWHk3>!(z&4!%HGJpv2@W(dUr71;kNU#b3YLQpuni+ zP+;+u^{OYx9a7)TjPOWv!8r>16KTOzVf6OP#&`WiS+EW_T5ja8KLJS0Zp&}VCL;6z zTt*6xVVYl%xIYK@_V@B-##+=)Ll>p}MKfZ`4Y#C?ni3BX$K53D$zrXT$_4Nwlrq>U ztmGF)K7v{G+r<2D0OdB?mN6@VGtP z-T>zF%lg`{YmFNlwt4+CeHtTQ8uNz_p-F(}&rZ0F_*k$vVmoeck{WQUD^y%Za$|9y zfhPUB2gC$ij36{6aV~mkpjkbNS@nC%UQWt#bRdt27e_o90aK8I-Peax>A#^@dYV!< z_Jc^a6y)4@N+E5dmZ?b$lZS)wNiA(!z&;bnMQ~GBcvjr$fS351{qc9Q1T}^PUUpb^ z6(9N5-+21`MBY{G)O|f!vu!~jL_jh4sx|@fNeVsAEM<6e5;A!wIvEIgvEpNYJdQYi zj)1Tr7X%@LfJbj)QS<{6z>ft3d|&E%iv-R&S+TN>Gw9GCI&UKH~*_83~Vw%LjVfifsBq; zg%*E01(w1Eh_X1n*!g%|`=W2tk${(Tfr2Nb_P`DzgN< zIMVifUU<^V--8|pn8X1!yqiR_l_;iP0G4IJ@$SE4L5BlfpoV=e*(ApuD408Wf~Wmb z)4e~K3dn&1v}Ob2m=to}VZ4pZLFAtQ=Q9~F+t77b-@M}1V0`MKuWC4)aNHaQH>Mdg zv@1+H3iRn^%K`WRoI0bh1vtdvvi%U+!~w3{QOw-F9KZny0m%KDe901!T5!wkcsh9hb6XuhQUXLHXFxid0xM*q*^+(C6 zkpaRECITUY(Ki8eiEXQ!A3MaB9rcm!>KOp_gcU#n2QXQ^K0yd=S|EmjOAo=VjT7_s zuqMxX)jKK>!X2|yKV-RoVQ3n5zYcK+t^>`vNshY_fuoWQr~iykc;-;oA<@5w%4&!@)6o(4xBXcIK-2Nis9o2_8WM|pTu2Dp~AepvD{m?Rr;wx zOfqTdDds=D0E%Z0G@lZCoE(??bP})GzWE&)Lx|7-a3kS$CgQFIp z1JMo8I-p615JoFNgEc5-+iBrJG*v0pO_tkuBFke;x$R~p?$y!JK^XKR5=r{s%|A>z zb7N;`$@iD4FOSg0eo*HJV6uo=y;Z;lK#0&7QU;LI zCII7gRmH1PhF0p(U%SAYRuAoj*8n&^gZWt3PYX7KjChW6 zG7N?VgWIhSMWC8d^oj9iEmBQ!MZWH_yDW(bVj|E25t74wbnqi_&tHD@*k??ow`zZs 
zlX?#Sz8$rj@y#>!6A$(fe{oyB`>iq?p@MmRHb1)Q=)R>Sdv_1kNj)8ST7-p|rQ%+h zHbhqz4+dVbYN#-U6r0~Rjv+aQbTXrc!?CW2#s^S)|Gb!eKz14R3XjgoXtM76Uvs$u&!?!f}sWqrFJF!G9 zV-NPP6w)yC{z_cGrQCov*)LL7>j!B-puNJdU671O3Z1?G6hY!1d(|b+-8@;Gb$C30 z%c$$pEdWt1N-iLr5*tYrte#i!>UHAl7{wa2s44c>Gr8m(&tFU_-l!PjZ&aCx52b@7 zkF=HMArjrSK_&`}u6IBPx{qIL%%gWUZoR)Vux)IZYKfoZ^%*w%HpJ)kx0=es?TvUQh-~C z`{o%S>V8!ts6CCm)=}L>w{fjT&0hPyn-i-08!y;VMbmXIt`~~uGxUm5BjQ4%GjFsD zbya*vSZz27xuMvVK8By_x_q2qX8t8LNVirDdn2Gyw{D9xCghxll$&UqNiFwIyKh@G zn8-Ad?N7K$?9i#si_}OdqJIn2_n8d^VQ26xfV)<#3 zFLr#8K*7gbP$H|prXvG-jhwb)nx&gsezj0;Ee%(EdX3oGa$AY;&2WMnaLF06q<#f) z9j1tLOIz`@HXYcwY*B~=-^aw_2jc)k^}1crCegIrP7yZuEUSf3X4{@@K1-eu!+}Sm zWeI_e9+FLn9$8i^XZdpKg))S!`uw54SHlOn1H^^Ws`!8QIP2ciUxkg2fffuG$-Ljg z_bXcMwG5@sUXpMZES65+1zJr9w2o@YyFQLUKzM~jcHOl5t=*10_};XrLu144?$?11 z7J|v=UW~MRV1It1$FG3Lj*VVv26Nvr)uYLIn%f1RlAM~dnT5~@9^9~->~L|e8XDkI zsDRQ62T#pM_b?Xs#vT{ow~_QvW^UXy)Rqj$o2d~vQ% zi}O%BbG;+m?{oI2+G#}!tx`mao|<-ckQ#`Jp8-S>5_rdTS(>T3uAhhrxZgls|B=)} zH9W77L%?efd!N`K#31M9w5L!yjG0Hg*7yh7c|H@mpWt)x`XO=GgsS-Ot}U90$PUOw z#4}X7gx|=~`4BzTrbnI2%MZ03>MWYBdLzCe5GN+OHSw)xK%Od@y+hzpUi=ooS3q)s zo{fNT-PNey)nYE;UEI4F!~GXjSr1nYU}3u9Mog2es}GJHxRUILyCm)))_u?8os@vP zE9yC3-!~v^{AobqUWpd39=q_W46MA{XQF0p!0S`E!sn9|Lo*?Y72R^-OUwSjTPHoT z&WXw$()H})auL!G7U%9Yf#lNJqg0Ur|6ABGg7UXdD=|SSw?ZzEJ6xv)#MBi)#4d>+qa({iIUe;;!B*C|NtvHNY)*Uid?10gob z){khRlzE5?@4%+s^QV6(*Muc)k9Vjb79BvXG~FY_Q%JXjj=oL%+5WfGvUi4R7h2~l zjN9A1j@L5GzS6@d8Kw0KvlnERU{&3-lXp@UM0ZIfpp;lyKn?)eRRUxBfA9abQa%7e zqA%!X#@pkYn#jlk24NZ78Y;6IF7}@MhO6fbtq0fbV=3GXeI9VTc8=ga3&~X%MtC=^ zmwAamYd%AHk%AYNYr?>){bL4+Y36Y*g2nF-a}ywd8N*~y!{NH9r9jcE`yuTbRVfLb2+Qg1j&o_nd^3IIdqy<@AO4 zzY`jOaoR8AB)j?L>rO1BF)WEQ0+br;zxK`~5ik#w%n$ThoFwGUx@-1+tTt=@{l`o% zzvqb1K}y3z9+AqNqf)y7I2fRyJjA3QvSSo)uVlbd;`f{Qldc0PXrJNvjLXUTYEaqw zrhU0*yo3zlZrB_2Q>o13^P;RzQ}ml2T5hVanQpD=38UJPDUU&Bs$s1tM6+3>%~yzx z3=GCtM||NNyA=L_wtj26?dY2eqydl>L*5?-vL-fRkQx|Q|?OD4qK7l0w^PWe=tH?M#`UF;W~ERI&reY zXW<3dK6bM}kU#K4K&WVRALyVOf=r7vDnE4Cn$c~cgOC-+?C<-Bs9Je|hMb)J-~bh; 
z=`%!Hka)4i72)d|KEl;jtI|R<>oImAh;O;lVC=6$0iN1y!eQ%6XA}r#3ZjiMj122{ zwD*FF1H5B8E5)I_qlTp{D38<1c<|MH_v&Y{A3S@Z&+|KGF~S0AY?F?ZL$A!X;qdFk zz^M8a01cH873gfbkOU@K=!eMb$b@mqRt3-{{2bLnF0@ptP#3f}4qkFCkF;Saj%_d# z7Vy@b;JUYHQv9gbWq&h15;E#)!3nH^2pBS#!%qA`cDe%^I0+wf5F;4ECJr{Fi;-0x z*cf};Y&*1Re_U$3YWr5T)V?o>Aave@^k;u2{nKKX=zFLN9R>GJX6qXRQ5W zO+l3JZ}sToW&Q`HuJhbqvb>6`AJEdBydL0TPHZd>n&v7-0l{$P}cV zu1Zm2zXiP5LdLkBskuV;G3+MTmz)mW6MzmQHU+28++ZoiCyg3c3%|Z2hMs*N<=w3; zlV#)X)o~LNp56zC>&%=j&YIv38d>g=9o=2rtSsEcr21-5Pk^9}wnn5Phrj%TN0yqfd6Ug75pn$v_vVYSG!1 z&!}g}yRf?}B`5c_OlWw`8}VA{tO-rQr|K?_l=x!hg<_!Vl)miQ#spz&&*)A{zdw6cb; zf55F)Q}+;r2UhrJMaabEVd7`R4p|!A}Li|r>$Wh5mqRO(nXTAOl^YDb|?i2g429h_>lg#h9+MB&g>)%p4;a&;JG3c-q zcIL?K(b%_|HqV6;_Z-FGJ|6rH<-$ZnJ@Xi0_Hr)FIY@6mIbs^))27O-&SG!@V-vDR<@0+?pk{9= ze3zOln#p?W&2I04R6#ijW~erjZrezxJH74E`s2ptQlH;DpMJ=mPzP1HB#oxS6y;kF z16Y+wqDrdI=4t-JEgpWAv9yt`dxT+#%-D^^zry6*pOSKa$Y zpjtpbwK$SM&DZ;#*Q~O`Zv>kd3^s{j^gd?O=)bV+PfuJPZ5s&Mw zN)=1$?>MO1`B^%_EtJOajEQwgKriI!MdOf=8&zW>p=3gQ0k|KWQ(k(19db{XWX zk%DzUmi(_Z0rnjeo*s46vuml4*FjlnhJN2P8yl@EzPVP2TCojS;6;$*=z5tdJ=;d6 zGP~Y94<34+c;hP)nc#W?5{QAj+RlSe5YDkbkIw>LESnY*XyOQx4rSDP8cT3~=H(Rr zL{YD=sUaemqRQ$KnM9j=F*fJg#V-f$wea+^kM>}-DVMw_2VU;OC?74N3?YfH8ilNj zqYCh87xcK(U4f$k3?=x8R0w0^lRcQDILfNk(_hp zRppi`dgP_{RpNV*$A@@)?m)3Omw>+@2EE(TQqY3v6B%&$0_9?hK(3g4QZ1utzhdbG zYcxMO4$b#H+|F&v$lG;w!9bomu3`Ii!e=C7c!DVJ)G|;0l?&2Mpt9Q8UHv^C2Qr`S zqn2jLhdB+uY{XzuUh&&aV&Q=I5y>Y$wscFo*qwEc4O_g$X2Vg+)>w*CxI!Hbk`*Yj zsmJLk!4KE|V*7-0G$Aq4*oBT4a?z;MCR3i(vD{%5mX1WaIo+{Sj%Z#7;r>tKu-(p| zU(^C0of>sr1}LC30m>;VO;}3DgE}k|66R~0?(CEqPJM!qyzTJ6I#en|UtRD5IBExb(?GIM2I zBa8TE_|}Vx!#)AvcA%7evQu)24>y*=87HkJ8MBu{nlqE8414o%Q@an9H$;+Ue-w-~ zFTIZ{txO+ecw$X(7{?weH7%SlZ++ssR0g~kp?lyufj0j@VXk^B zX4+6nXfgI$igT)b#fq#ave#AKGa|8}zb$@gf3(Ut@E3jl(1YnzuyE?b;z}~3`5o!F zg$7XC1LTt+#%wL8WOp781;aU%rOQwUd@vyN_7v|U1ODl54$qKSsxmKSL%|wHPfAFy z5JhQpWep#ovRqiZwOt>zCae*tCD&IRs9~Vqd3f=pRTQGUgyjBX()fI?7PPaSVjv)5 zvzwv4>kOlhCYeF->gnPm+yR<=L5tJ9N&Ic#WaIu(akiEHGcT9_h|*3}A>E33+|yKY 
zuiKTZPOVC8w^p#@-(32!Wf^Ol-l(Lst2crZej13`8-{1#Qieqkc$ z9?Vu&{`Pu-%laNkSJxe|l6MaVGc>-YKiYf+dpu!e9KXJQH4rh^r)+@d*#je~^ZZBE z5lr|Ps9gOF*;$pOHX}YhJs_5wUb`G)dN;Pd9v!N$N9(QKHHEJ)3S>|9>eCjPdAD+H zn>5J-+lO<NCK=l`6;r{e*a}$yn!W7*>rWReL0oXXG(&z}Cr=B3X}&xdG@KxoSq7d-41P(S1THI>Za% zzhek3f;NFHFQF1}W~4fMZ3*OKG1ihjfct)7-bjNV6y+!8Z-LA=A(`;j9K7In#%I^v zRZy1&`4}$_yQ<2)!{X-Q4QGaC20+<^M98o8kUN7Y(`AY6&j2R}K*8)~{^c}gr|1*6 z)O|3f;~={9=8&+jz=Ku<5TqX6`KaQ`BlgA(L$3nYUEZt{k_iXp@%oJ2b+cZ;NI^i< z2$~p574~LInH>1v@-I_Q9MaY?#Vt_sI_TW-8qLTDaV{tq96m9P0iF!b|27V=+lZH- zSan!2Cp7Pj!;TTkOT-h$$Q}H=GIo>0H~Lnd4~)M<_nHucD2uQ-W-7S|;)&)TPD=^z zXphdlJqaYD6BYxY>z{!#$Tfm$A}FB+O`m{Lo_$UtBv}jkbCT&SWST%XTVrSry$}r% z<+a9RAuL!>Y8AguXgV-Vn2wMHRSPI(=^~-jLPT=uvu6iCIoPukM~yiLWpU7`aJLEQ z1Lz(glY8p(crjjj2WZ02x@NF_Z?<>&%}C$g;@Ah0Algdzu#^BQsOt*_pzGo)7Qi-*v8l~_e_}Z*aL0LtEl?wgF8-Eu9VAk7Vf3Ly zji7i!#Mv~zy9&A{KqC^U;BKjpxHyAG3qsbCh>rq1r}YSsVLQi755=-~<9+mJC-${P;gxj=`85B~fhGucn1 z!{*~pND~9p72+Y{WXFOkg#ACdbJjp@nJ|YKZcvAm=loDx1reenYL;!Oa!hYki6}Sn zeCBE3r**O#6huK;k^(6Ja#{9pzgaV<%yzGtbc82L!s8-2kq#dRWQF$cR-MC}AjiSv zGC;*$07`iAt%J+F#!u-9%O@4a{EL`njc>_o|kE@G=9t!qT zphbr@Y}8G8jycC$kdBDck_hNhlXBBScFDID^ZhnXY6dwWVAdE=*U#5d1v%6<(k~%* z0j`_@xB@c3poi$-?Qi_R+l3Uj4$Ir+KR|^VbjbXzE|Mky3*IJGhzP9znm%_V>D)^a zQa+$J2JDnWG!7(Xo{kD71~yh6R=){aCMm-*xrd*A{K>wn5|NB}?`TTY6apv@sI(uH zwt${Q3JG-PYJD%XWx`k!#XJx0uLX70J|lmy>5A6n^R;_J3Oh8Yqq`no=C`o4~jZQ0OZ&5SWSjhzFJmv=2HMQ5lU&vqyufRffIgBLOqww?l+?pH|1WHF!QHzDq}#4^h)68g_7&@0Td=u%~IZ%%4Vy zbYQ@Q%ajsQ_~v(e1lM@U?-XlRj2W9qF!+oG)E}-x7;m=t3VPHrQmlhCimZ55Q^K)q z^J^hsVT=UOsi@ll4Ulp|2eut^8WgU@LF*&XhTL$_I{})u_EzTs5+_{;bmbJeb?C~t zfZ@0`k>)ATd;r2|wftAZRqlTpZUC|fAEnd))r`I6wvqO7BsoDOA|Pzvv0s&gH4z-s z!E8U=07F_TH?hvj)X3C&b3=3?CbB@F0C*m)5uOP+%z9sncgUd^d;m<71LR3fStZ&? 
zC27Xk8wtq?E+Re_EI|HG@iV2pb7F@rfYdJ#((YFt=RBwcb>#}XxCXNg32X_GCNF4b zV`K=hyG49iGL*UYs6z+rLLd&P#bSQS=!Uc=WAas-BqqNbZ+D`ffZivwyPkmb0WfIw z1$`Bb@x-7PlnJ%`X3rqO9i&GAVb#h#PFPUJJG0$%ri4z1_BK^V2cp~nkfs9@1C9wG z4ros#;ITspk>#_$*ystx3eC$&_d#s-sxB7wV-dKOwlk8k9U?xUfk-j>11J^>N)Yi~ z$NuHb(vH>IwfMxjyBoViPzNETm-Wrf&T$JPh&5S#{9#Oh0gCSdkw*OG+cF{@81&eB zP6&hTT>hl}(;M^}CY|-0t=TK1p4b)ynW%(gl1%~c8zVpr<*NS?ZsH?NbaLd22!iJAIMNM%@gE;`d((T_ zGO*W*rdxXr^~o_v&1)}@Px!7gG3xJI1@8Bvw(Rg@b9VcP{Q_+M6204+eiE%E@!OH# zOm_IheIYmEeKS;_8^+H2g`Tm{+I{6J!O+{`Nzd-BNYAf~N!bwb8QLXz%{E((Ncil# zt~25n@%47)Hv4w?g-n$5e4^1Fx^R!Io2vPPfaL(1gH_Z)+}tO~n^82I;%)sL1$-H` z%d}6rwM3IEU=O#{d24O&ee61yq`Y5e3@t1PdAy0V7vk4>Jci^8I+*jVHBm2wa20%m z$aCV}Ig^07(~=g)6poK!>S)yX{pL=`(7wG!1XY0@Ga*LS99;VM(Z6r}_tC!|Q`57D zV@_SC`?7XI;k4R$A_#kdU(qp`(c2OtRxm-NApArOMS(hsf9y{$0C(2$qp1;QZSVc; zaMnT^MBqklqKp&Q$^8)2+LHZ51>9xKUmlfvnPMFq??|98ur4|WneJ;l7j9|f>N7UcW*EF{+nif5%=@SlP4SkaRD>4v#0P@;6q{XHK?j%31RWu zk*wz?WYeAxRyxm37zu4K>nk`%ipOcxJJ-b4gDXJLdF$*#0vI2 zTHRuTW@p_hVB~zx+o)B^Fe&=XkCmsvyg~wECcy{yaub^;9xU)H%Qc#Il5C%2mD&Fg zSls_cG;$1l?t5to}&j1#u7vKTflo$qzu z)cgfTJ(xK{_EfLXsI=(?f&G5&)$x%+xV)QPo~x8PvC|SEK~hZQ(>HJ@e>MstzxJ9w ze}%(;HYy%`bXupyc#lQus&bQglZfUgci9)kikDm^7?aLaiavqB-DO4cY4*DcA-ga8 zvvwV)N^0uxmMq5?$V}Cr-~ZvZ@xsuOl$|uAGDwqepeb{=OF}Ozx*s))@7zAW!+HVEel^1*Ip%>2?*93A;6r;yoQ_JL zLiFTB%fkV5@=N}l%aQM6iytr2r}JMARJvGtEJk1cx6IJ;vD;*$WKotE`#TFCCwZCebZ070f1xnyc^5O9^H}IZ5o2*pC5o8~ z;wVnU=;$}fi&c`%aZ;_V<<0RnE0Kw0`5a*6^;f;L44^O2LZj3u;U((1Svpx6!@1hl zB#$pM{T5Y)b%mqpG(U`n`hDvN6S??O!uo#M%`He*H>sL9tCS2(-<>?J(Oiz*Dz3Ow zO^9+f@N;svW}C{XX>QwQ`SiBiu4};1OEIr+{kQ|!t>-uUrBPj06=>PnJl_iaQKY;5 zP&x4n?=kC$g@v(>-b`2cFeB)Ss;GIQ*mn+crXcCu6_15`dAxltgBAiUK0ch}U1Ds8 z0Ybq*jmtNCR9hW}?lxYqG>H;Z_t-}>~{ z5;lEe&D&YRr}~|Im7CeBwK#FJJ^by6TxE3p#qP4cLf&0G2CqiFaLE|&G^xV(IB!k< z_u$SWoFoc={zJcYwceG5IFih?g$g^*HvHGBwmt9&R!kTwi6+>b%?O z*PT{~A{*AWP2O92JwGNNpyGT}#werqd&>-eTN3z72xoVM1~th!ZA?3T7v?r-`HnHd zjToZNeH!6i;>^wey1?r;U`mF11?@xg^#Z3dOR-kqk<))dwijYxznwJ)ZXud7J$0PB1YIUzi*pARBDdHmo$jU!MlyF 
znWpOu3e;vwT1Ch4o@(MQRWXF|{~#6Y^KbHXuDRk2*9V_4Gp~-OgJUVH@!*F5fXt7*wYfKf@rV()#dU;a2qAw@4`0e>ac!N?}G zV=bpP$VqPbYH;*RV#r;y%`%h=&*MBB4|PM)*8|1x^<7wZ+@mX5nmA~m8Yw&iBwXNGi-U!rX%PTt~I2VrpCzs~_NK&e$)X3gTRtzcQ`-ZqLg;pQCC?+we42@Pyv!g5c+?)y6Ca zCMAXuk8X*Hta+x*zhC+=o(B>%YQ??UD68{TrNg`yrFvbd`F?9tNqbf?SCzgd>4>aqXi?Wx&OI33Sh-(SP}W3r3nr3JI{I{^k$ zUe{QP&?4NZL6<2~(B7L*CO^uu%0fZNC5sc51_~AqvxHnC2^Guu_)9 zW?mJ02C(%9Oa>aqglFdyU7z0h`DpzZ@Q&%!qAnv~{2FRrmX8 zjjNkVLdGp0mZ)Ey@})&56|(@ziS!BxPCZg*ebi7?9Ez%|rp}k5qg?+MN2>mnAGzz? zne11V<5JE$RlWP2nq_ZXnfuM>8lrT5KfOEwdVS?$;b%Mx7NQmmC6CR@sK@)ykFAGP z2rZGc?3FP_6}I7ThDcS{19ZlWRTW)O=J3b+LuxZQ`>@}j$+dfl**`WrmQr@tL@?zf zF1VWztyNx%+&Kk{ zAGzty1h*uMFYfkLG!gxWzoxQCtFn;}5tGS9%@*ce? zU{m>J2&+P>sx`eU5P^WoFs?|)3l^W_v#4pd?}w?5VI=3XoZx0@i!=_)^Y8IP16R1r zR2P$^#lfo!2QY&dli(QyjtaDCn?mcU-Y??{q>pE2*T;0-7wjV+L}owOO*|!q!!edx z2BLG`-Wo7S{Ml1&y@^HfmL8CDc2P>(_dM5sVZN+UkRpp;nw&F$vBOWf>?sMQh-4rJ z^O#7=EeyYUO{Ek=L&RxKF*IvfTk^ND-*Mt)7<9hG-+{c+~tm}%4nB4Jo z*rR4AIKxuL>F$2ST4wmNb&5ND;qL5IMeD??%+6cIRdrLjd zT4yRR6DV5V9>1qplf;Gr6&~jac;iLjgc2J!b5Zuh1IpE{oPIaI9?18 z-55q=Rdn!Hs!nl}*k$n>%bC7g-kO?sqwc?IO!f<-UYjz>_3!jY#J@nu>RmN}4;Yr+ zkgrxqfBMt6xKdW#@P=FThi|#3?3a^Pvg`EC z#a@qjsS)(%LI*lP8g;?IT=g#BOyciHlYO>TD-Lj9ud>i?;-b8usn-GLdN}qI9brkA z%OdOzq^a-r`wi68ur5tXfk5f5$pgw^Q3CHjEAu}_CI1LD{bG41d+IXtt5jA!qwMLq zM4pQQk<%nd+xN;j!5`diWd?spztKqYq4*gySzIx`!)C&J;RdJiZ(|yl>c@*p4wpqs zk7-)To7>vPUH{Q1f9g{I7v9p#Qq5GHIqFY&+&6*{d83I$8%yiS6i@B&zqYncR)s|O zTGkVAJPvVzjl=6amHJ12oBvnam4`#Q|LsmfCM6^%ON%AhQ)8?pDH>Y`p=6CPmPCww zbf_#t8p~+Lt|3D9U4}%~5ym!_5hmHQufzLwmftz&{k_+Fy??yFzn<&4=9=&NJfG|S z-kOdd!}gP;G=9qZXZ)tzW9Hc&lyFZ5n|5Q0?Vq|B||S zW0#lafwlKfb<1-hS+X*P;LG?|3nR=V2`r6JjBwYiv8r2bMoI71=PU=eUX>+_Ni7)h zeGw+|8=2RqasuwakXh%W1|c4LjuPf_cO_oZTP_`c|1>|UR_<%^#F$vSuD?K)EN?4j zV|~Qr$jwJhH2*}g_s=we{4E~s__Gj~sHh*x2Y#dyF-dUWn)9p<*A#lq0sgs|(fKTT z?<8D_Y^40N)6yxyy5wbv&aLY-b813eCS^_|&AxbX_F$lljt#B)wp7k8OQWwo&2k>P zQ-7=QI2hJel;x+hKQsDN&Acs3{33ofN9ei^jQv+6C2(+w!~6VxK|p^329rF|mN_#MKu)^r5a 
z^B8hDVfjbq(SuN!Z8FM4s($SD=7+|0Q!S~ajH5bdjOwoSmW5PlTy)O6eID)?m1%Ft zsc>2G*ynlZnZ1u^Ksg2jyTJ3tI5WvYS8fa9jkF$&6i4E|G$EJk}hUMUc6#>y>e8`1Of<`kPIQ-|y&RLL-YyWpeXiUYJjJyY$la z0;+gejUvfR3LBuxEy5_>C}5y87rAblF7=*jvaOf#jR&E&49QLjwl9fpkBu-Im($1$ zY7(?S!*8BYeiyPegL75LAr=7P6U`goqWI#&CRv8kxH$8 zv@IYt9}xQ{IuETyI}?5j`GS?3`9T0qD#;(pkg|EOW#W~lar#8)!Fq4jtI}GQ>@X0VV>-Vx= z6T}T$>2~oP&>PuWaWjfe$P%7axa(p28knz94Wwunc);*H;|-M-bDqm)WwjcA8kd@< zzNJw8ru+k`1w#5?rrsR#-&RGhG+rR&o{jOTkqQ|LidHE<1=Ma`JsJ*^jNkGY7xCiSHn+7wUZAyO99!13+dj*;q~^O#k|Jbidph`^>Etw21I*;KFkd zhi)QW1M=S>nMxS0S;Q9=dDOHM=X()69HHh9IIB;lh&3rw2B|;G(-t_C5BLf=tn5g8fx1w(-v+&I#yLX;= zb?2at7ZvJTpriifx>Z=bqnD`cs0CS8(65v46-@J9Y?imIe@aj2%<)fX{VwBpe<7;?siIqww9JC8tI z_m^Xa{*5?|hLOhG$G>0x_LSb7nCQa;RBHd8|2qP z@)a_nlN)0N9M^4Kc=}1O-Q1!N?jfkOTVYbPduglBAs=c)gF5fY*M}N7dyH+1$``uB zpjD8i_G`q2OXxs_!9%<^efedGBl+1mO{p2UBe9eN?{b80j13=Sw#&L2;QR_|MVr06 z#PK$^?|l*c$TJ)ZUZ-KqiF2or)O#*qQ58VpzldYNaR2S@pXB!+tnQx-_di+ebVe}C zzo_c}#?4M@vjYYi(8_->(5?@^PfvF-ZeoeqTf&h0;_-!bS)*InSCy5D0VR2XGn{tm ztSPXk)8Z^m4B%1VI%;;Y*SxmWQs}%#!96=JsgzaP7Is-ROmQadb*qb!hoSN?ucSCY zF0Sxvr$OKELl1pj#v*@+; zt_xjsV7C|-ATNxAfY;#jYL96QXMR_heJ#^XQzhJ|U`r-5(?JC1RAoeSc5D$^1P2-> z+k#d2t>LgxapKWlb`IMFY$R%7)_q0WVH)a2kc``iQ7IYE!nT4fbMCj{bsm}EH@3Q~ zZuOPW{ur(v@dc2#*+Uulud({yg3KyQ>R)ubcJC!oKH+;As<2I0r>g{%ItiE^(Y&5_ z*RDHpxph1TN76%Cr(FRN2l}=EllrZ1Y@>Qh#0h_I)Fxf&p$G>|ktGP$L;0XxWMQHx zTlX#I&cI#sOr^eG`@bn=5s>7LO!$h!muX#hbs?1mkOYe}5(q^8SR#5$8(!81hq*jd zyqMlu)l^JX+b*7GsZcyrDO$3d>LG+}#KVephu}0z<_!N=@VT@AA$;5=rB3!t6n~nA z{WM>e;*Ms1cU;zCcjr?yjjGKR|U0#9C&%0eqQUo z&OlcZx9CIpDd%E8$Z#mEn6ftbl@Q-N*j)8J)#3S=EBsP&gitE0*;3)NkI*E&Kb*-64)AB>C<|piz-*a zwwe4nu$B;U$vC)LUerg?a?U*(<&9Ri0XklKi z$EV6kW#RCz{MNe(9J*&@!Ds?!coVJ)a}2BQE90G&WU=kA%7Y>No`pE10VblouHV~T z7!qQ;bU9gJ*S~7g_^f`xceHrsEy6(!p`S4GKcTepXk494hM`G zpXR}(U)Yxb#8)>JZkM;^=JySMj?&>YEvlC;DNIMyLk|>FWA91bI0z?xk@=jW8tYFM zH8^-_BT*OsIIOMFlDec0>%R45L>ScwP9_|AE;c9L&~*w}21j|W)FIqU*3k%V9clfl z2N+|-$7a5u&6_jK-k&X7fxlL?Pl3|r(k(mB$@^0YcHxu*4yi(C*|^TJm3zekMSrlx 
zEoFO%rif6XMOuXE;}o^EPCtjO>lLlV;|{p5HgaDv?STt{Xtkr28<##mnx4JDLn0Nb zQt(U~a6D`DBhf*5{TcwT(v;kO)Opr{ti8tX&!fZRMa%Q>_^yPxS3}&IgNU)z(NGoP zur|KK-?sTp`9kPDu8Wqow}QoKtcO6QTyYB=_bwr`#j(%XxA2K6U#473cLRobpB%wF z?kYGzaTW)AbB}xndj`8yv$O7M-SvVYcHwr!Ms+rcumn~0U@58F04sFZG0UmN3l*`89dUb0}2SP?ciNQHKDFd95; zLmck$^dIkzech`^oa>QXZdtTP4zU!BdY12=Aii9hUb=>C8svS=nhTN|Ugjf!0zUD4gSF6-f-~ zL!?2Di5z{g;J@JOyEePI)klr#D= zI5@a7ZA^?(>vBfTl%cd3cE5MNq3-RjCkDmQ%SuaIe48MKgr&6l6b zd(RQ+gGB%NA%&v$wzhDRv&3K%%AJIrpMSJDtZ*Rf$?itv-d(EJbV8rG?_Bq7l~wV| zloWs~b7Mio*`a}m^ST@Yt0(c<=(h6A3Ucj8$)DCOynnglJNL(_v4&Y%E1Eb&Or{5^ zT%a$g>=qz`mie94!EjyugWBKW4BA}g0(Hfo}fgB@!`$^91p`2;T6`rKu6s}x!HzSI1EO~T2 zpE6L7JJ+x37%N>~4z&(g(Li%lyILbBNnLxW^+(OoV<2@=N)(oR48A0wy#I}W^&d@Y zr&_?a!(D06i(N6(96s>+OYSEbtFn4Ma)a(G&MXVB#<_922nr6k_>EgV$YDkGof6XQ z89)OpW5Jlfq^1;ngUJM^KrCq{haL-(5x9?MT#m6E=6EfbQ{Y-)Smj%$jpFXPV0C++ z#4d|^svJ`R-Yl+@#`<6AqDM5_=-aeq-^czgAb1zf!~?-DK9~81{EmX*zVJc|1Y{Ce zGK2bq{}M|4-(loaz-aCZ8Ai9<{!}^qU39Ro&G-k#{7cC26#XRQ4{Z8a>;lWbt2+LM zdH-VwSkL}xVcja@Z%U4TgU3HT4BG!u^T_|C*Z7Yxqkm!kl;Yp_puFb5zw1~=#}Y&j z{4Z*le~CpBAhiwXfDEr6w|ss{J-)5%^$YKTRi?<;Y&nNYHw9p07{hTI6C86Kf963l zGuyt@XSBd=U)Zxzo1MpDqAYN31sD^yvlVm%0RZ}o%a6MiK0(sMz{2Oi0Zkx=-oFCz k+5Y7>_r(9NyOO3oHZ5)jF%3<+`F`W@o4PmNU$+eWFM5vui2wiq literal 0 HcmV?d00001 diff --git a/cluster-deploy/images/deploy_guest.png b/cluster-deploy/images/deploy_guest.png new file mode 100644 index 0000000000000000000000000000000000000000..1f1876bf41a8718110a536fa49caed07d8eb5266 GIT binary patch literal 13503 zcmbWecT|(j);1pS!G`FAQlvkkG--kmKm&z+gwW?$Ey*$LCqP@y@?d=>(M(5T*1 z)PX=KJs}W^Pp2utlLzcKpy0nVPwyGJK_CoVZI@eYpkbfYmit-P=N0tZ<1vW!I z_|>31M&<=Y_2$a&0m)ja%qi{Zd8)(D0w8|=30bG;;Q$%Z+0^=QSX={tsF&l7aYlsMMDwOG!+#nnw1}10p_N1IFV1)pn*8@#+fJQlz31YznB&l= zjP8z*_9Q-4d?^7z5_*w@=D?$u;sh>wW7#dIGsh3_B1;xgbZS5=2Eox9=L@+@c=T02 zpIy!dujL+zEc{MY&1!XFvxc{2RSj1%ESNme#VY*n?(W~Vei0J645LMuUWw}-(mva{ z(;d+g5_WugKui3vK2fQ|;vrh861V11=CwR-;MyPmd#EHEUtm6m87Rgjv`^A5b$53! 
z4Oj4R(d?``22_!4pr$fmG0L>FQX_O^u^yn~hRZu4D{|tb?U=(ix#i}(s_(*(LsdTVk7vn$s|DrvGpeNJgcM@JBO^J z9CGmn@PlXY>(Ig2=ySid)P2x@^`qx7fzLV7Q>Dt-;PbsSIdRn^1U=$=OOT0;{(yC9 z5;x+Z_#%?o(Jz^grHY=KEF)oNUFr(Yf zt7nzxgZ?|}{~iZH0RsD95lZpzjba?IRHy z+}drf#5tb<2>@Fvl9l#349^RFNjK47I?@~%sQUCW@<}XPdxx1b)`V~FtMN^Ff!li8 z;Zh7b3tdvT%uMY`=U>;9CHbM*v-}_b6V$(x0QR94(f8XUxZkBJOeyWFgfg#~m`Co| zK{m?e?5iV&`pY9ZhxNr+o#Ixoq|+^I-i&hZ>F5jYee|QX@u|iY3RQU4D{qI_#`*XX zK;j_gX~Cg%LnXCR>dRX0_EoDvPW7-8Z} zY9dUbz(Yo~@*#^$-5J(;7;c|_^EzyI9=GD9YuTd`QeTB1{{id3YLdO2Xn?-bSf-sW zUg&~&b2GIv38X6ADDfwYfnn>B4wIL&B-m-9_dqS(bFwYU@EWJkIu%9G7; zaUJ@StwVQGLT%9(rZ~BI*=<^kT41_$u2u#eX)LNC*_%wm^1@`%(PZ7x%M{l1XKme}~c8$PkvRR$_D1nKIY zRGM|R^IPMrSm}64-}V4Ck?xH}3X(Wn4RY|J^*!bL818ygeyos>A{m1;RG2EQ()pg` zUX}XQ&;-98BelN}i=-*EDiK>AQI?Ukji_^7x^%tTxG?<|R!AVqnw;)E7jSP~Bwqf2 zJwJoEt`u1@PyeX?pJ2=Sl8!1Whj^O{d5wFwP)Zy4s_hXnM!^?RCV3%%z!;_!tcfh>Kr)D1@&tMO1+g^fu_M7PW1 z#k#xoyPDVu&gMq?^ob7cf%H0NY>ZBuviE!SKMQ=9`q_>7FWVg0{ zpRvMu(sk%umswL5`l0^5;M%8oH5pVjJO}UVF>V>~PHW7PL|u9EXy{0r zNwK=pVq38ji*#H_M?bcn4u>`PcdW6Hl>q523;9&@Y|g_=JodaoJ)f>yEX5ji+7GYk zepB`IyJY2Ds$W*S$s`WZk1zQwG zBr1@7rrhM^^DKwmwJqL=R6ykG28;B}e2?F;!{v^TVtK0ptjG?sWJ zwEICJW?71_cAtr@!jm#O#~*X;^>#f$$K9sLRS^LY z7c6SmC}u9N;4&zj$`Ut??C$xfr0s*Nevm2Q;xLJoz`DipR}atmvD+IW9TS?LBhU(k zQVAY}W3_H|y)6)qnz?ACCkWk-sj+CdkXGzNiOmuunq7Jm{Ntt)-`a|UEVSYs%<|}E zk1ldJrRH9BLm=MT%j`$u!Zz&pok|J~i?qx*$7Q zTpK>_SmKa2ZHzY&>8EyPN2HfA7|Si)>U2Zb?fZ3AA8xd(v8W z=_OMA$7U2O;{fW)qv+_$o7s5cIBUgrN<$5U@lY1dj-u~woMkdOMf4}iE1V_DTE%`D z5RlM3_%-l3?~`f~T$_xaMdw-*{!RV<%^>Oj3tod@OQ-%|?J={w{sM5#|Kg$K^hTzO z|Kkh)1F8H^{yP?<+W4f8BYyE{|6}v^?8M$r^MIc`L?jJ@PXJ=VjAb9L)1_^EjFLNY zmOE&6+6i8~uoN*QICSu;;iy}#N^@{;6T^>sXaK1dLt>R(ao{04F{2>%7VoeR4dO`(KN)s7x7r=+*^}-2sHA z+0j(6t|ysg=>5pxL7AdqaXjXS8msO;2iPzciUH%5|ik#&e&!+ex8V(Q6!6! 
zGP-%=t$PPhj;_H-lz?_2S4=gmr!oo6rAWbS|DPL)DefILtOPEC2$&9AN^y1iY~$*d zLlhMd5fIPYH6jk@QqSi~L&@tA&s~R(!)JWmOoBv>9BG#%GU}Abq8T7f1>5OCL6&aV zwEJ;XI13hT#}PRHVcfmr_hY^6i#~vdO9pIxchoc?(3kv?>15iiKzuE^e5=U8?`1Z* z=sFsNR;QA7fZ8#n%fR7Xbcp57MTyBdnn?bY8Qg>m5y;8wS=H!0`OvxEa{4S-9c~tt z?r*ow50yepZX9S%;NMA~|90Q&956qVl*i#pB`;J2h>xVgmZI!l^|G<(gPIZM#2mQ`ejHY<@!B(<#t~A*BZ}9`M^HGufz*zF=MXnK*47soGq?=Y7^i|2Ty4aH) zl|+N(2G&YL97KEsbqd<$rMHbLIvB(_^YQ$;yjxp5veJkn)6lIb%S;DF#CDkQy<@3d zkeJz@rG^I_uiEJ)*B3-6ju$GL9oFYqR`-INP=Gy-G(C~A_w52nb~Tur;1Ws>0lIMU zxyTyxQ5~dukC~(+`L%ElTk7l8$0ox?ALaqG@-jFS;>odJvb%gClK-jqAv=f(ErG#1 z=+%)f=_`G|)qf5fQmdShb+e5lNy;AEN*?Xd@~M_m&UZcS;KVbS2w&hD#ylTy_U(`N zkyynsS9eD2`IzamW54kEEZ@eDHX?i3Kt225y+M2hZ3Nj~Xjd#V$x&4!ZBOrC?$IJL zKV8_LY=}B?(y#o|=-S^yacts?;`+z#i_kWb<@2=8J{Zv*5(iNnI8|{xYa_})#!K(> z*ILSs9@%RkK#{rlbR$Xq&RvN)#2+c`OwYJ90y;v}NBxb65((n*c)_X{5^?yG**`_R|G!BxWZ2VBtbQ zGp(av&Ix9)C!Eg9He|viPAGt2@!S1+pTVnxjhbT*--_8TwHh~VV3?a@(8w_9Y#cIK zC#4$Pg?}uE?p)ue?~k=VaYR%-sFl zdP=rzW#yw~99C?!+H~VSYQk^5m0rJb{p<&ct_VZeM_7tCb}c*_Cr$x`QN^~iAS~ma zZne(uX^exYKL1U2YwEu9k1NT`(WH>;!OH_3PT>xIQrF6i5ULLq)TrGYKPaO`4E5evPK_j@v&Ec>D~-du zJ+K1k7%7)QhiOZJYy7pVO~coo!ImhISl`es=L}4hs@Fcx#NK9xd&hhU$+3U26wzU( zP$T7l&Exi3^7Z;B`FPw_kb1O791tuW} zjC*^3%yc$%(Vxe;Od?OIGWvGlTXH!1OAuUg@N<@+ zn#0uWr_l7|ma|N}Ji1$uTO}nMVlrJdqaZ&yXYkPek`V5T&w8~(-OPV7O|ArR21AvdfOBRN&)iQ zJXc0WGk4%OOIsX{fKPUU10}X@FSfQe5K`7i_C71z%SjB%u3E(g=vD5)%T<$fjwN{f zJ155LAXj+Y-d*$J^|UGIpqH%O`#9NrV(8FhA7q+;5Ofl|=UL=K&ZBCxZO01^qaG$U zBnv&kwxiI!_4Bp7P+dg|V1M2>6^e%=x(hTnTW_LRWp-?v4}V1;;pYSQbcfy3=A!_P z@bV>x&`I~2ux6KJwuEgfcziX{*iSJ{BVoL>(6}z=b6Nt>I#|x>Y6+W{*bbeEUqeR= zO-FljhbuAW>ez%Fzp1erp{|KaUo6yPjCgZkv^ou^dMfd}9+^*JVYfb+sV&cH-3GY% zvw}(gJ5lqfpZ{o6uGCtv+Wl^i3j7s`;F%aV0UWk_+8HmAp#0lx+TG$E=rbS1nfF>w z3GMd;27d2XCv$^?y}XPX5Hc<$US$*aVe_ZkN;M2QlgZt&*vt0Hj>G5h89++x6em`Bpl!UxwQgmQ9}pF?jvjPNAm0wWlRgBL z0H8ex4S*)kvRtSR=Z1{|I{+y^=np&45?~-Z7JTR;kW*3-G11BL4!q%6q$N6o@t%Bx z2@sobS{uZ!#s#$mo033LBqZCs1 zh#v93#Rxuz#D|p%^HEEz77X+(*}uM@nur-g+8%wjWsqG73Vy}Go+^+xT)mX7?|U$p 
z%2--Em9BrfN!#Z0MHYs>HvR;2Ka4`*sA)$6DKUYZt`;U$RNFh)lH zkEmc&1c35)z3R$<1%VR^;yL*#Zhx1b7IJ1xx8#ORx>`!6*hiE$JumUKDE z2Afw?U!#p`Ah9862y?~ssVQSsi-dX0psWEcdlam13|*fjsiz$p+$Mp39piyTsmJ>+ zJweljcau3yFWU=Pek44L#zSxzKl(}7=A3-UO1WMNecujCT>OaDRvMCzMf1mfhh~Q1 zeCtc0Z^2zHty(FDMOGcz-Xsp@X68@D&U>FRY445o% zi+XY-f3>IWKHd9p!>)s&N_!|(2TICtKeBGh`JtzZ(YuBy0f#TxQhULz?-hF$pML>r zixmNg5S{M-I{@#X``+SL^_>-5K8a?c!}OCy8+M|GxL;+=gfTS~@;v#_@zv`ggbx3Zj0fE^nN+?ltx=jpo zwXR)k)w-#EIHJP=jeGxi+cWiv!J6t7B$naQl2*NCHWe0j^{v5Ys8s14GJ_A;`>$Z2#^@!?CgTnX0XmHfEKtek|MfvQ6v|ia@5q!u;tGKepwZ zoh|rph?at+_|`y6pzI{@HelVyzei$91ZegcU- z8+P#}EXeiN#%~KU@K25{+Lp7{ZK5?tSz21BYAL-kckYtYBR^1;f&U>9U(MnjN;yo+6B)J`q!i+pd4f{1fbSC={$o6G%oFP!w4c#Ve4esaHxa%^jR%|JTtV4 zdF_{R2(-km`PYjHXT`nv=^;D6Y!Cs@#4YShg1})zxf{SlYK#9=+hmirF#mV;(95PP z0Hp?-;VPC=7wH0yrczU0d?9`5TT7K|B#cAy9##gf6HAUW=^Zql4&;cflQ%E`Y|L-3 z6)KZYV-npvK+XFveU4WJ`g~eH1#Md42T&BFCqsLTfe^I<~19({dHb= zn5E?b(vz1AtGo=0Y9=#te2hO*F%+L~{HC;5M-Ul}pnwJk0F(wNBKSe|X#iRLRDqHm zPbItNc;SJVzx%kIC zmuIll`S*w95hq$sgY8ff?O#v3bRyp=7I@X=o49?y`Sd2JNN0T3uvasYaXe-`=CW2t zkjr30(S>u>E`=4PZrJah#f?Zo5@y=!oIy#gKqd$i-h1MR2w2VEvqJE7NLt5d3W1_) z^KAOxll7w#?C;jQsH5On_!6bTK~%Ew-lDO5Y{F#WH+t&?bJ08|ArL3^D+tYBY=zp zD%dWeY(+xDM{SljHv?CroxNA&4wf_bPgQKA{Fl4<(=)Zco4g-7IUIAN8fot2gqvdoBv7ugb(q_*f?ICQ^$lz(J(-vCAp^UT7ASba%Z%MPiTizNV zXrdVZc_xqM9ABmU>jtl0+sD=CV0gsgX(NQ<9Fl(Z&oFxwBN!eudqJM1BMPR}_-DFT z(^TNSs-f@R6qI;$jsfj5_~kMk|J0L;7J5ClO_4@g@IiE*Gu5L8CMb8 zB$=n*?;Lc$xFA~#F3`bMewN`>e$sbvd&^Sc6EscrSqHK_j%nb#GbNUpn#z*Q)s@R8 z?HTb#&EXU1I$$pXk~SN0MpQeFRtFv(&f!AKxNKR6o_Yx;MweU0aM80Y*O-28PjjWC zrs`oflTexeCJB!Xp9gCR8j1~2HxkQn?cwS82yjs@P>A0#NR+~G(PNQ22?dSHr=bCC z`owAqSb}l@ZG%UNm+>Y?{Prd?L~rB~{J_O{A__TxwT}Z1ZOJ?PUCg|O27V9i6Ev=- zAD}H&imMr)$~F7;^?n0VkOQIie#a~Otl3|UB7%N&4L(<8ul3VxWJx;Ry}?;rD#;FR zvRH~~e{Pzv6yo9P$RblD74#8A#gLZ$Jqto*V>qpGv%7d#seaAjNjpm(ey2HGAW}5{ zVb!!H-(@~eer3NTj~N{Ya2sSJAfLSX{oM8~gVfUw%t+iGDCs-Hwm0uL(6r{3-Y&)O zi0i8#%L`f=I;3RXZEfFWNwVo_{2;dqlxKeuzF$#43!j_{nW`@sZo21Vj;buYG~_W$ 
zi}hUQ?C6M`m%oeF)v{7q@o12aEUdoS(YG=af&igXY`2p2Pl{%E;I zgPr98iadq@vt!ygni-7~FMF8dIGgJQm^rzfNR1)Omgw&8a(rXnhCz9j_Zp50Nid?# z3`(V`yDD~t{xNff;y;5j8TTJ@R$moc$Pj zwWN(;{BZa~Hi9ZMS6z1cYYB(#u3lGi0n3|c!#S%|lfHsfb-T^kLsxp1O9JxGl&ZtJLkW4~hWZAnM)3V^-zU3G zFP+UO+IrXaJ9R^0LGv`$=NNXIwi>_c=^%1_t>AHVr(b^OaJ=jprzA>1=9MxHmL+za<`A6@F zE+<;;T?Dwt*kXtYzvMGh+08@AZEj=w+&WC8e1=!1-pgze#&c-X{N4oq+9RU=>Od&0 z4~4$A~i#|ZN^i!>P3s&=&krH#iKe=DV zYqe$>l5g+t{;mD7W5f7Ds_lzv#B^;Z4Oo9ur{2qooXwHB#hS9Mi z^B~yG@`|~UlCmhP>lupR^w2*&DRNXweLKhqXq?T!!sEdwq?d>1t({s1jZGeR~w z2zys}(>vHY32yo(K8hQ(n+!8uUk>bULFeE-^oe)l|8`MCcgytWF|bUWlX+Q@Vg%pb z@64b@f0!I<%o2ns+gathkK!)cLZjHQPHC{4%c{#aG!@v>XXck7i zRsI`z$hh=hc%e-?MiQlc&W>nsyRiBCBi#?(0nq2=+oP;d!UIxN9Vl% zbLgw+?nh$LGlO!GR_M?r0b^0Ig0*YL*)J)f0?($cP%1AlLiHa&V;8Q3 zqsIRdPo8Lwpf%C9dYD^$(+l_t23@9YutR5Wf#!peu+!h zPF1^l#q4g}TBw(Ak=N(64KJqi)r&*rNKq5h!#5Q(no?;?Uv(#?dL4W40#@Wk5DQC6 z$=O7OJPBlEW~+iizFXcc;kxH;l|xqoZEt-v!_K|`!DwHdCG?>D>tYq$6ynJ!`tVky zz-yQ-bn5EhO>M&*{DU>ahU=xx_p9@q=JIy%izP`#*PssEpMqSTh;-b&zb~%(x7(fP zq&3T5+`6k;zKJp9Wj|6%oLr2V{h5LqAQi&knibFm@sLu{m}aeUegvDE#*@4 zI&5GwuOJ*&(ep6P^2ElR0!iqpX&k*Skz2650ojdfIc_DgQl2WsPq{OPRg>y{hs<<1 zHERKbhpIxn=u`PNS*}K<^{ClgBY`FjSbwltMQIj1oX}u&H9Xxa2$U7e?TcU2d$@F8 zcw(VWv)rNJlp;9;vELN~3*vi3+?l_v!%4&E+153@Z!AHe8GxlSGzE9+lxxZ|VSAcj z6$X1$oluW{4;OvEjNRIz`w5t#?u8U&OJpD<6dWw~0xr~pgN=Y8RP@8x+zX9BnOUR}?9_3;D z-K*sD0WTmY{a1o4;!-u8w!n>Cq+`AGES`wTcawqdmD8ops+yq^Nlm%Fe>*RHGkbLl zg1Uo7&&0^4(lyZ2kGe~I!F6(*^`YW66qg^t`xs2UI6pleP!oirg!mYe9%F^vGfsY9 zB$_pAh?bh5U@Twa8Okdy;E$LkIB@fVV+loglQk`wAXrTS3~{2!i6&zVg@J|Nac<%> zWpV9P$GI)54j_c9?H#fWER5(z1yfR}S33{DC4+&{x;kHIJ;DD~{j-hna5mAOA-nOg zd}B1NV^w{^?2s8+>5jx>eY(-n<1Dfs8+SkFz`BNN9#f1w*1BN1}5v;7UbG}6=@Ytt2b ztNVUfmXko#tIgMV&ciO6Asc!Tf6!Z2jF!5ocKGHqv#v9K26Use*c4zxR@QDcq&D-N zeVIkNtFZJG!*!=_a-^73eRwQf(4iO?d8%0;nhG*}be@`^hP>PwLsz^%alem@#!?!lTKW6!F3R#=XWZPR3vS)zCTWo&G$3mL__ zuHeJ(!w~ zuE{QcJa7(4ct<F=KyIj!mo()E!CSmP;PrYv;U4w>0KHeU=V{;CBZi9q>Aa$c9^heiSodPsIO_qV) 
zsRuWpR)h~e6Ec-$02&riT*y$8ff*slz3ayO)r_Lx6h~*N1JKFJ^l7H|x~CN{JDGSn zrzHlNU0jU*y1{qa8JoRD9FDt+Ywx%%v(|He|BZr9+k`cZO4S^4>HHdC1t(8Eoopl0HpQDaHMD=I!w0Q#JK=OemuM5Q6WW}om&gED zuCQme@;!Ag+L^Of7o2qUt;H#*#K&Dr&B489%uN|BF51mwf622vD~605h{tYu3JW}X zf}|2sh1kk#XZFia6C~YI8BpVopTH7WZ{)L2<+&#oLB^D16Z&5>@gAbh8w!S(^118KrZa&X~SzgYKrhWK7!483rKU_SGfgYeBsRN zLjq^p>5Wf@s&l9m#?}j3TO{1I&c&o>4{dg3kXh)q*=49kWT<_;tOvbu!}^-iLJ}_O zIc=d&oxUDniA%B+-|v~T)oWT-S6M75?aHkyP!>aRyZ=g#lt|7!d!eY**sS?y$(sCc zu2|$$suJ?T+_V+8!`kT^Y`~3hxsFsGLp0z{465sec=T->VZgUcz zi-f0C+Ux3*dPL27b%>vp{kF5xxg$iid_l$1?=fi22jwv57tz1PEzme5Bb>-ubo#0% zg3k7i@NkZ#4N8uieS|{D5h8J6$^9TDSO*1LtaQOWf`4Wa?=kTJO=)X_C7D!eo}I-v zm)@{rcU?N!v4&>{j^4iVtafVKuD_WnBN(uhF3_+%=xgRA2(9`xc_GmIM^Cy($4fff zfJ}LgWt@pHW<#gVcgt=ka$+a)5|2ikSogHMzL}5p=TuYn!v)mg`|*%X4W|2Ot?vS= zDCzY!lv@1wweVTKbb4N}%PWRC)~7j+c2N6WsQ!AVe_dt%0eC>IIoqqq2I7`o0 zcNp+#wmTBXW6qC9&d z@dAH4%FWd84;O5&hp#2vaamOM^Cu+AkBMR5Jk@ltxufz{LO#wu>nwrZ^kMixmAUcG zx?EsKVDp9=wZ2)M$TG+Na3*FyG0^AaeO`-^6SbvTHp9J`t?sDTKFJwuZ|jwGdF$Rt zVI-aj=@3B^1b?%4h4%7xz)yt`3j6|XbE(X;Uq0*I!9}G;J3AeW8UCJ@<6n;Uzbx-0 zXNn}Tm8nvCpYVlLrO6$Y?oV}xG|Fv+H`h+*aHO{bvfZbdw7mO6_yeMY8edM4wl=!6 z{pX~HntZ4;`RC|HN$U7Whms;H{{Q{L e4wV0h Date: Fri, 4 Sep 2020 16:18:34 +0800 Subject: [PATCH 12/35] update check scripts --- bin/debug/cluster_env_check.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/debug/cluster_env_check.sh b/bin/debug/cluster_env_check.sh index e709ce81ca..9848f69c0a 100644 --- a/bin/debug/cluster_env_check.sh +++ b/bin/debug/cluster_env_check.sh @@ -18,10 +18,10 @@ cwd=$(cd `dirname $0`; pwd) source ./check_iplist.sh for ip in ${iplist[@]};do - if [[ ! -d "${EGGROLL_HOME}/bin/debug" ]]; then - echo "${EGGROLL_HOME}/bin/debug in $ip is not exist, mkdir -p ${EGGROLL_HOME}/bin/debug." - mkdir -p ${EGGROLL_HOME}/bin/debug - fi + if ! ssh -tt $user@$ip test -d "${EGGROLL_HOME}/bin/debug"; then + echo "${EGGROLL_HOME}/bin/debug in $ip is not exist, mkdir -p ${EGGROLL_HOME}/bin/debug." 
+ ssh -tt $user@$ip "mkdir -p ${EGGROLL_HOME}/bin/debug" + fi if ! ssh -tt $user@$ip test -e ${EGGROLL_HOME}/bin/debug/check_env.sh;then echo "${EGGROLL_HOME}/bin/debug/check_env.sh in $ip is not exist, scp check_env.sh to $ip:${EGGROLL_HOME}/bin/debug" From 8d559b03ebbb3ee1a1d77934a720a283d94ad840 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Fri, 4 Sep 2020 18:02:42 +0800 Subject: [PATCH 13/35] update check scripts --- ...77\347\224\250\350\257\264\346\230\216.md" | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index f5886fc7ca..3e2c415696 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -13,13 +13,13 @@ 名词解释: -| 名词 | 解释 | -| ---------------------- | -------------------------------------------------------- | -| $FATE_HOME | 通常在/data/projects/fate | -| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | -| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | -| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 | -| {需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | +| 名词 | 解释 | +| ----------------------- | -------------------------------------------------------- | +| $FATE_HOME | 通常在/data/projects/fate | +| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | +| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | +| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 | +| ${需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | @@ -72,6 +72,8 @@ cat result_env.log #### 2.3.2 跨集群版 +**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** + 1、设置环境变量: ```shell @@ -175,7 +177,7 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} 可选参数: -​ {host party_id} //当需要检查data_assess的服务是否可用时使用 +​ {host party_id} 
//当需要检查data_assess的服务是否可用时使用,若不提供该参数时不检测。 结果保存在result_server.log文件中 @@ -197,7 +199,7 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} - 检查方法: - 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无提示ERROR。 + 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。 #### 3.4.2 data_access service check(是否已安装data access) @@ -283,7 +285,7 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} ### 4.3 使用方法 -**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以** +**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** 1、设置环境变量: From 1d92718808eda0775bad7a0e5d814103b9feb983 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Fri, 4 Sep 2020 20:12:40 +0800 Subject: [PATCH 14/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../doc/Fate_cluster_install_guide_ansible.md | 65 +++++++++++++++++-- .../doc/Fate_guest_install_guide_ansible.md | 40 ++++++++++-- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index 97c40312d7..32b852e516 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -206,6 +206,8 @@ ssh app\@192.168.0.2 生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 +注意:dd执行时间较长,请耐心等待 + ``` cd /data dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 @@ -309,6 +311,35 @@ init var_files/prod init project_prod.yml ``` +### 4.4.2 证书制作配置(可选) + +``` +#证书制作 +vi /data/projects/ansible-nfate-1.*/tools/make.sh +#1、自定义证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。 +#2、如果同时部署host和guest,则修改guest_host和host_host对应IP,exchange_host不需修改。同时部署host和exchange或者同时部署guest和exchange也是同理。 +#3、如果都需要部署,则都需要修改。 + +guest_host="192.168.0.1" ---根据实际IP修改 +host_host="192.168.0.2" ---根据实际IP修改 +exchange_host="192.168.0.3" ---根据实际IP修改 + 
+#执行脚本制作证书 +cd tools +sh ./make.sh + +在keys/host,guest,exchange下会产生证书文件。 + +#拷贝证书到部署目录 +sh cp-keys.sh host guest + +证书文件会拷贝到roles/eggroll/files/keys目录 + +特别说明: +1、cp-keys.sh的两个参数可以取值为host、guest和exchange。 +2、目前脚本部署只支持2方设置证书认证。(host&guest、host&exchange、guest&exchange) +``` + #### 4.4.2 修改配置文件 **1)修改初始化主机IP** @@ -317,10 +348,13 @@ init project_prod.yml vi /data/projects/ansible-nfate-1.*/environments/prod/hosts #ansible格式配置文件 -[init] ---把需要部署的主机IP填入init组 +[fate] ---把需要部署的主机IP填入fate组 192.168.0.1 192.168.0.2 +[deploy_check] ---把执行ansible的本机IP填入deploy_check组 +192.168.0.1 + [all:vars] ansible_connection=ssh ansible_ssh_port=22 ---根据实际情况修改 @@ -355,13 +389,17 @@ host: enable: True ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.1 - port: 9370 + port: 9370 ---grpc端口 + secure_port: 9371 ---grpcs端口 pool_size: 600 ---线程池大小 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 - name: default ip: 192.168.0.3 ---exchange或者对端party rollsite IP port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 rules: ---本party自身路由配置 - name: default ip: 192.168.0.1 @@ -426,13 +464,17 @@ guest: enable: True ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.2 - port: 9370 + port: 9370 ---grpc端口 + secure_port: 9371 ---grpcs端口 pool_size: 600 ---线程池大小 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 - name: default ip: 192.168.0.3 ---exchange或者对端party rollsite IP port: 
9370 ---exchange或者对端party rollsite 端口,一般默认9370 + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 rules: ---本party自身路由配置 - name: default ip: 192.168.0.2 @@ -491,24 +533,29 @@ guest: vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_exchange exchange: - enable: True + enable: False rollsite: ips: - 192.168.0.3 port: 9370 + secure_port: 9371 ---grpcs端口 pool_size: 600 - max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 partys: ---指向各party的路由配置 - id: 10000 rules: - name: default ip: 192.168.0.1 port: 9367 + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 - id: 9999 rules: - name: default ip: 192.168.0.2 port: 9370 + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 ``` @@ -593,6 +640,8 @@ cd /data/projects/fate/python/examples/toy_example/ python run_toy_example.py 10000 10000 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" @@ -605,6 +654,8 @@ cd /data/projects/fate/python/examples/toy_example/ python run_toy_example.py 9999 9999 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 6267c7de70..cb561b9bf5 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -203,6 +203,8 @@ ssh 
app\@192.168.0.2 生产环境使用时,因内存计算需要增加128G虚拟内存,执行前需检查存储空间是否足够。 +注意:dd执行时间较长,请耐心等待 + ``` cd /data dd if=/dev/zero of=/data/swapfile128G bs=1024 count=134217728 @@ -307,6 +309,25 @@ init var_files/prod init project_prod.yml ``` +### 4.4.2 证书部署前配置(可选) + +1)联系webank获取guest端部署证书文件。 + +2)放置到部署目录 + +``` +cd /data/projects/ansible-nfate-* +mkdir -p roles/eggroll/files/keys/guest +cd roles/eggroll/files/keys/guest +把获取到证书文件解压缩并放置到此目录下,如下: +-rw-r--r-- 1 app apps 1371 Sep 4 18:07 guest-ca.pem +-rw-r--r-- 1 app apps 241 Sep 4 18:07 guest-server.key +-rw-r--r-- 1 app apps 1151 Sep 4 18:07 guest-server.pem +-rw-r--r-- 1 app apps 1371 Sep 4 18:07 host-client-ca.pem +-rw-r--r-- 1 app apps 241 Sep 4 18:07 host-client.key +-rw-r--r-- 1 app apps 1143 Sep 4 18:07 host-client.pem +``` + ### 4.4.2 修改配置文件 **1)修改初始化主机IP** @@ -315,10 +336,13 @@ init project_prod.yml vi /data/projects/ansible-nfate-1.*/environments/prod/hosts #ansible格式配置文件 -[init] ---把需要部署的主机IP填入init组 +[fate] ---把需要部署的主机IP填入fate组 192.168.0.1 192.168.0.2 +[deploy_check] ---把执行ansible的本机IP填入deploy_check组 +192.168.0.1 + [all:vars] ansible_connection=ssh ansible_ssh_port=22 ---根据实际主机ssh协议端口修改 @@ -352,13 +376,17 @@ guest: enable: True ---是否部署rollsite模块,True为部署,False为否 ips: ---IP列表,目前rollsite只支持部署到一台服务器 - 192.168.0.1 - port: 9370 ---rollsite端口 + port: 9370 ---rollsite grpc端口 + secure_port: 9371 ---rollsite grpcs端口 pool_size: 600 ---线程池大小 - max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如8G + max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 + client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口 - name: default ---名称,默认即可 - ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改 - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,和webank确认后修改 + ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改。 + port: 9370 ---exchange或者对端party rollsite 
端口,一般默认9370,和webank确认后修改。 + is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default ---本party rollsite所在主机IP和端口 ip: 192.168.0.1 @@ -492,6 +520,8 @@ cd /data/projects/fate/python/examples/toy_example/ python run_toy_example.py 9999 9999 1 ``` +注意:如果超过1分钟没输出,表示部署有问题,需要看日志进行问题定位。 + 类似如下结果表示成功: "2020-04-28 18:26:20,789 - secure_add_guest.py[line:126] - INFO: success to calculate secure_sum, it is 1999.9999999999998" From f050ae04dba13b04c177d2eae42a88c8c814bae4 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Mon, 7 Sep 2020 09:56:52 +0800 Subject: [PATCH 15/35] update check scripts --- bin/debug/server_check.py | 41 +++++++++++-------- bin/debug/server_check.sh | 32 +++++++++++++++ ...77\347\224\250\350\257\264\346\230\216.md" | 4 +- 3 files changed, 60 insertions(+), 17 deletions(-) create mode 100644 bin/debug/server_check.sh diff --git a/bin/debug/server_check.py b/bin/debug/server_check.py index 80d2f432e5..bef0dd96ec 100644 --- a/bin/debug/server_check.py +++ b/bin/debug/server_check.py @@ -14,6 +14,7 @@ # # import re +import os import sys import json import time @@ -33,6 +34,7 @@ arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +arg_parser.add_argument("-d","--partyid", type=int, help="host partyid", default=0) args = arg_parser.parse_args() def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): @@ -71,9 +73,14 @@ def get_host_ip(): fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" mem_info = {} mem_info["Ip"] = get_host_ip() - mem_info["route_table"] = query_cmd("if [ -f 
$EGGROLL_HOME/conf/route_table.json ];then array=(`cat $EGGROLL_HOME/conf/route_table.json |grep -E 'ip|port'`); echo ${array[@]}; else echo 0; fi") + eggroll_home = query_cmd("echo $EGGROLL_HOME") + route_file = eggroll_home + "/conf/route_table.json" + f = open(route_file, encoding='utf-8') + mem_info["route_table"] = json.load(f) mem_info["data_access"] = query_cmd("ps aux |grep data_access_server |grep -v grep |wc -l") - mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": 10000}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'") + if args.partyid != 0: + mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": %s}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'" %(args.partyid)) + mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0] mem_info["directory"] = query_cmd("if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi") mem_info["services"] = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) @@ -81,7 +88,6 @@ def get_host_ip(): mem_info["job_thread"] = [] mem_info["jobs"] = query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client)) mem_info["job_mem"] = [] - mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0] for job_id in mem_info["jobs"]: mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id)) mem_info["job_mem"] = query_cmd("ps aux |grep 
egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id)) @@ -101,24 +107,30 @@ def get_host_ip(): for node in result: print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") print_green("-------------default route check-------------------------------------------------------") - if node[1]["route_table"] == 0: - print_red("[ERROR] eggroll route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + route_table_dict = node[1]["route_table"] + if 'default' not in route_table_dict['route_table']: + print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") else: - print_green("[OK] eggroll route configured!") - print_green(node[1]["route_table"]) + try: + ip = route_table_dict['route_table']['default']['default'][0]['ip'] + port = route_table_dict['route_table']['default']['default'][0]['port'] + print_green("[OK] eggroll route configured!") + print_green("exchange ip:{}, exchange port:{}".format(ip, port)) + except KeyError: + print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") print_green("--------------data_access service check-------------------------------------------------") if int(node[1]["data_access"]) == 0: if int(node[1]["directory"]) == 0: print_red("[ERROR] data_access service and directory not found, please check if it is installed!") else: - print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory") + print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory") else: + print_green("[OK] Installed and running data_access service!") + if args.partyid != 0: if int(node[1]["data_num"]) == 0 or int(node[1]["data_num"]) == 201: - print_green(node[1]["data_test"]) - 
print_green("[OK] Installed and running data_access service!") + print_green("[OK] Route verification success!") else: - print_yellow(node[1]["data_test"]) print_yellow("[WARNING] data_access service not available, please check host and host route!") print_green("--------------fate service check-------------------------------------------------------") @@ -136,7 +148,7 @@ def get_host_ip(): print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"]) if int(node[1]["job_run"]) > 0: for job_id in node[1]["jobs"].split(" "): - print_green("[OK] running task job_id : " + job_id + " run " + str(node[1]["job_thread"]) + " processes; used memory : " + str(node[1]["job_mem"]) + "KB.") + print_green("[OK] running task job_id : " + job_id + ", number of egg_pair processes is : " + str(node[1]["job_thread"]) + "; used memory : " + str(node[1]["job_mem"]) + "KB.") print("\n") finally: @@ -150,8 +162,5 @@ def get_host_ip(): while 1: check_actual_max_threads() time.sleep(args.time) -s() - else: - while 1: - check_actual_max_threads() +eads() time.sleep(args.time) diff --git a/bin/debug/server_check.sh b/bin/debug/server_check.sh new file mode 100644 index 0000000000..56cc20416c --- /dev/null +++ b/bin/debug/server_check.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +cwd=$(cd `dirname $0`; pwd) +if [ ! 
-f 'result_server.log' ];then +touch result_server.log +fi + +nodes=$1 +party=$2 +LogLevel=$EGGROLL_LOG_LEVEL +export EGGROLL_LOG_LEVEL=INFO +if [ -n "$party" ];then + python server_check.py -p $nodes -d $party >> result_server.log +else + python server_check.py -p $nodes >> result_server.log +fi +export EGGROLL_LOG_LEVEL=$LogLevel +echo "Check the result in the current directory, Please execute command: cat result_server.log" diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 3e2c415696..efaceb5219 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -177,7 +177,7 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} 可选参数: -​ {host party_id} //当需要检查data_assess的服务是否可用时使用,若不提供该参数时不检测。 +​ {host party_id} //当需要检查data_assess的服务是否可用时使用。若不提供该参数时不检测。 结果保存在result_server.log文件中 @@ -200,6 +200,8 @@ sh server_check.sh ${集群内节点个数} ${host party_id(可选)} - 检查方法: 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。 + + #### 3.4.2 data_access service check(是否已安装data access) From e36586b43d8c84f270a26d7a9f9bf5e87c0262d0 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Mon, 7 Sep 2020 16:57:57 +0800 Subject: [PATCH 16/35] update check scripts --- bin/debug/server_check.py | 2 -- ...\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/debug/server_check.py b/bin/debug/server_check.py index bef0dd96ec..d39eda417d 100644 --- a/bin/debug/server_check.py +++ b/bin/debug/server_check.py @@ -162,5 +162,3 @@ def get_host_ip(): while 1: check_actual_max_threads() time.sleep(args.time) -eads() - time.sleep(args.time) diff --git 
"a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index efaceb5219..333988af64 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -89,8 +89,10 @@ vi check_iplist.sh 参数说明: -​ user=app <远程登录用户名> -​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` 3、执行检测脚本: From 4dfa530fe8a278bae4d46846b6a2c12a0692f618 Mon Sep 17 00:00:00 2001 From: zzzcq <1270934223@qq.com> Date: Mon, 7 Sep 2020 17:12:32 +0800 Subject: [PATCH 17/35] update check scripts --- ...4\275\277\347\224\250\350\257\264\346\230\216.md" | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 333988af64..2038034f8b 100644 --- "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -306,8 +306,10 @@ vi check_iplist.sh 参数说明: -​ user=app <远程登录用户名> -​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` 3、执行检测脚本: @@ -360,8 +362,10 @@ vi check_iplist.sh 参数说明: -​ user=app <远程登录用户名> -​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +```shell +user=app <远程登录用户名> +iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> +``` 3、然后执行脚本: From 0cd730bfdffcced6b4f3d599f95b458b80fd5031 Mon Sep 17 00:00:00 2001 From: wzh <15779896112@163.com> Date: Tue, 8 Sep 2020 09:31:21 +0800 Subject: [PATCH 18/35] fix db bug --- fate_flow/db/db_models.py | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fate_flow/db/db_models.py b/fate_flow/db/db_models.py index 46ecd4a515..abb8ffb488 100644 --- a/fate_flow/db/db_models.py +++ b/fate_flow/db/db_models.py @@ -186,8 +186,8 @@ class DataView(DataBaseModel): f_task_id = CharField(max_length=100) f_type = CharField(max_length=50, null=True) f_ttl = IntegerField(default=0) - f_party_model_id = CharField(max_length=100, null=True) - f_model_version = CharField(max_length=100, null=True) + f_party_model_id = CharField(max_length=200, null=True) + f_model_version = CharField(max_length=200, null=True) f_size = BigIntegerField(default=0) f_description = TextField(null=True, default='') f_tag = CharField(max_length=50, null=True, index=True, default='') From 6491bc9d1d2bf3cd9cec9abced0ef4431354eff2 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Tue, 8 Sep 2020 10:14:26 +0800 Subject: [PATCH 19/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cluster-deploy/README.md | 8 ++++---- cluster-deploy/README.rst | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cluster-deploy/README.md b/cluster-deploy/README.md index 2aeca269f8..53fff6d7cb 100644 --- a/cluster-deploy/README.md +++ b/cluster-deploy/README.md @@ -199,10 +199,10 @@ Execute under the app user of the target server (192.168.0.1 has an external net ``` mkdir -p /data/projects/install cd /data/projects/install -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz -wget 
https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #Send to 192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -355,7 +355,7 @@ sh Miniconda3-4.5.4-Linux-x86_64.sh -b -p /data/projects/fate/common/miniconda3 tar xvf pip-packages-fate-*.tar.gz source /data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl -pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index +pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #The result should be 161 ``` diff --git a/cluster-deploy/README.rst b/cluster-deploy/README.rst index f432d8fbba..40ec8c672f 100644 --- a/cluster-deploy/README.rst +++ b/cluster-deploy/README.rst @@ -239,10 +239,10 @@ external network environment): mkdir -p /data/projects/install cd /data/projects/install - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/python-env-1.4.5-release.tar.gz wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/jdk-8u192-linux-x64.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.3-release.tar.gz - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.3-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/mysql-1.4.5-release.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/FATE_install_1.4.5-release.tar.gz #Send to 192.168.0.2和192.168.0.3 scp *.tar.gz app@192.168.0.2:/data/projects/install @@ -400,7 +400,7 @@ external network environment): tar xvf pip-packages-fate-*.tar.gz source 
/data/projects/fate/common/python/venv/bin/activate pip install setuptools-42.0.2-py2.py3-none-any.whl - pip install -r pip-packages-fate-1.4.3/requirements.txt -f ./pip-packages-fate-1.4.3 --no-index + pip install -r pip-packages-fate-1.4.5/requirements.txt -f ./pip-packages-fate-1.4.5 --no-index pip list | wc -l #The result should be 161 From c0e1eca81ff014ef73dfa1805bd66a97da212f18 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Wed, 16 Sep 2020 10:27:56 +0800 Subject: [PATCH 20/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cluster-deploy/doc/Fate_cluster_install_guide_ansible.md | 4 ++-- cluster-deploy/doc/Fate_guest_install_guide_ansible.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index 32b852e516..ef98144933 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -398,8 +398,8 @@ host: default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 - name: default ip: 192.168.0.3 ---exchange或者对端party rollsite IP - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 - is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default ip: 192.168.0.1 diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index cb561b9bf5..e055e37704 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ 
b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -385,8 +385,8 @@ guest: default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口 - name: default ---名称,默认即可 ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改。 - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,和webank确认后修改。 - is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可。 + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;和webank确认后修改。 + is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default ---本party rollsite所在主机IP和端口 ip: 192.168.0.1 From 4291a7a330fe214d33d4b25402b1a78eedf55421 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Thu, 17 Sep 2020 10:03:43 +0800 Subject: [PATCH 21/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../doc/Fate_cluster_install_guide_ansible.md | 111 +++++++++++------- .../doc/Fate_guest_install_guide_ansible.md | 61 ++++++---- 2 files changed, 104 insertions(+), 68 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index ef98144933..dafe33e2f7 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -40,10 +40,12 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 本示例是每端只有一台主机,每端可以多台主机,目前只支持nodemanager多节点部署,其他组件都是单节点。 -| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 | -| ----- | ------- | --------------------- | ----------------------- | -------- | ---- | ------------------------------------------------------------ | -| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql 
| -| guest | 9999 | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 部署模块 | +| ----- | ------- | --------------------- | ---------- | -------- | ---- | ------------------------------------------------------------ | +| host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | +| guest | 9999 | 192.168.0.2 | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | + +备注:涉及exchange说明会用192.168.0.3表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 @@ -51,7 +53,7 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 | -------- | ------------------------------------------------------------ | | 主机配置 | 不低于8C16G500G,千兆网卡 | | 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | -| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor | | 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | | 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | | 虚拟内存 | 不低于128G | @@ -100,8 +102,6 @@ vim /etc/hosts centos系统执行:rpm -qa | grep selinux -ubuntu系统执行:apt list --installed | grep selinux - 如果已安装了selinux就执行:setenforce 0 3.3 修改Linux系统参数 @@ -109,22 +109,32 @@ ubuntu系统执行:apt list --installed | grep selinux **在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** -1)vim /etc/security/limits.conf +1)清理20-nproc.conf文件 + +cd /etc/security/limits.d + +ls -lrt 20-nproc.conf + +存在则:mv 20-nproc.conf 20-nproc.conf_bak + +2)vim /etc/security/limits.conf \* soft nofile 65535 \* hard nofile 65535 -2)vim /etc/security/limits.d/20-nproc.conf +\* soft nproc 65535 -\* soft nproc unlimited +\* hard nproc 65535 + +重新登陆,ulimit -a查看是否生效 3.4 关闭防火墙 -------------- **在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** -如果是Centos系统: +Centos系统: systemctl disable firewalld.service @@ -132,12 +142,6 @@ systemctl stop firewalld.service systemctl status firewalld.service -如果是Ubuntu系统: - -ufw disable - -ufw status - 3.5 软件环境初始化 ------------------ @@ -165,6 +169,8 @@ Defaults !env_reset **3)配置ssh无密登录** +**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。** + **a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** su app @@ -297,9 +303,6 @@ tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz ``` cd ansible-nfate-* #init.sh文件不需要修改,主要是辅助生成一些配置文件 - -#测试环境加test参数执行 - sh ./tools/init.sh test #生产环境加prod参数执行 sh ./tools/init.sh prod @@ -313,31 +316,40 @@ init project_prod.yml ### 4.4.2 证书制作配置(可选) +1)证书制作 + ``` -#证书制作 vi /data/projects/ansible-nfate-1.*/tools/make.sh -#1、自定义证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。 -#2、如果同时部署host和guest,则修改guest_host和host_host对应IP,exchange_host不需修改。同时部署host和exchange或者同时部署guest和exchange也是同理。 -#3、如果都需要部署,则都需要修改。 + +#1、自定义安全证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。 +#2、安全证书支持如下部署方式: + 1)host和guest同时部署。 + 2)host和exchange同时部署。 + 3)guest和exchange同时部署。 guest_host="192.168.0.1" ---根据实际IP修改 host_host="192.168.0.2" ---根据实际IP修改 -exchange_host="192.168.0.3" ---根据实际IP修改 +exchange_host="192.168.0.3" ---根据实际IP修改,本示例不部署无需修改 +``` + +2)执行脚本制作证书 -#执行脚本制作证书 +``` cd tools sh ./make.sh -在keys/host,guest,exchange下会产生证书文件。 +在keys/host,guest目录下会产生证书文件。 +``` + +3)拷贝证书到部署目录 -#拷贝证书到部署目录 +``` sh cp-keys.sh host guest 证书文件会拷贝到roles/eggroll/files/keys目录 特别说明: -1、cp-keys.sh的两个参数可以取值为host、guest和exchange。 -2、目前脚本部署只支持2方设置证书认证。(host&guest、host&exchange、guest&exchange) +1、目前脚本部署只支持2方设置证书认证。(host&guest、host&exchange、guest&exchange) ``` #### 4.4.2 修改配置文件 @@ -378,6 +390,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改host方参数** +**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署host方则不用修改 #除了nodemanger可以设置多个IP外,其他都是单个IP @@ -395,10 +409,10 @@ host: max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 - default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.3 ---exchange或者对端party rollsite IP - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + ip: 
192.168.0.2 ---对端party rollsite IP + port: 9370 ---对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default @@ -453,6 +467,8 @@ host: **4)修改guest参数** +**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署guest方则不用修改 #除了nodemanger可以设置多个IP外,其他都是单个IP @@ -470,10 +486,10 @@ guest: max_memory: ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 - default_rules: ---本party指向exchange或者其他party的IP,端口路由配置 + default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.3 ---exchange或者对端party rollsite IP - port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370 + ip: 192.168.0.1 ---对端party rollsite IP + port: 9370 ---对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 rules: ---本party自身路由配置 - name: default @@ -528,12 +544,14 @@ guest: **5)修改exchange参数** +**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #不部署exchange则不需要修改 vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_exchange exchange: - enable: False + enable: False --部署exchange需修改为True rollsite: ips: - 192.168.0.3 @@ -548,18 +566,16 @@ exchange: rules: - name: default ip: 192.168.0.1 - port: 9367 + port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371; is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 - id: 9999 rules: - name: default ip: 192.168.0.2 - port: 9370 + port: 9370 ---对应party rollsite 端口,一般默认9370,即无安全证书通讯;如需开启安全证书通信,应设置为9371; is_secure: False 
---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 ``` - - ### 4.5 部署 按照上述配置含义修改对应的配置项后,然后执行部署脚本: @@ -567,11 +583,10 @@ exchange: ``` #相对ansible-nfate-*目录 cd /data/projects/ansible-nfate-1.* -#测试环境加test参数执行 - nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & #生产环境加prod参数执行 nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & + ``` 部署日志输出在logs目录下,实时查看是否有报错: @@ -598,6 +613,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件 ---/data/projects/common目录存在,需要mv。 ``` +fateflow部署完重启: + +``` +#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow +``` + ### 4.6 问题定位 1)eggroll日志 diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index e055e37704..117a731feb 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -37,18 +37,20 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 ## 2.1.部署规划 -| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 | -| ----- | ---------------------- | --------------------- | ----------------------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- | -| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2/Ubuntu 16.04 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | -| guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2/Ubuntu 16.04 | 16C32G | 2T | | | nodemanger | +| role | partyid | IP地址 | 操作系统 | 主机配置 | 存储 | 外网IP | 外网带宽 | 部署模块 | +| ----- | ---------------------- | --------------------- | ---------- | -------- | ---- | ----------- | -------- | ----------------------------------------------------- | +| guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | +| guest | 9999(根据实际规划修改) | 
192.168.0.2 | CentOS 7.2 | 16C32G | 2T | | | nodemanger | + +备注:涉及exchange说明会用192.168.0.3表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 | **类别** | **说明** | | -------- | ------------------------------------------------------------ | | 主机配置 | 不低于8C16G500G,千兆网卡 | -| 操作系统 | CentOS linux 7.2及以上同时低于8/Ubuntu 16.04 或 Ubuntu 18.04 | -| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor
#ubuntu
gcc g++ make openssl supervisor ansible jq libgmp-dev libmpfr-dev libmpc-dev
libaio libaio-dev numactl autoconf automake libtool libffi-dev ansible jq supervisor
cd /usr/lib/x86_64-linux-gnu
if [ ! -f "libssl.so.10" ];then
ln -s libssl.so.1.0.0 libssl.so.10
ln -s libcrypto.so.1.0.0 libcrypto.so.10
fi | +| 操作系统 | CentOS linux 7.2及以上同时低于8 | +| 依赖包 | 需要安装如下依赖包:
#centos
gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio
numactl autoconf automake libtool libffi-devel ansible jq supervisor | | 用户 | 用户:app,属主:apps(app用户需可以sudo su root而无需密码) | | 文件系统 | 1、数据盘挂载在/data目录下。
2、创建/data/projects目录,目录属主为:app:apps。
3、根目录空闲空间不低于20G。 | | 虚拟内存 | 不低于128G | @@ -97,8 +99,6 @@ vim /etc/hosts centos系统执行:rpm -qa | grep selinux -ubuntu系统执行:apt list --installed | grep selinux - 如果已安装了selinux就执行:setenforce 0 3.3 修改Linux系统参数 @@ -106,35 +106,37 @@ ubuntu系统执行:apt list --installed | grep selinux **在目标服务器(192.168.0.1 192.168.0.2)root用户下执行:** -1)vim /etc/security/limits.conf +1)清理20-nproc.conf文件 + +cd /etc/security/limits.d + +ls -lrt 20-nproc.conf + +存在则:mv 20-nproc.conf 20-nproc.conf_bak + +2)vim /etc/security/limits.conf \* soft nofile 65535 \* hard nofile 65535 -2)vim /etc/security/limits.d/20-nproc.conf +\* soft nproc 65535 + +\* hard nproc 65535 -\* soft nproc unlimited +重新登陆,ulimit -a查看是否生效 3.4 关闭防火墙 -------------- **在目标服务器(192.168.0.1 192.168.0.2 )root用户下执行** -如果是Centos系统: - systemctl disable firewalld.service systemctl stop firewalld.service systemctl status firewalld.service -如果是Ubuntu系统: - -ufw disable - -ufw status - 3.5 软件环境初始化 ------------------ @@ -162,6 +164,8 @@ Defaults !env_reset **3)配置ssh无密登录** +**注意:192.168.0.1不但需要可以免密登陆192.168.0.2,也需要可以免密登陆自身,配置完后务必手工ssh连接下自身和192.168.0.2,确认下认证信息。** + **a. 
在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** su app @@ -295,9 +299,6 @@ tar xzf ansible_nfate_1.4.5_release-1.0.0.tar.gz ``` cd ansible-nfate-* #init.sh文件不需要修改,主要是辅助生成一些配置文件 - -#测试环境加test参数执行 - sh ./tools/init.sh test #生产环境加prod参数执行 sh ./tools/init.sh prod @@ -366,6 +367,8 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改guest参数** +**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 + ``` #除了nodemanger可以设置多个IP外,其他都是单个IP vi /data/projects/ansible-nfate-1.*/var_files/prod/fate_guest @@ -379,7 +382,7 @@ guest: port: 9370 ---rollsite grpc端口 secure_port: 9371 ---rollsite grpcs端口 pool_size: 600 ---线程池大小 - max_memory: 8G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 + max_memory: 12G ---rollsite进程JVM内存参数,默认是物理内存的1/4,可根据实际情况设置,如12G,如果是rollsite专用的机器,配置成物理内存的75%。 server_secure: False ---作为服务端,开启安全证书验证,不使用安全证书默认即可 client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口 @@ -447,8 +450,6 @@ guest: ``` #相对ansible-nfate-*目录 cd /data/projects/ansible-nfate-1.* -#测试环境加test参数执行 - nohup sh ./boot.sh test -D > logs/boot.log 2>&1 & #生产环境加prod参数执行 nohup sh ./boot.sh prod -D > logs/boot.log 2>&1 & @@ -478,6 +479,16 @@ tail -f ansible.log (实时查看部署情况,如果没有这个日志文件 ---/data/projects/common目录存在,需要mv。 ``` +fateflow部署完重启: + +``` +#因为fate_flow依赖的组件比较多,可能启动会有异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow +``` + ## 4.6 问题定位 1)eggroll日志 From d0604836323b34556b784384d0471a7b196d729d Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Thu, 17 Sep 2020 10:49:19 +0800 Subject: [PATCH 22/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../doc/Fate_cluster_install_guide_ansible.md | 25 +++++++++++-------- .../doc/Fate_guest_install_guide_ansible.md | 21 ++++++++++------ 
2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index dafe33e2f7..39ca8ff078 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -45,7 +45,7 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 | host | 10000 | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | | guest | 9999 | 192.168.0.2 | CentOS 7.2 | 8C16G | 500G | fate_flow,fateboard,clustermanager,nodemanger,rollsite,mysql | -备注:涉及exchange说明会用192.168.0.3表示其IP,但本次示例不涉及exchange的部署。 +备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 @@ -223,18 +223,21 @@ cat /proc/swaps echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` -## 3.7 安装ansible +## 3.7 安装依赖包 -**目标服务器(192.168.0.1) root用户执行** +**目标服务器(192.168.0.1 192.168.0.2)root用户执行** ``` -#判断是否已安装ansible -ansible --version -#没有则执行 -yum install -y ansible -``` - +#安装基础依赖包 +yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel +#如果有报错,需要解决yum源问题。 +#安装ansible和进程管理依赖包 +yum install -y ansible jq supervisor +#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行: +yum install -y epel-release +#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor +``` 4.项目部署 ========== @@ -329,7 +332,7 @@ vi /data/projects/ansible-nfate-1.*/tools/make.sh guest_host="192.168.0.1" ---根据实际IP修改 host_host="192.168.0.2" ---根据实际IP修改 -exchange_host="192.168.0.3" ---根据实际IP修改,本示例不部署无需修改 +exchange_host="192.168.0.88" ---根据实际IP修改,本示例不部署无需修改 ``` 2)执行脚本制作证书 @@ -554,7 +557,7 @@ exchange: enable: False --部署exchange需修改为True rollsite: ips: - - 192.168.0.3 + - 192.168.0.88 port: 9370 secure_port: 9371 ---grpcs端口 pool_size: 600 diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 
117a731feb..38101cb4a1 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -42,7 +42,7 @@ Eggroll 是一个适用于机器学习和深度学习的大规模分布式架构 | guest | 9999(根据实际规划修改) | 192.168.0.1 (有外网) | CentOS 7.2 | 8C16G | 500G | xx.xx.xx.xx | >=20Mb | fate_flow,fateboard,clustermanager,rollsite,mysql | | guest | 9999(根据实际规划修改) | 192.168.0.2 | CentOS 7.2 | 16C32G | 2T | | | nodemanger | -备注:涉及exchange说明会用192.168.0.3表示其IP,但本次示例不涉及exchange的部署。 +备注:涉及exchange说明会用192.168.0.88表示其IP,但本次示例不涉及exchange的部署。 ## 2.2.主机资源和操作系统要求 @@ -218,15 +218,20 @@ cat /proc/swaps echo '/data/swapfile128G swap swap defaults 0 0' >> /etc/fstab ``` -## 3.7 安装ansible +## 3.7 安装依赖包 -**目标服务器(192.168.0.1) root用户执行** +**目标服务器(192.168.0.1 192.168.0.2) root用户执行** ``` -#判断是否已安装ansible -ansible --version -#没有则执行 -yum install -y ansible +#安装基础依赖包 +yum install -y gcc gcc-c++ make openssl-devel gmp-devel mpfr-devel libmpc-devel libaio numactl autoconf automake libtool libffi-devel +#如果有报错,需要解决yum源问题。 + +#安装ansible和进程管理依赖包 +yum install -y ansible jq supervisor +#如果有报错同时服务器有外网,没有外网的需要解决yum源不全的问题,执行: +yum install -y epel-release +#增加一个更全面的第三方的源,然后再重新安装ansible jq supervisor ``` 4 项目部署 @@ -387,7 +392,7 @@ guest: client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---默认路由,本party指向exchange或者其他party的IP,端口 - name: default ---名称,默认即可 - ip: 192.168.0.3 ---exchange或者对端party rollsite IP,和webank确认后修改。 + ip: 192.168.0.88 ---exchange或者对端party rollsite IP,和webank确认后修改。 port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371;和webank确认后修改。 is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 From 52701beaf5f549d26b3718d926fea455d83e6cca Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Thu, 17 Sep 2020 11:44:25 +0800 Subject: [PATCH 23/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../doc/Fate_cluster_install_guide_ansible.md | 20 +++++++++++++------ .../doc/Fate_guest_install_guide_ansible.md | 8 +++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index 39ca8ff078..3e2551e801 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -326,9 +326,9 @@ vi /data/projects/ansible-nfate-1.*/tools/make.sh #1、自定义安全证书需同时部署两端,只部署一端需要手工处理证书,手工处理部分暂不介绍。 #2、安全证书支持如下部署方式: - 1)host和guest同时部署。 - 2)host和exchange同时部署。 - 3)guest和exchange同时部署。 + 1)部署host+guest,host和guest使用安全证书通讯。 + 2)部署host+exchange+guest,其中host和exchange使用安全证书通讯,guest和exchange普通通讯。 + 3)部署host+exchange+guest,其中guest和exchange使用安全证书通讯,host和exchange普通通讯。 guest_host="192.168.0.1" ---根据实际IP修改 host_host="192.168.0.2" ---根据实际IP修改 @@ -393,7 +393,7 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改host方参数** -**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 ``` #不部署host方则不用修改 @@ -470,7 +470,7 @@ host: **4)修改guest参数** -**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 ``` #不部署guest方则不用修改 @@ -547,7 +547,7 @@ guest: **5)修改exchange参数** -**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 ``` #不部署exchange则不需要修改 @@ -765,6 +765,12 @@ cd /data/projects/common/supervisord ``` sh service.sh start/stop/status all + +#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: +netstat 
-tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow ``` 启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): @@ -773,6 +779,8 @@ sh service.sh start/stop/status all sh service.sh start/stop/status fate-clustermanager ``` + + ## 6.2 查看进程和端口 **在目标服务器(192.168.0.1 192.168.0.2 )app用户下执行** diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 38101cb4a1..693149446e 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -372,7 +372,7 @@ deploy_mode: "install" ---默认为空,修改为install,表示新部署 **3)修改guest参数** -**注意:启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 +**注意:默认是不启用安全证书的配置,如果启用安全证书通讯需把server_secure,client_secure,is_secure设置为true,以及is_secure对应的port设置为9371**。 ``` #除了nodemanger可以设置多个IP外,其他都是单个IP @@ -619,6 +619,12 @@ cd /data/projects/common/supervisord ``` sh service.sh start/stop/status all + +#说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: +netstat -tlnp | grep 9360 +如果没有端口则重起fateflow: +sh service.sh stop fate-fateflow +sh service.sh start fate-fateflow ``` 启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): From d10289acba951f6c976077f87284cac7c3f06164 Mon Sep 17 00:00:00 2001 From: zengjice Date: Thu, 17 Sep 2020 17:44:15 +0800 Subject: [PATCH 24/35] update version --- .gitmodules | 2 +- fate.env | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 8d32dbed61..f9182f0090 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,4 +5,4 @@ [submodule "eggroll"] path = eggroll url = https://github.com/WeBankFinTech/eggroll.git - branch = release-2.0.2-build-5 + branch = release-2.0.2-build-6 diff --git a/fate.env b/fate.env index e46d4e96e7..0f3970743b 100755 --- a/fate.env +++ b/fate.env @@ -1,4 +1,4 @@ -FATE=1.4.4 +FATE=1.4.5 CENTOS=7.2 
UBUNTU=16.04 PYTHON=3.6.5 From e9b7a550be0d861d035542f01eb32d399797a231 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Fri, 18 Sep 2020 15:53:40 +0800 Subject: [PATCH 25/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cluster-deploy/doc/Fate_cluster_install_guide_ansible.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index 3e2551e801..e2d2dfeaa5 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -414,8 +414,8 @@ host: client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.2 ---对端party rollsite IP - port: 9370 ---对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + ip: 192.168.0.2 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; is_secure: False ---是否使用安全认证通讯;需要结合server_secure或者client_secure使用,当三者都为true时,表示和下一跳rollsite使用安全认证通讯,同时上一个参数port需设置为9371;不使用安全证书默认即可。 rules: ---本party自身路由配置 - name: default @@ -491,8 +491,8 @@ guest: client_secure: False ---作为客户端,使用证书发起安全请求,不使用安全证书默认即可 default_rules: ---本party指向exchange或者其他party的IP、端口路由配置 - name: default - ip: 192.168.0.1 ---对端party rollsite IP - port: 9370 ---对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; + ip: 192.168.0.1 ---exchange或者对端party rollsite IP + port: 9370 ---exchange或者对端party rollsite 端口,一般默认9370,即无安全证书部署;如需开启安全证书通信,应设置为9371; is_secure: False ---server_secure或者client_secure为true,指向的下一跳rollsite也开启了安全认证,此参数需要设置为true,上一个参数port需设置为9371,不使用安全证书默认即可 rules: ---本party自身路由配置 - name: default From 0049f3d877d806ca6b1a7a25997e51f394d3b9a9 Mon Sep 17 00:00:00 2001 From: paulbaogang 
<1111@qq.com> Date: Fri, 18 Sep 2020 16:36:02 +0800 Subject: [PATCH 26/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cluster-deploy/doc/Fate_cluster_install_guide_ansible.md | 7 ++++--- cluster-deploy/doc/Fate_guest_install_guide_ansible.md | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index e2d2dfeaa5..467139d519 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -253,16 +253,16 @@ yum install -y epel-release **在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** ``` -#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置 cat /proc/swaps Filename Type Size Used Priority /data/swapfile128G file 134217724 384 -1 -#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置 ulimit -n 65535 -#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +#用户进程数,不低于64000,如不满足需参考3.3章节重新设置 ulimit -u 65535 @@ -272,6 +272,7 @@ ps -ef| grep -i fate netstat -tlnp | grep 4670 netstat -tlnp | grep 4671 netstat -tlnp | grep 9370 +netstat -tlnp | grep 9371 netstat -tlnp | grep 9360 netstat -tlnp | grep 8080 netstat -tlnp | grep 3306 diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 693149446e..54e70c3528 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -249,16 +249,16 @@ yum install -y epel-release **在目标服务器(192.168.0.1 192.168.0.2)app用户下执行** ``` -#虚拟内存,size不低于128G,如不满足需参考4.6章节重新设置 +#虚拟内存,size不低于128G,如不满足需参考3.6章节重新设置 cat /proc/swaps Filename Type Size Used Priority /data/swapfile128G file 134217724 384 -1 -#文件句柄数,不低于65535,如不满足需参考4.3章节重新设置 +#文件句柄数,不低于65535,如不满足需参考3.3章节重新设置 ulimit -n 
65535 -#用户进程数,不低于64000,如不满足需参考4.3章节重新设置 +#用户进程数,不低于64000,如不满足需参考3.3章节重新设置 ulimit -u 65535 @@ -268,6 +268,7 @@ ps -ef| grep -i fate netstat -tlnp | grep 4670 netstat -tlnp | grep 4671 netstat -tlnp | grep 9370 +netstat -tlnp | grep 9371 netstat -tlnp | grep 9360 netstat -tlnp | grep 8080 netstat -tlnp | grep 3306 From 07a78479ae474d93f9d016f597913b325e9985f2 Mon Sep 17 00:00:00 2001 From: paulbaogang <1111@qq.com> Date: Tue, 22 Sep 2020 10:07:35 +0800 Subject: [PATCH 27/35] =?UTF-8?q?1=E3=80=81add=20deploy=20doc=20for=20vers?= =?UTF-8?q?ion=201.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cluster-deploy/doc/Fate_cluster_install_guide_ansible.md | 9 +++++---- cluster-deploy/doc/Fate_guest_install_guide_ansible.md | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md index 467139d519..a716e6ca02 100644 --- a/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_cluster_install_guide_ansible.md @@ -762,10 +762,11 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以 cd /data/projects/common/supervisord ``` -启动/关闭/查看所有: +启动/关闭/重启/查看所有: ``` -sh service.sh start/stop/status all +#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件 +sh service.sh start/stop/restart/status all #说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: netstat -tlnp | grep 9360 @@ -774,10 +775,10 @@ sh service.sh stop fate-fateflow sh service.sh start fate-fateflow ``` -启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): +启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): ``` -sh service.sh start/stop/status fate-clustermanager +sh service.sh start/stop/rsstart/status fate-clustermanager ``` diff --git a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md index 
54e70c3528..9312f7e0bc 100644 --- a/cluster-deploy/doc/Fate_guest_install_guide_ansible.md +++ b/cluster-deploy/doc/Fate_guest_install_guide_ansible.md @@ -616,10 +616,11 @@ Fateboard是一项Web服务。如果成功启动了fateboard服务,则可以 cd /data/projects/common/supervisord ``` -启动/关闭/查看所有: +启动/关闭/重启/查看所有: ``` -sh service.sh start/stop/status all +#注意:因mysql是基础组件,启动较慢,建议重启操作是先停止所有组件,然后先启动mysql,再启动其他组件 +sh service.sh start/stop/restart/status all #说明:因为fateflow依赖的组件比较多,重启所有的操作可能会导致fateflow启动异常,处理如下: netstat -tlnp | grep 9360 @@ -628,10 +629,10 @@ sh service.sh stop fate-fateflow sh service.sh start fate-fateflow ``` -启动/关闭/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): +启动/关闭/重启/查看单个模块(可选:clustermanager,nodemanager,rollsite,fateflow,fateboard,mysql): ``` -sh service.sh start/stop/status fate-clustermanager +sh service.sh start/stop/restart/status fate-clustermanager ``` ## 6.2 查看进程和端口 From 9e4c5c4dedb08b67721448826275e2d962118858 Mon Sep 17 00:00:00 2001 From: zengjice Date: Thu, 24 Sep 2020 14:58:14 +0800 Subject: [PATCH 28/35] modify version --- .travis.yml | 6 +++--- standalone-deploy/README.md | 16 ++++++++-------- standalone-deploy/README.rst | 16 ++++++++-------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9054c89616..f2e6d4a696 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,9 @@ matrix: - env: OS='centos:7' script: - - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - - tar -xzf standalone-fate-master-1.4.4.tar.gz - - cd standalone-fate-master-1.4.4 + - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + - tar -xzf standalone-fate-master-1.4.5.tar.gz + - cd standalone-fate-master-1.4.5 - sed -i.bak "s/sh service.sh/bash service.sh/g" init.sh - source init.sh init - ls -alh diff --git a/standalone-deploy/README.md b/standalone-deploy/README.md index abd2990070..d7d6face52 100644 --- 
a/standalone-deploy/README.md +++ b/standalone-deploy/README.md @@ -22,11 +22,11 @@ It is strongly recommended to use docker, which greatly reduces the possibility ``` #Get code -wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz -tar -xzvf docker_standalone-fate-1.4.4.tar.gz +wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz +tar -xzvf docker_standalone-fate-1.4.5.tar.gz #Execute the command -cd docker_standalone-fate-1.4.4 +cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh ``` @@ -82,14 +82,14 @@ Http://hostip:8080. 2. Download the compressed package of stand-alone version and decompress it. ``` - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz ``` 3. Enter FATE directory and execute the init.sh. ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init ``` @@ -98,7 +98,7 @@ Http://hostip:8080. - Unit Test ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh ``` @@ -111,7 +111,7 @@ Http://hostip:8080. - Toy_example Test ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 ``` diff --git a/standalone-deploy/README.rst b/standalone-deploy/README.rst index c664644200..aa76567a07 100644 --- a/standalone-deploy/README.rst +++ b/standalone-deploy/README.rst @@ -36,11 +36,11 @@ possibility of encountering problems. 
:: #Get code - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz - tar -xzvf docker_standalone-fate-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz + tar -xzvf docker_standalone-fate-1.4.5.tar.gz #Execute the command - cd docker_standalone-fate-1.4.4 + cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh @@ -98,14 +98,14 @@ Http://hostip:8080. :: - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz 3. Enter FATE directory and execute the init.sh. :: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init 4. Test @@ -114,7 +114,7 @@ Http://hostip:8080. :: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh If success, the screen shows like blow: @@ -127,7 +127,7 @@ Http://hostip:8080. 
:: - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 If success, the screen shows like blow: From 7ceac03acf444fa16afacfd2d040d711613754bd Mon Sep 17 00:00:00 2001 From: zengjice Date: Thu, 24 Sep 2020 15:02:35 +0800 Subject: [PATCH 29/35] move debug tools into tools directory --- {bin => tools}/debug/check_conf.sh | 0 {bin => tools}/debug/check_env.sh | 0 {bin => tools}/debug/check_iplist.sh | 0 {bin => tools}/debug/cluster_env_check.sh | 0 {bin => tools}/debug/env_check.py | 0 {bin => tools}/debug/env_check.sh | 0 {bin => tools}/debug/grep_logs.sh | 0 {bin => tools}/debug/server_check.py | 0 {bin => tools}/debug/server_check.sh | 0 {bin => tools}/debug/test_env.py | 0 {bin => tools}/debug/time_check.py | 0 ...46\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename {bin => tools}/debug/check_conf.sh (100%) rename {bin => tools}/debug/check_env.sh (100%) rename {bin => tools}/debug/check_iplist.sh (100%) rename {bin => tools}/debug/cluster_env_check.sh (100%) rename {bin => tools}/debug/env_check.py (100%) rename {bin => tools}/debug/env_check.sh (100%) rename {bin => tools}/debug/grep_logs.sh (100%) rename {bin => tools}/debug/server_check.py (100%) rename {bin => tools}/debug/server_check.sh (100%) rename {bin => tools}/debug/test_env.py (100%) rename {bin => tools}/debug/time_check.py (100%) rename "bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" => "tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" (100%) diff --git a/bin/debug/check_conf.sh b/tools/debug/check_conf.sh similarity index 100% rename from bin/debug/check_conf.sh rename to tools/debug/check_conf.sh diff --git a/bin/debug/check_env.sh b/tools/debug/check_env.sh similarity index 100% rename from bin/debug/check_env.sh rename to tools/debug/check_env.sh diff --git 
a/bin/debug/check_iplist.sh b/tools/debug/check_iplist.sh similarity index 100% rename from bin/debug/check_iplist.sh rename to tools/debug/check_iplist.sh diff --git a/bin/debug/cluster_env_check.sh b/tools/debug/cluster_env_check.sh similarity index 100% rename from bin/debug/cluster_env_check.sh rename to tools/debug/cluster_env_check.sh diff --git a/bin/debug/env_check.py b/tools/debug/env_check.py similarity index 100% rename from bin/debug/env_check.py rename to tools/debug/env_check.py diff --git a/bin/debug/env_check.sh b/tools/debug/env_check.sh similarity index 100% rename from bin/debug/env_check.sh rename to tools/debug/env_check.sh diff --git a/bin/debug/grep_logs.sh b/tools/debug/grep_logs.sh similarity index 100% rename from bin/debug/grep_logs.sh rename to tools/debug/grep_logs.sh diff --git a/bin/debug/server_check.py b/tools/debug/server_check.py similarity index 100% rename from bin/debug/server_check.py rename to tools/debug/server_check.py diff --git a/bin/debug/server_check.sh b/tools/debug/server_check.sh similarity index 100% rename from bin/debug/server_check.sh rename to tools/debug/server_check.sh diff --git a/bin/debug/test_env.py b/tools/debug/test_env.py similarity index 100% rename from bin/debug/test_env.py rename to tools/debug/test_env.py diff --git a/bin/debug/time_check.py b/tools/debug/time_check.py similarity index 100% rename from bin/debug/time_check.py rename to tools/debug/time_check.py diff --git "a/bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" similarity index 100% rename from "bin/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" rename to "tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" From 97549817887a78b51f9e0bc7166c4a6fdf890349 Mon Sep 17 00:00:00 2001 From: zengjice Date: Thu, 24 Sep 2020 15:03:41 +0800 Subject: 
[PATCH 30/35] modify standalone deploy doc --- .../doc/Fate-standalone_deployment_guide_zh.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md index e79046e157..c6fa12ff0a 100644 --- a/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md +++ b/standalone-deploy/doc/Fate-standalone_deployment_guide_zh.md @@ -21,11 +21,11 @@ ``` #获取安装包 - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.4.tar.gz - tar -xzvf docker_standalone-fate-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/docker_standalone-fate-1.4.5.tar.gz + tar -xzvf docker_standalone-fate-1.4.5.tar.gz #执行部署 - cd docker_standalone-fate-1.4.4 + cd docker_standalone-fate-1.4.5 bash install_standalone_docker.sh ``` @@ -80,14 +80,14 @@ 2. 下载独立版本的压缩包并解压缩。 ``` - wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.4.tar.gz - tar -xzvf standalone-fate-master-1.4.4.tar.gz + wget https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/standalone-fate-master-1.4.5.tar.gz + tar -xzvf standalone-fate-master-1.4.5.tar.gz ``` 3. 
进入FATE目录并执行init.sh。 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 source init.sh init ``` @@ -96,7 +96,7 @@ - 单元测试 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 bash ./federatedml/test/run_test.sh ``` @@ -109,7 +109,7 @@ - Toy测试 ``` - cd standalone-fate-master-1.4.4 + cd standalone-fate-master-1.4.5 python ./examples/toy_example/run_toy_example.py 10000 10000 0 ``` From b80ff6c4a6f9c9838e1812b1c268d2ef56730f50 Mon Sep 17 00:00:00 2001 From: easongao <46415758+easson001@users.noreply.github.com> Date: Thu, 24 Sep 2020 16:45:38 +0800 Subject: [PATCH 31/35] Add files via upload delete data acess checking --- ...77\347\224\250\350\257\264\346\230\216.md" | 750 +++++++++--------- 1 file changed, 356 insertions(+), 394 deletions(-) diff --git "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index 2038034f8b..f3cbe58f1a 100644 --- "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -1,394 +1,356 @@ -# 脚本使用说明 - -## 一 概述 - -本工具集提供4个工具,功能如下: - -| 工具名称 | 工具功能 | 使用场景 | -| ---------------- | ------------------------------------------ | -------------------- | -| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完成并启动服务后 | -| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 | -| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | -| 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 | - -名词解释: - -| 名词 | 解释 | -| ----------------------- | -------------------------------------------------------- | -| $FATE_HOME | 通常在/data/projects/fate | -| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | -| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | -| ${host party_id} | 可选参数,检查data_access服务是否可用,取host方partyid值 | -| ${需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | - - - -## 二 机器基础信息检测 - -### 2.1 使用场景 - ------- 
- -此脚本在完成部署并正常启动服务后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 - -### 2.2 工具功能 - ------- - -此检测检测提供两种版本: - -- 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; - -- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,**需支持节点间免密登录**。 - -1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 - -2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比 - -3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比 - -4)检查系统最大用户进程数 - -5)检查最大文件数 - -6)检查线程数设置:检查egg pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足 - -7)检查rollsite进程堆内存是否充足: - -### 2.3 使用方法 - ------- - -#### 2.3.1 单集群版 - -```shell -source $FATE_HOME/init_env.sh -cd $EGGROLL_HOME/bin/debug -sh env_check.sh ${集群节点个数} -cat result_env.log -``` - -若对几个$开头的变量有疑问,请参考概述中的名词解释。 - -#### 2.3.2 跨集群版 - -**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** - -1、设置环境变量: - -```shell -source $FATE_HOME/init_env.sh -``` - -2、编辑配置文件: - -``` -cd $EGGROLL_HOME/bin/debug -vi check_iplist.sh -``` - -参数说明: - -```shell -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> -``` - -3、执行检测脚本: - -```python -python time_check.py -//查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息 -cat ./$ip -``` - -//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 - -``` -python time_check.py -t {检测间隔秒数,不填只检测一次} -``` - -### 2.4 检测结果说明 - ------- - -返回示例信息如下: - -*说明:以下信息分为三种提示等级:* - -*[OK\] 表示该检查项正常;* - -*[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;* - -*[ERROR\]表示该项不符合预期结果,需要按提示修改。* - -```properties -//脚本执行时间 - 2020-09-02 15:00:41.424053 -//返回的节点ip - ==============This is node 0:127.0.0.1=========================================== -//系统内存总量、系统内存使用量、系统内存使用占比 - [WARNING] MemTotal:78.51G, MemUsed:11.5G, MemUsedPCT:15% -//虚拟内存总量、虚拟内存使用量、虚拟内存使用占比,若小于128G,则提示ERROR,如下所示: - [ERROR] The swap memory is:32.0G, no less than 128G. 
<虚拟内存不足 - [WARNING] MemTotal:16.51G, MemUsed:128G, MemUsedPCT:12.3% <虚拟内存正常 -//磁盘总量、磁盘使用量、磁盘使用占比 - [WARNING] DiskTotal:984.18G, DiskUsed:566.53G, DiskUsedPCT:61% - --------------Max user processes and max file count------------------------------ -//最大用户进程数与最大文件数各个文件设置值展示,其中不满足65535的项则报[ERROR提示]: - [OK] /proc/sys/kernel/threads-max = 642956 - [OK] /etc/sysctl.conf = 1048576 - [OK] /proc/sys/kernel/pid_max = 131072 - [ERROR] please check /proc/sys/vm/max_map_count = 65530, no less than 65535. - [OK] /etc/security/limits.conf = 102401 - [OK] /etc/security/limits.d/80-nofile.conf = 131072 - [OK] /etc/sysctl.conf = 1048576 - [OK] /proc/sys/fs/file-max = 1048576 - --------------Thread count check------------------------------------------------- -//判断eggroll.properties中eggroll.rollpair.eggpair.server.executor.pool.max.size配置项设置的线程值是否充足,若不充足,则报[ERROR]提示需要调大线程值 - [OK] The thread count = 1406, the total processes = 16 * 500 = 8000 - ----------Rollsite memory use percent-------------------------------------------- -//展示rollsite进程占用堆内存与rollsite设置内存上限比值,以判断rollsite内存是否充足,若百分比偏大,则需考虑释放rollsite内存或调高rollsite内存上限 - [WARNING] rollsite memory use: 0.69% -``` - - - -## 三 fate运行信息检测 - -### 3.1 使用场景 - ------- - -跑fate任务前,检测fate运行信息。验证机器当前状态是否适合新建一个fate任务 - -### 3.2 工具功能 - ------- - -检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 - -### 3.3 使用方法 - -``` -source $FATE_HOME/init_env.sh //FATE_HOME为用户环境的fate目录 -cd $EGGROLL_HOME/bin/debug -sh server_check.sh ${集群内节点个数} ${host party_id(可选)} -例:sh server_check.sh 1 10000 -``` - -可选参数: - -​ {host party_id} //当需要检查data_assess的服务是否可用时使用。若不提供该参数时不检测。 - -结果保存在result_server.log文件中 - -### 3.4 检测结果说明 - ------- - -#### 3.4.1 default route check(eggroll路由是不是默认路由) - -- 检测通过提示: - - [OK] eggroll route configured! - - "port": 9370, "ip": "127.0.0.1" - -- 检测失败提示: - - [ERROR] eggroll route is not configured, please check /data/projects/fate/eggroll/conf/route_table.json file if it is existed! 
- -- 检查方法: - - 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。 - - - -#### 3.4.2 data_access service check(是否已安装data access) - -- 检测通过提示: - - [OK] Installed and running data_access service! - -- 检测失败提示: - - [ERROR] data_access service and directory not found, please check if it is installed! - -- 检查方法: - - 先检查data_access 进程是否存在或者目录是否存在。若存在,会进一步检查data_access 服务是否可用。详细逻辑是: - - ``` - 若返回进程数为0,判断检查服务目录的返回值,若为0,则视为没有安装access,提示ERROR;否则,则视为没有启动access,提示WARNING; - - 若返回进程数大于0,判断路由验证返回码,如果返回 "status":0,或 "status":201,则说明 DataAccess 服务以及路由表配置没有问题,否则提示WARNING检查路由设置 - ``` - -#### 3.4.3 fate service check(fate服务状态、进程数及占用内存) - -- 检测通过提醒: - - [OK] the service is running , number of processes is :; used memory: - -- 检测失败提醒: - - [WARNING] the service not running, please check service status. - -- 检查方法: - - 检查服务列表: - - 'ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql' - - 检查进程数方法: - - ``` - thread = ps -ef |grep service |grep -v grep |wc -l - ``` - - 检查服务占用内存方法: - - ``` - server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' - ``` - - -#### 3.4.4 fate_flow jobs process and mem info check(job任务数检测、job任务进程及占用内存) -- 检测通过提醒: - - [OK] Number of tasks running is xxx - - [OK] Number of tasks waiting is xxx - - [OK] running task job_id :xxx ,number of egg_pair processes is :xxx; used memory:xxx - -- 检测失败提醒: - - [ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running! 
- -- 检查方法: - - 通过FATE自带的fate_flow_client 命令查看任务相关信息,通过ps命令查看内存相关信息。 - - - -## 四 日志搜集 - -### 4.1 使用场景 - ------- - -适用于跑任务出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。 - -### 4.2 工具功能 - ------- - -拉取指定ip:$EGGROLL_HOME/logs目录下带传入关键字的目录到本机当前目录下 - -### 4.3 使用方法 - -**需支持节点间免密scp、ssh操作,也可以手动输入密码执行** - -1、设置环境变量: - -```shell -source $FATE_HOME/init_env.sh -``` - -2、编辑配置文件: - -``` -cd $EGGROLL_HOME/bin/debug -vi check_iplist.sh -``` - -参数说明: - -```shell -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> -``` - -3、执行检测脚本: - -```shell -sh grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id,支持模糊查询> -``` - -执行后该session-id的各个ip的日志便会搜集到当前目录下的$session-id/$ip目录下 - -### 4.4 结果说明 - ------- - -执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。 - - - -## 五 集群配置检测 - -### 5.1 使用场景 - ------- - -适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。 - -### 5.2 工具功能 - ------- - -拉取指定ip的eggroll.properties、route_table.json配置到本机展示。 - -### 5.3 使用方法 - -**需支持节点间免密scp、ssh操作,或手动输入密码执行也可以** - ------- - -1、设置环境变量: - -```shell -source $FATE_HOME/init_env.sh -``` - -2、编辑配置文件: - -``` -cd $EGGROLL_HOME/bin/debug -vi check_iplist.sh -``` - -参数说明: - -```shell -user=app <远程登录用户名> -iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> -``` - -3、然后执行脚本: - -```shell -sh check_conf.sh -``` - -### 5.4 结果说明 - ------- - -该脚本展示配置所有ip与本机的配置对比,说明如下: - -```properties -//展示本机eggroll.properties配置信息 -----------------------$EGGROLL_HOME/conf/eggroll.properties-------------------- -//展示本机route_table.json配置信息 ------------------------$EGGROLL_HOME/conf/route_table.json--------------------- -//展示ip列表中第一个ip配置与本机的diff结果,若为空则完全相同 -------------------diff $ip1 with ./conf/eggroll.properties------------------------- -//展示ip列表中第二个ip配置与本机的diff结果,若为空则完全相同 -------------------diff $ip2 with ./conf/eggroll.properties------------------------- -//展示ip列表中第三个ip配置与本机的diff结果,若为空则完全相同 -------------------diff $ip3 with ./conf/eggroll.properties------------------------- -``` - +# 脚本使用说明 + +## 一 概述 + 
+本工具集提供4个工具,功能如下: + +| 工具名称 | 工具功能 | 使用场景 | +| ---------------- | ------------------------------------------ | -------------------- | +| 机器基础信息检测 | 验证机器设置是否满足跑fate任务要求 | 部署完成并启动服务后 | +| fate运行信息检测 | 验证机器当前状态是否适合新建一个fate任务 | 启动fate任务前 | +| 日志搜集 | 搜集该集群下所有session_id的日志到当前目录 | 跑任务出现错误后 | +| 集群配置检测 | 搜集展示集群的配置文件信息 | 部署完成启动服务失败 | + +名词解释: + +| 名词 | 解释 | +| ----------------------- | ----------------------------------------------- | +| $FATE_HOME | 通常在/data/projects/fate | +| $EGGROLL_HOME | 通常在/data/projects/fate/eggroll | +| ${集群节点个数} | 如果运行脚本的机器所在集群有3个节点,就取3 | +| ${需要查询的session-id} | 是一个21位左右的长id。如202009031227285073491。 | + + + +## 二 机器基础信息检测 + +### 2.1 使用场景 + +------ + +此脚本在完成部署并正常启动服务后运行,脚本功能检查系统内存 / 虚拟内存 / 磁盘 / 最大用户进程数 / 文件数 / 线程数设置 / rollsite进程堆内存 等机器基础信息,用于验证机器设置是否满足跑fate任务要求。 + +### 2.2 工具功能 + +------ + +此检测检测提供两种版本: + +- 单集群版:基于eggroll服务检测,需要各个节点eggroll服务正常启动后方可使用,用于检测各个nodemanager服务所在节点的集群基础信息,其检测项包含以下所有共7项; + +- 跨集群版:无需依赖eggroll服务,可以跨节点检测指定所有ip的基础信息,其检测项仅包括以下列出前5项,需支持节点间免密登录。 + +1)检查系统内存:系统内存总量、系统内存使用量、系统内存使用占比 + +2)检查虚拟内存:虚拟内存总量、虚拟内存使用量、虚拟内存使用占比 + +3)检查磁盘使用情况:磁盘总量、磁盘使用量、磁盘使用占比 + +4)检查系统最大用户进程数 + +5)检查最大文件数 + +6)检查线程数设置:检查egg pair线程数eggroll.rollpair.eggpair.server.executor.pool.max.size设置是否充足 + +7)检查rollsite进程堆内存是否充足: + +### 2.3 使用方法 + +------ + +#### 2.3.1 完整版 + +```shell +source $FATE_HOME/init_env.sh +cd $EGGROLL_HOME/bin/debug +sh env_check.sh ${集群节点个数} +cat result_env.log +``` + +若对几个$开头的变量有疑问,请参考概述中的名词解释。 + +#### 2.3.2 简约版 + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、执行检测脚本: + +```python +python time_check.py +//查看检测结果,各个ip的检测结果生成于当前目录下以ip命名的文件,直接cat可查看对应ip的返回信息 +cat ./$ip +``` + +//若需定时检测观察内存信息变化则加-t参数,可指定间隔秒数定时输出 + +``` +python time_check.py -t {检测间隔秒数,不填只检测一次} +``` + +### 2.4 检测结果说明 + +------ + +返回示例信息如下: + +*说明:以下信息分为三种提示等级:* + +*[OK\] 表示该检查项正常;* + 
+*[WARNING\]表示该项需要注意,仅作关键信息展示,需要自行判断;* + +*[ERROR\]表示该项不符合预期结果,需要按提示修改。* + +```properties +//脚本执行时间 + 2020-09-02 15:00:41.424053 +//返回的节点ip + ==============This is node 0:127.0.0.1=========================================== +//系统内存总量、系统内存使用量、系统内存使用占比 + [WARNING] MemTotal:78.51G, MemUsed:11.5G, MemUsedPCT:15% +//虚拟内存总量、虚拟内存使用量、虚拟内存使用占比,若小于128G,则提示ERROR,如下所示: + [ERROR] The swap memory is:32.0G, no less than 128G. <虚拟内存不足 + [WARNING] MemTotal:16.51G, MemUsed:128G, MemUsedPCT:12.3% <虚拟内存正常 +//磁盘总量、磁盘使用量、磁盘使用占比 + [WARNING] DiskTotal:984.18G, DiskUsed:566.53G, DiskUsedPCT:61% + --------------Max user processes and max file count------------------------------ +//最大用户进程数与最大文件数各个文件设置值展示,其中不满足65535的项则报[ERROR提示]: + [OK] /proc/sys/kernel/threads-max = 642956 + [OK] /etc/sysctl.conf = 1048576 + [OK] /proc/sys/kernel/pid_max = 131072 + [ERROR] please check /proc/sys/vm/max_map_count = 65530, no less than 65535. + [OK] /etc/security/limits.conf = 102401 + [OK] /etc/security/limits.d/80-nofile.conf = 131072 + [OK] /etc/sysctl.conf = 1048576 + [OK] /proc/sys/fs/file-max = 1048576 + --------------Thread count check------------------------------------------------- +//判断eggroll.properties中eggroll.rollpair.eggpair.server.executor.pool.max.size配置项设置的线程值是否充足,若不充足,则报[ERROR]提示需要调大线程值 + [OK] The thread count = 1406, the total processes = 16 * 500 = 8000 + ----------Rollsite memory use percent-------------------------------------------- +//展示rollsite进程占用堆内存与rollsite设置内存上限比值,以判断rollsite内存是否充足,若百分比偏大,则需考虑释放rollsite内存或调高rollsite内存上限 + [WARNING] rollsite memory use: 0.69% +``` + + + +## 三 fate运行信息检测 + +### 3.1 使用场景 + +------ + +跑fate任务前,检测fate运行信息。验证机器当前状态是否适合新建一个fate任务 + +### 3.2 工具功能 + +------ + +检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 + +### 3.3 使用方法 + +``` +source $FATE_HOME/init_env.sh //FATE_HOME为用户环境的fate目录 +cd $EGGROLL_HOME/bin/debug +sh server_check.sh ${集群内节点个数} +例:sh server_check.sh 1 +``` + +结果保存在result_server.log文件中 
+ +### 3.4 检测结果说明 + +------ + +#### 3.4.1 default route check(eggroll路由是不是默认路由) + +- 检测通过提示: + + [OK] eggroll route configured! + + "port": 9801, "ip": "172.16.153.25" + +- 检测失败提示: + + [ERROR] eggroll route is not configured, please check /data/projects/fate/eggroll/conf/route_table.json file if it is existed! + +- 检查方法: + + 检测/data/projects/fate/eggroll/conf/route_table.json 是否有配置default参数。如果有,把ip和端口打印出来。如果无,提示ERROR。 + + +#### 3.4.2 fate service check(fate服务状态、进程数及占用内存) + +- 检测通过提醒: + + [OK] the service is running , number of processes is :; used memory: + +- 检测失败提醒: + + [WARNING] the service not running, please check service status. + +- 检查方法: + + 检查服务列表: + + 'ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql' + + 检查进程数方法: + + ``` + thread = ps -ef |grep service |grep -v grep |wc -l + ``` + + 检查服务占用内存方法: + + ``` + server_mem = ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}' + ``` + + +#### 3.4.3 fate_flow jobs process and mem info check(job任务数检测、job任务进程及占用内存) +- 检测通过提醒: + + [OK] Number of tasks running is xxx + + [OK] Number of tasks waiting is xxx + + [OK] running task job_id :xxx ,number of egg_pair processes is :xxx; used memory:xxx + +- 检测失败提醒: + + [ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running! 
+ +- 检查方法: + + 通过FATE自带的fate_flow_client 命令查看任务相关信息,通过ps命令查看内存相关信息。 + + + +## 四 日志搜集 + +### 4.1 使用场景 + +------ + +适用于跑任务出现错误后,在开发人员指导下进行错误日志搜集脚本,需要从报错日志中提取关键报错信息。 + +### 4.2 工具功能 + +------ + +拉取指定ip:$EGGROLL_HOME/logs目录下带传入关键字的目录到本机当前目录下 + +### 4.3 使用方法 + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、执行检测脚本: + +```shell +sh grep_logs.sh ${需要查询的session-id} <带上需要搜集的session-id,支持模糊查询> +``` + +执行后该session-id的各个ip的日志便会搜集到当前目录下的$session-id/$ip目录下 + +### 4.4 结果说明 + +------ + +执行完可在当前目录下看到传入的$session_id目录,目录下是各个ip的关于$session_id的日志。 + + + +## 五 集群配置检测 + +### 5.1 使用场景 + +------ + +适用于运维人员部署好项目后,肉眼检查各个机器的eggroll.properties、route_table.json配置是否存在问题。 + +### 5.2 工具功能 + +------ + +拉取指定ip的eggroll.properties、route_table.json配置到本机展示。 + +### 5.3 使用方法 + +------ + +1、设置环境变量: + +```shell +source $FATE_HOME/init_env.sh +``` + +2、编辑配置文件: + +``` +cd $EGGROLL_HOME/bin/debug +vi check_iplist.sh +``` + +参数说明: + +​ user=app <远程登录用户名> +​ iplist=(127.0.0.1 127.0.0.2 127.0.0.3) <需要拉取日志的ip列表> + +3、然后执行脚本: + +```shell +sh check_conf.sh +``` + +### 5.4 结果说明 + +------ + +该脚本展示配置所有ip与本机的配置对比,说明如下: + +```properties +//展示本机eggroll.properties配置信息 +----------------------$EGGROLL_HOME/conf/eggroll.properties-------------------- +//展示本机route_table.json配置信息 +-----------------------$EGGROLL_HOME/conf/route_table.json--------------------- +//展示ip列表中第一个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip1 with ./conf/eggroll.properties------------------------- +//展示ip列表中第二个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip2 with ./conf/eggroll.properties------------------------- +//展示ip列表中第三个ip配置与本机的diff结果,若为空则完全相同 +------------------diff $ip3 with ./conf/eggroll.properties------------------------- +``` + From 7ef1d0af29f7b4c13f4b5a4cd580772e4e24984f Mon Sep 17 00:00:00 2001 From: easongao <46415758+easson001@users.noreply.github.com> 
Date: Thu, 24 Sep 2020 16:48:07 +0800 Subject: [PATCH 32/35] Add files via upload delete data access checking --- tools/debug/server_check.py | 308 +++++++++++++++++------------------- tools/debug/server_check.sh | 7 +- 2 files changed, 145 insertions(+), 170 deletions(-) diff --git a/tools/debug/server_check.py b/tools/debug/server_check.py index d39eda417d..a24abc3b73 100644 --- a/tools/debug/server_check.py +++ b/tools/debug/server_check.py @@ -1,164 +1,144 @@ -# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -import re -import os -import sys -import json -import time -import socket -import psutil -import datetime -import threading -import argparse -import subprocess -from eggroll.core.session import ErSession -from eggroll.roll_pair.roll_pair import RollPairContext -from eggroll.utils.log_utils import get_logger - -L = get_logger() - -arg_parser = argparse.ArgumentParser() -arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) -arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) -arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) -arg_parser.add_argument("-d","--partyid", type=int, help="host partyid", default=0) -args = arg_parser.parse_args() - -def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): - for i in range(row_limit): - if include_key: - yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size - else: - yield str(i) + "s"*value_suffix_size - -def round2(x): - return str(round(x / 1024 / 1024 / 1024, 2)) - -def print_red(str): - print("\033[1;31;40m\t" + str + "\033[0m") - -def print_green(str): - print("\033[1;32;40m\t" + str + "\033[0m") - -def print_yellow(str): - print("\033[1;33;40m\t" + str + "\033[0m") - -def check_actual_max_threads(): - def getMemInfo(fn): - def query_cmd(cmd): - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') - return p[0] - - def get_host_ip(): - try: - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(('8.8.8.8', 80)) - ip = s.getsockname()[0] - finally: - s.close() - return ip - fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" - mem_info = {} - mem_info["Ip"] = get_host_ip() - eggroll_home = query_cmd("echo $EGGROLL_HOME") - route_file = eggroll_home + "/conf/route_table.json" - f = open(route_file, 
encoding='utf-8') - mem_info["route_table"] = json.load(f) - mem_info["data_access"] = query_cmd("ps aux |grep data_access_server |grep -v grep |wc -l") - if args.partyid != 0: - mem_info["data_test"] = query_cmd("curl -X POST --header 'Content-Type: application/json' -d '{\"local\": {\"role\": \"host\", \"party_id\": %s}, \"id_type\":\"phone\", \"encrypt_type\":\"md5\"}' 'http://127.0.0.1:9350/v1/data/query_imported_id_library_info'" %(args.partyid)) - mem_info["data_num"] = mem_info["data_test"].split(':')[-1].split('}')[0] - mem_info["directory"] = query_cmd("if [ -d /data/projects/fdn/FDN-DataAcces ];then echo 1; else echo 0; fi") - mem_info["services"] = ['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] - mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) - mem_info["job_wait"] = query_cmd("if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) - mem_info["job_thread"] = [] - mem_info["jobs"] = query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client)) - mem_info["job_mem"] = [] - for job_id in mem_info["jobs"]: - mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id)) - mem_info["job_mem"] = query_cmd("ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id)) - mem_info["server_mem"] = {} - mem_info["thread"] = {} - for service in mem_info["services"]: - mem_info["thread"][service] = query_cmd("ps -ef |grep %s |grep -v grep |wc -l" %(service)) - mem_info["server_mem"][service] = str(query_cmd("ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'" %(service))) - return mem_info - - session = ErSession(options={"eggroll.session.processors.per.node": 
args.nodes}) - try: - ctx = RollPairContext(session) - rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) - result = rp.with_stores(func=getMemInfo) - print_green(str(datetime.datetime.now())) - for node in result: - print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") - print_green("-------------default route check-------------------------------------------------------") - route_table_dict = node[1]["route_table"] - if 'default' not in route_table_dict['route_table']: - print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") - else: - try: - ip = route_table_dict['route_table']['default']['default'][0]['ip'] - port = route_table_dict['route_table']['default']['default'][0]['port'] - print_green("[OK] eggroll route configured!") - print_green("exchange ip:{}, exchange port:{}".format(ip, port)) - except KeyError: - print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") - - print_green("--------------data_access service check-------------------------------------------------") - if int(node[1]["data_access"]) == 0: - if int(node[1]["directory"]) == 0: - print_red("[ERROR] data_access service and directory not found, please check if it is installed!") - else: - print_yellow("[WARNING] data_access not running or check /data/projects/fdn/FDN-DataAcces directory") - else: - print_green("[OK] Installed and running data_access service!") - if args.partyid != 0: - if int(node[1]["data_num"]) == 0 or int(node[1]["data_num"]) == 201: - print_green("[OK] Route verification success!") - else: - print_yellow("[WARNING] data_access service not available, please check host and host route!") - - print_green("--------------fate service 
check-------------------------------------------------------") - for server in node[1]["services"]: - if int(node[1]["thread"][server]) > 0: - print_green("[OK] the " + server.ljust(23) + " service is running , number of processes is : " + str(node[1]["thread"][server]) + "; used memory : " + str(node[1]["server_mem"][server]) + "KB.") - else: - print_yellow("[WARNING] the " + server + " service not running, please check service status.") - - print_green("--------------fate_flow jobs process and mem info check--------------------------------------------------") - if int(node[1]["job_run"]) == -1: - print_red("[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!") - else: - print_green("[OK] Number of tasks running is " + node[1]["job_run"]) - print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"]) - if int(node[1]["job_run"]) > 0: - for job_id in node[1]["jobs"].split(" "): - print_green("[OK] running task job_id : " + job_id + ", number of egg_pair processes is : " + str(node[1]["job_thread"]) + "; used memory : " + str(node[1]["job_mem"]) + "KB.") - - print("\n") - finally: - session.kill() - - -if __name__ == '__main__': - if args.time == 0: - check_actual_max_threads() - else: - while 1: - check_actual_max_threads() - time.sleep(args.time) +# Copyright (c) 2019 - now, Eggroll Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# +import re +import os +import sys +import json +import time +import socket +import psutil +import datetime +import threading +import argparse +import subprocess +from eggroll.core.session import ErSession +from eggroll.roll_pair.roll_pair import RollPairContext +from eggroll.utils.log_utils import get_logger + +L = get_logger() + +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument("-t","--time", type=int, help="Sleep time wait, default value 0s", default=0) +arg_parser.add_argument("-n","--nodes", type=int, help="Eggroll session processors per node, default value 1", default=1) +arg_parser.add_argument("-p","--partitions", type=int, help="Total partitions, default value 1", default=1) +args = arg_parser.parse_args() + +def str_generator(include_key=True, row_limit=10, key_suffix_size=0, value_suffix_size=0): + for i in range(row_limit): + if include_key: + yield str(i) + "s"*key_suffix_size, str(i) + "s"*value_suffix_size + else: + yield str(i) + "s"*value_suffix_size + +def round2(x): + return str(round(x / 1024 / 1024 / 1024, 2)) + +def print_red(str): + print("\033[1;31;40m\t" + str + "\033[0m") + +def print_green(str): + print("\033[1;32;40m\t" + str + "\033[0m") + +def print_yellow(str): + print("\033[1;33;40m\t" + str + "\033[0m") + +def check_actual_max_threads(): + def getMemInfo(fn): + def query_cmd(cmd): + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n') + return p[0] + + def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py" + mem_info = {} + mem_info["Ip"] = get_host_ip() + eggroll_home = query_cmd("echo $EGGROLL_HOME") + route_file = eggroll_home + "/conf/route_table.json" + f = open(route_file, encoding='utf-8') + mem_info["route_table"] = json.load(f) + mem_info["services"] = 
['ClusterManagerBootstrap','NodeManagerBootstrap','rollsite','fate_flow_server.py','fateboard','mysql'] + mem_info["job_run"] = query_cmd("if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_wait"] = query_cmd("if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi" %(fate_flow_client,fate_flow_client)) + mem_info["job_thread"] = [] + mem_info["jobs"] = query_cmd("array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}" %(fate_flow_client)) + mem_info["job_mem"] = [] + for job_id in mem_info["jobs"]: + mem_info["job_thread"] = query_cmd("ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %(job_id)) + mem_info["job_mem"] = query_cmd("ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'" %(job_id)) + mem_info["server_mem"] = {} + mem_info["thread"] = {} + for service in mem_info["services"]: + mem_info["thread"][service] = query_cmd("ps -ef |grep %s |grep -v grep |wc -l" %(service)) + mem_info["server_mem"][service] = str(query_cmd("ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'" %(service))) + return mem_info + + session = ErSession(options={"eggroll.session.processors.per.node": args.nodes}) + try: + ctx = RollPairContext(session) + rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions}) + result = rp.with_stores(func=getMemInfo) + print_green(str(datetime.datetime.now())) + for node in result: + print_green("==============This is node " + str(node[0]) + ":" + node[1]["Ip"] + "===========================================") + print_green("-------------default route check-------------------------------------------------------") + route_table_dict = node[1]["route_table"] + if 'default' not in route_table_dict['route_table']: + print_red("[ERROR] eggroll exchange route is not configured, please 
check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + else: + try: + ip = route_table_dict['route_table']['default']['default'][0]['ip'] + port = route_table_dict['route_table']['default']['default'][0]['port'] + print_green("[OK] eggroll route configured!") + print_green("exchange ip:{}, exchange port:{}".format(ip, port)) + except KeyError: + print_red("[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!") + + print_green("--------------fate service check-------------------------------------------------------") + for server in node[1]["services"]: + if int(node[1]["thread"][server]) > 0: + print_green("[OK] the " + server.ljust(23) + " service is running , number of processes is : " + str(node[1]["thread"][server]) + "; used memory : " + str(node[1]["server_mem"][server]) + "KB.") + else: + print_yellow("[WARNING] the " + server + " service not running, please check service status.") + + print_green("--------------fate_flow jobs process and mem info check--------------------------------------------------") + if int(node[1]["job_run"]) == -1: + print_red("[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!") + else: + print_green("[OK] Number of tasks running is " + node[1]["job_run"]) + print_green("[OK] Number of tasks waiting is " + node[1]["job_wait"]) + if int(node[1]["job_run"]) > 0: + for job_id in node[1]["jobs"].split(" "): + print_green("[OK] running task job_id : " + job_id + ", number of egg_pair processes is : " + str(node[1]["job_thread"]) + "; used memory : " + str(node[1]["job_mem"]) + "KB.") + + print("\n") + finally: + session.kill() + + +if __name__ == '__main__': + if args.time == 0: + check_actual_max_threads() + else: + while 1: + check_actual_max_threads() + time.sleep(args.time) diff --git a/tools/debug/server_check.sh b/tools/debug/server_check.sh index 56cc20416c..78e7d3734a 100644 
--- a/tools/debug/server_check.sh +++ b/tools/debug/server_check.sh @@ -20,13 +20,8 @@ touch result_server.log fi nodes=$1 -party=$2 LogLevel=$EGGROLL_LOG_LEVEL export EGGROLL_LOG_LEVEL=INFO -if [ -n "$party" ];then - python server_check.py -p $nodes -d $party >> result_server.log -else - python server_check.py -p $nodes >> result_server.log -fi +python server_check.py -p $nodes >> result_server.log export EGGROLL_LOG_LEVEL=$LogLevel echo "Check the result in the current directory, Please execute command: cat result_server.log" From 630e9641cf904dc62955fe5e5d196cac2d9a30dc Mon Sep 17 00:00:00 2001 From: easongao <46415758+easson001@users.noreply.github.com> Date: Thu, 24 Sep 2020 17:15:53 +0800 Subject: [PATCH 33/35] Add files via upload fix bug about delete data access checking --- ...\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" index f3cbe58f1a..d9384247a8 100644 --- "a/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/tools/debug/\350\204\232\346\234\254\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -161,7 +161,7 @@ python time_check.py -t {检测间隔秒数,不填只检测一次} ------ -检测fate运行信息:eggroll路由是不是默认路由、是否已安装data access、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 +检测fate运行信息:eggroll路由是不是默认路由、fate服务的运行状态、进程数及占用内存情况、当前环境正在运行及等待的job任务数、job任务有多少进程及占用的内存情况。 ### 3.3 使用方法 From 9d8d64e922666b32d26c1f8471b74ed0dcbe9a01 Mon Sep 17 00:00:00 2001 From: zengjice Date: Thu, 24 Sep 2020 21:30:33 +0800 Subject: [PATCH 34/35] add tools into build.sh --- cluster-deploy/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-deploy/build.sh b/cluster-deploy/build.sh index 5174ca583b..7ed0065636 100644 --- a/cluster-deploy/build.sh +++ 
b/cluster-deploy/build.sh @@ -49,7 +49,7 @@ cp -r arch/transfer_variables ${package_dir}/python/arch/ cp -r arch/standalone ${package_dir}/python/arch/ cp fate.env requirements.txt RELEASE.md ${package_dir}/python/ cp -r examples federatedml fate_flow ${package_dir}/python/ -cp -r bin ${package_dir}/ +cp -r bin tools ${package_dir}/ echo "[INFO] Package fate done" echo "[INFO] Package fateboard start" From ca99d0728dc558eaee513b0af1a8ae5408b971a4 Mon Sep 17 00:00:00 2001 From: zengjice Date: Sun, 27 Sep 2020 11:09:54 +0800 Subject: [PATCH 35/35] release v1.4.5 --- .gitmodules | 2 +- RELEASE.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index f9182f0090..5a5393bbca 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,4 +5,4 @@ [submodule "eggroll"] path = eggroll url = https://github.com/WeBankFinTech/eggroll.git - branch = release-2.0.2-build-6 + branch = v2.0.2 diff --git a/RELEASE.md b/RELEASE.md index db23513e37..6985f98385 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,8 @@ +# Release 1.4.5 +## Major Features and Improvements +> EggRoll +* RollSite supports the communication certificates + # Release 1.4.4 ## Major Features and Improvements > FATE-Flow