merge to local #27

Merged
merged 34 commits
Jun 18, 2019
Changes from 1 commit
Commits
34 commits
b5a1c14
Update CPU_NUM config (#18059)
Jun 13, 2019
42f12a4
fix ci test cmake test=develop (#18060)
Shixiaowei02 Jun 13, 2019
4c735f2
fix bug in fleet, test=develop (#18058)
seiriosPlus Jun 13, 2019
8cf25c4
refine core warning message (#18063)
tensor-tang Jun 14, 2019
4662541
add Mobilienet ssd int8 analyzer tester (#18075)
lidanqing-intel Jun 14, 2019
660c1a6
Optimize fused_elewise_activation_grad op. (#18041)
Xreki Jun 14, 2019
e81756f
Hidden paddle.fluid.layers.detection_map. (#18033)
qingqing01 Jun 14, 2019
354643d
Add warning for cudnn warpctc kernel in CUDA9\CUDA10. (#18046)
wanghaoshuang Jun 14, 2019
f5caf34
Fix reinitialized ncclid error! (#18025)
gongweibao Jun 14, 2019
b2cfdc3
Refine unittest log (#18084)
guru4elephant Jun 14, 2019
26a7c1a
add unit test to cover all parameters for print op test=develop (#18089)
wopeizl Jun 14, 2019
d9270af
Fix getitems slice bug (#18053)
phlrain Jun 14, 2019
ff59866
test=develop, fix mnist will segment fault (#18083)
JiabinYang Jun 14, 2019
3f55ab0
Modify format of GPU allocation failure log. (#18034)
zhhsplendid Jun 14, 2019
24e988a
Fix bug of scope_buffered_ssa_graph_executor (#18100)
Jun 15, 2019
2e1d8cf
add approval to requirements.txt
tianshuo78520a Jun 15, 2019
5c3cbb5
Update requirement for py2 and py3 (#18068)
junjun315 Jun 15, 2019
accb132
fix slim int8 mkldnn multithreading issue (#18009)
sfraczek Jun 15, 2019
7faf095
Sync Dockerfile change of PR#17889 (#18072)
zhhsplendid Jun 15, 2019
9ed2f93
add target assign operator for supporting retinanet (#17893)
FlyingQianMM Jun 15, 2019
9e4b9d9
Update generate_proposal_labels_op to support CascadeRCNN. (#17200)
FDInSky Jun 16, 2019
0aee1f0
add sigmoid focal loss operator for supporting retinanet (#17895)
FlyingQianMM Jun 16, 2019
9089774
fix python ver for matplotlib, test=develop (#18123)
junjun315 Jun 16, 2019
0941e3e
add class name and timeline for test_dist_base.py (#18122)
guru4elephant Jun 16, 2019
ff83655
add detection output operator for supporting retinanet (#17896)
FlyingQianMM Jun 16, 2019
c26130f
reuse C-API INT8 unit test application (#18077)
Jun 16, 2019
ca5642c
unify FP32 vs. INT8 comparison tests output (#18111)
Jun 16, 2019
80d2e66
Update backward appending stragety to support double backward and fix…
qingqing01 Jun 16, 2019
58f3e1b
add paddle cloud role maker for customized usage, note this is only f…
guru4elephant Jun 17, 2019
6eec66a
Fix py_reader iterable bug (#18108)
sneaxiy Jun 17, 2019
23f8a4b
assign role_maker before use (#18137)
jacquesqiao Jun 17, 2019
1c6e560
core replace x86cpu with py cpuinfo (#18151)
tensor-tang Jun 18, 2019
25ab23b
Fix dygraph mem leak (#18082)
sneaxiy Jun 18, 2019
4978db2
Remove nccl dep when the number of GPU is 1 (#18158)
Jun 18, 2019
Fix reinitialized ncclid error! (PaddlePaddle#18025)
gongweibao authored Jun 14, 2019
commit f5caf3443c4d536a72aba9ddf778be0d442f5dfe
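This commit does two things: it renames platform::MultiNCCLContextMap to platform::NCCLCommunicator, and it makes ParallelExecutor look the communicator up in the passed-in Scope (InitOrGetNCCLCommunicator in the parallel_executor.cc hunk below), so a later executor reuses the already initialized communicator instead of generating a fresh ncclUniqueId. Below is a minimal sketch of that find-or-create pattern; Scope, Communicator, and InitOrGet here are simplified stand-ins written for this illustration, not the real Paddle types.

    // Minimal sketch of the "find or create in scope" pattern used by
    // InitOrGetNCCLCommunicator. All types below are simplified stand-ins.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct Communicator {
      bool initialized = false;
      void Init() { initialized = true; }  // stands in for InitNCCLCtxs()
    };

    class Scope {
     public:
      // Returns the stored communicator, or nullptr if it was never created.
      Communicator* Find(const std::string& name) {
        auto it = vars_.find(name);
        return it == vars_.end() ? nullptr : it->second.get();
      }
      // Creates and owns a new, uninitialized communicator under `name`.
      Communicator* Create(const std::string& name) {
        vars_[name] = std::make_unique<Communicator>();
        return vars_[name].get();
      }

     private:
      std::unordered_map<std::string, std::unique_ptr<Communicator>> vars_;
    };

    // Reuse the scope-resident communicator if present; otherwise create and
    // initialize it exactly once.
    Communicator* InitOrGet(Scope* scope) {
      const std::string kName = "NCCLCommunicator";
      if (Communicator* existing = scope->Find(kName)) {
        return existing;  // skip a second ncclUniqueId handshake
      }
      Communicator* created = scope->Create(kName);
      created->Init();
      return created;
    }

    int main() {
      Scope scope;
      Communicator* a = InitOrGet(&scope);  // first executor: creates + inits
      Communicator* b = InitOrGet(&scope);  // second executor: reuses it
      std::cout << std::boolalpha << (a == b) << "\n";  // prints true
    }

In the real diff, the reuse branch is what removes the "reinitialized ncclid" error: InitNCCLCtxs, and with it the ncclUniqueId exchange, runs only when the NCCLCommunicator variable is created for the first time.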
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -35,7 +35,7 @@ namespace details {
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::MultiNCCLContextMap *ctxs)
const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -34,7 +34,7 @@ class AllReduceOpHandle : public NCCLOpHandleBase {
public:
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::MultiNCCLContextMap *ctxs);
const platform::NCCLCommunicator *ctxs);
#else
class AllReduceOpHandle : public OpHandleBase {
public:
26 changes: 14 additions & 12 deletions paddle/fluid/framework/details/build_strategy.cc
@@ -266,14 +266,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}

ir::Graph *BuildStrategy::Apply(
ir::Graph *graph, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::MultiNCCLContextMap *nccl_ctxs) const {
const bool use_cuda,
platform::NCCLCommunicator *nccl_ctxs) const {
#else
const bool use_cuda) const {
const bool use_cuda) const {
#endif
VLOG(3) << "apply all passes";
// Create a default one if not finalized by user.
@@ -293,9 +295,9 @@ ir::Graph *BuildStrategy::Apply(
pass->Set<size_t>(ir::kNRanks, new size_t(nranks));

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
@@ -309,9 +311,9 @@ ir::Graph *BuildStrategy::Apply(
&local_scopes);
if (pass->Type() == "fuse_all_reduce_op_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
@@ -328,9 +330,9 @@ ir::Graph *BuildStrategy::Apply(
<< enable_sequential_execution_;
} else if (pass->Type() == "all_reduce_deps_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
pass->Erase(kUseHierarchicalAllReduce);
pass->Set<bool>(kUseHierarchicalAllReduce,
new bool(use_hierarchical_allreduce_));
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/build_strategy.h
@@ -149,7 +149,7 @@ struct BuildStrategy {
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda,
platform::MultiNCCLContextMap *nccl_ctxs) const;
platform::NCCLCommunicator *nccl_ctxs) const;
#else
const bool use_cuda) const;
#endif
@@ -44,7 +44,7 @@ typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::MultiNCCLContextMap *ctxs)
const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs),
local_scopes_(local_scopes),
num_of_all_reduce_(num_of_all_reduce) {
@@ -35,7 +35,7 @@ struct FusedAllReduceOpHandle : public NCCLOpHandleBase {
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::MultiNCCLContextMap *ctxs);
const platform::NCCLCommunicator *ctxs);
#else
struct FusedAllReduceOpHandle : public OpHandleBase {
FusedAllReduceOpHandle(ir::Node *node,
4 changes: 2 additions & 2 deletions paddle/fluid/framework/details/nccl_op_handle.h
@@ -33,7 +33,7 @@ namespace details {
class NCCLOpHandleBase : public OpHandleBase {
public:
NCCLOpHandleBase(ir::Node* node, const std::vector<platform::Place>& places,
const platform::MultiNCCLContextMap* nccl_ctxs)
const platform::NCCLCommunicator* nccl_ctxs)
: OpHandleBase(node), places_(places), nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs == nullptr) {
return;
@@ -215,7 +215,7 @@ class NCCLOpHandleBase : public OpHandleBase {

protected:
std::vector<platform::Place> places_;
const platform::MultiNCCLContextMap* nccl_ctxs_{nullptr};
const platform::NCCLCommunicator* nccl_ctxs_{nullptr};
// When multi trainer call collective function, they need run the same order.
// Or the program will hang.So we use allreduce_deps_pass to set this
// run_order_.
@@ -30,7 +30,7 @@ namespace details {
SparseAllReduceOpHandle::SparseAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::MultiNCCLContextMap *ctxs, bool is_encoded, int nranks)
const platform::NCCLCommunicator *ctxs, bool is_encoded, int nranks)
: AllReduceOpHandle(node, local_scopes, places, ctxs),
is_encoded_(is_encoded),
nranks_(nranks) {
@@ -32,7 +32,7 @@ class SparseAllReduceOpHandle : public AllReduceOpHandle {
SparseAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::MultiNCCLContextMap *ctxs,
const platform::NCCLCommunicator *ctxs,
bool is_encoded = false, int nranks = -1);
std::string Name() const override;

@@ -35,7 +35,7 @@ class FuseAllReduceOpPass : public ir::Pass {
auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *multi_nccl_ctxs =
&Get<platform::MultiNCCLContextMap>(details::kNCCLCtxs);
&Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
#endif

std::unordered_set<std::string> grads;
@@ -103,14 +103,14 @@ class FuseAllReduceOpPass : public ir::Pass {
}
}

void InsertFusedAllReduce(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes, const size_t num_of_all_reduce,
const std::vector<ir::Node *> &all_reduce_ops,
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const size_t num_of_all_reduce,
const std::vector<ir::Node *> &all_reduce_ops,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::MultiNCCLContextMap *multi_nccl_ctxs,
const platform::NCCLCommunicator *multi_nccl_ctxs,
#endif
ir::Graph *result) const {
ir::Graph *result) const {
std::vector<details::VarHandleBase *> inputs;
std::vector<details::VarHandleBase *> outputs;
for (auto &op : all_reduce_ops) {
@@ -151,7 +151,7 @@ class FuseAllReduceOpPass : public ir::Pass {
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::MultiNCCLContextMap *multi_nccl_ctxs,
const platform::NCCLCommunicator *multi_nccl_ctxs,
#endif
ir::Graph *result) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
@@ -157,7 +157,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
local_scopes_ = Get<const std::vector<Scope *>>(details::kLocalScopes);
strategy_ = Get<const details::BuildStrategy>(kStrategy);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
multi_nccl_ctxs_ = &Get<platform::MultiNCCLContextMap>(details::kNCCLCtxs);
multi_nccl_ctxs_ = &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
nccl_ctxs_ = nullptr;
if (multi_nccl_ctxs_) {
nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx();
@@ -97,7 +97,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
mutable platform::MultiNCCLContextMap *multi_nccl_ctxs_{nullptr};
mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
#endif

mutable std::string loss_var_name_;
78 changes: 38 additions & 40 deletions paddle/fluid/framework/parallel_executor.cc
@@ -111,8 +111,8 @@ class ParallelExecutorPrivate {
std::vector<ncclUniqueId *> flat_nccl_ids;
if (nranks_ == 1) {
// FIXME(gongwb): need not to create ncclid when nranks==1
nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
return;
}

@@ -132,16 +132,16 @@ class ParallelExecutorPrivate {

flat_nccl_ids.push_back(nccl_id);

nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
VLOG(1) << "init bst nccl context complete!";
return;
}

// num_trainers ==1 && places > 1
if (bst.num_trainers_ == 1) {
nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
return;
}

@@ -153,8 +153,8 @@ class ParallelExecutorPrivate {
flat_nccl_ids.push_back(nccl_id);
}

nccl_ctxs_.InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);
nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_,
bst.trainer_id_);

if (bst.use_hierarchical_allreduce_) {
std::vector<ncclUniqueId *> inter_nccl_ids;
@@ -175,12 +175,30 @@ class ParallelExecutorPrivate {
exter_nccl_ids.push_back(nccl_id);
}

nccl_ctxs_.InitHierarchicalCtxs(places_, inter_nccl_ids, exter_nccl_ids,
bst.num_trainers_, bst.trainer_id_,
bst.hierarchical_allreduce_inter_nranks_,
bst.hierarchical_allreduce_exter_nranks_);
nccl_ctxs_->InitHierarchicalCtxs(
places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_,
bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_,
bst.hierarchical_allreduce_exter_nranks_);
}
}

void InitOrGetNCCLCommunicator(framework::Scope *scope,
const BuildStrategy &bst) {
const std::string var_name = "NCCLCommunicator";
auto var = scope->FindVar(var_name);
if (var != nullptr) {
PADDLE_ENFORCE(var->IsInitialized(),
"if %s exists, it must be initialized", var_name);
VLOG(1) << "find " << var_name
<< " in scope, so use it and does not recreate!";
nccl_ctxs_ = var->GetMutable<platform::NCCLCommunicator>();
return;
}

VLOG(1) << "not find " << var_name << " in scope, so recreate it!";
nccl_ctxs_ = scope->Var(var_name)->GetMutable<platform::NCCLCommunicator>();
InitNCCLCtxs(scope, bst);
}
#endif

BuildStrategy build_strategy_;
@@ -190,7 +208,7 @@ class ParallelExecutorPrivate {
std::unique_ptr<details::SSAGraphExecutor> executor_;

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::MultiNCCLContextMap nccl_ctxs_;
platform::NCCLCommunicator *nccl_ctxs_{nullptr};
#endif
bool own_local_scope_;
bool use_cuda_;
@@ -281,27 +299,6 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
return executor && executor->NeedCreateLocalExeScope();
}

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
/*
* When nccl inits nccl comm using ncclCommInitAll, it meets error when
* allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
* create a new nccl comm for sync_batch_norm_op. And these codes should be
* polished with a unified nccl management.
*/
platform::NCCLContextMap *ParallelExecutor::GetNCCLContextForSyncbatchNomrOp(
framework::Scope *scope) {
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
if (nccl_id_var != nullptr) {
return member_->nccl_ctxs_.DefaultFlatCtx();
}

if (dev_nccl_ctxs_.get() == nullptr) {
dev_nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
}
return dev_nccl_ctxs_.get();
}
#endif

ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name,
Expand Down Expand Up @@ -375,7 +372,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
member_->InitNCCLCtxs(scope, build_strategy);
member_->InitOrGetNCCLCommunicator(scope, build_strategy);

// Initialize device context's nccl comm, will be used by normal
// Operators like sync_batch_norm, and collective ops.
@@ -384,7 +381,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// NOTE: NCCL group-calls and non-group-calls can not use the same
// NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use
// same communicators.
auto *nccl_ctxs = GetNCCLContextForSyncbatchNomrOp(scope);
auto *nccl_ctxs =
member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_);
for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
@@ -421,18 +419,18 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
VLOG(3) << "use local async mode";
graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
{member_->local_scopes_[0]}, 1,
member_->use_cuda_, &member_->nccl_ctxs_);
member_->use_cuda_, member_->nccl_ctxs_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
graphs[i] =
build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, 1,
member_->use_cuda_, &member_->nccl_ctxs_);
member_->use_cuda_, member_->nccl_ctxs_);
async_graphs[i] = graphs[i];
}
} else {
graph = build_strategy.Apply(graph, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_, &member_->nccl_ctxs_);
member_->use_cuda_, member_->nccl_ctxs_);
}
#else
if (build_strategy.async_mode_) {
@@ -565,7 +563,7 @@ void ParallelExecutor::BCastParamsToDevices(
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
"variables' buffer size to bcast NOT equal to places");
{
auto *nccl_ctxs = member_->nccl_ctxs_.DefaultFlatCtx();
auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx();
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]);
7 changes: 0 additions & 7 deletions paddle/fluid/framework/parallel_executor.h
@@ -87,13 +87,6 @@ class ParallelExecutor {

ParallelExecutorPrivate *member_;
std::vector<std::unique_ptr<ir::Graph>> async_graphs_;

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
// used for compatible with syncbatch norm op
std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs_;
platform::NCCLContextMap *GetNCCLContextForSyncbatchNomrOp(
framework::Scope *scope);
#endif
};
} // namespace framework
} // namespace paddle
2 changes: 2 additions & 0 deletions paddle/fluid/framework/var_type_traits.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/framework/var_type_traits.h"
#include <unordered_map>
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
@@ -22,6 +23,7 @@
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"