An extra memcpy with -Zmir-opt-level=2 #77613

jrmuizel · 2020-10-06T12:59:36Z

pub enum SpecificDisplayItem {
    PopStackingContext,
    Other([f64; 22]),
}

struct DI {
    item: SpecificDisplayItem,
}

#[inline(never)]
pub fn g(clip: Option<&bool>) {
    clip.unwrap();
    let item = SpecificDisplayItem::PopStackingContext;
    do_item(&DI {
            item,
    });
}

#[inline(never)]
fn do_item(di: &DI) { unsafe { ext(di) } }
extern {
    fn ext(di: &DI);
}

With -Zmir-opt-level=1 this compiles to

example::g:
        sub     rsp, 184
        test    rdi, rdi
        je      .LBB0_1
        mov     qword ptr [rsp], 0
        mov     rdi, rsp
        call    example::do_item
        add     rsp, 184
        ret
.LBB0_1:
        lea     rdi, [rip + .L__unnamed_1]
        lea     rdx, [rip + .L__unnamed_2]
        mov     esi, 43
        call    qword ptr [rip + core::panicking::panic@GOTPCREL]
        ud2

With mir-opt-level=2 it compiles to:

example::g:
        sub     rsp, 360
        test    rdi, rdi
        je      .LBB0_1
        mov     qword ptr [rsp], 0
        lea     rdi, [rsp + 8]
        lea     rsi, [rsp + 184]
        mov     edx, 176
        call    qword ptr [rip + memcpy@GOTPCREL]
        mov     rdi, rsp
        call    example::do_item
        add     rsp, 360
        ret
.LBB0_1:
        lea     rdi, [rip + .L__unnamed_1]
        lea     rdx, [rip + .L__unnamed_2]
        mov     esi, 43
        call    qword ptr [rip + core::panicking::panic@GOTPCREL]
        ud2

This is with Rust nightly on rust.godbolt.org

MSxDOS · 2020-10-06T18:13:45Z

Something is really wrong here as removing clip.unwrap(); also removes the memcpy and

if let Some(clip) = clip {
    clip
} else {
    return
};

brings it back.

jrmuizel · 2020-10-06T19:47:32Z

Here's the llvm-ir for
mir-opt-level: 1

define void @_ZN7example1g17h14264d45cdc1c8ddE(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip) unnamed_addr #1 !dbg !18 {
start:
  %_9 = alloca %SpecificDisplayItem, align 8
  %_8 = alloca %DI, align 8
  %item = alloca %SpecificDisplayItem, align 8
  %_2 = call align 1 dereferenceable(1) i8* @"_ZN4core6option15Option$LT$T$GT$6unwrap17hd51eb875dba3900fE"(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip, %"std::panic::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc21 to %"std::panic::Location"*)), !dbg !21
  br label %bb1, !dbg !21

bb1:                                              ; preds = %start
  %0 = bitcast %SpecificDisplayItem* %item to i64*, !dbg !22
  store i64 0, i64* %0, align 8, !dbg !22
  %1 = bitcast %SpecificDisplayItem* %_9 to i8*, !dbg !23
  %2 = bitcast %SpecificDisplayItem* %item to i8*, !dbg !23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 %2, i64 184, i1 false), !dbg !23
  %3 = bitcast %DI* %_8 to %SpecificDisplayItem*, !dbg !24
  %4 = bitcast %SpecificDisplayItem* %3 to i8*, !dbg !24
  %5 = bitcast %SpecificDisplayItem* %_9 to i8*, !dbg !24
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %4, i8* align 8 %5, i64 184, i1 false), !dbg !24
  call void @_ZN7example7do_item17h87ff3f7e8c231d7dE(%DI* noalias readonly align 8 dereferenceable(184) %_8), !dbg !25
  br label %bb2, !dbg !25

bb2:                                              ; preds = %bb1
  ret void, !dbg !26
}

and mir-opt-level=2

define void @_ZN7example1g17h14264d45cdc1c8ddE(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip) unnamed_addr #1 !dbg !18 {
start:
  %item = alloca %SpecificDisplayItem, align 8
  %_6 = alloca %DI, align 8
  %_2 = call align 1 dereferenceable(1) i8* @"_ZN4core6option15Option$LT$T$GT$6unwrap17hd51eb875dba3900fE"(i8* noalias readonly align 1 dereferenceable_or_null(1) %clip, %"std::panic::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc21 to %"std::panic::Location"*)), !dbg !21
  br label %bb1, !dbg !21

bb1:                                              ; preds = %start
  %0 = bitcast %SpecificDisplayItem* %item to i64*, !dbg !22
  store i64 0, i64* %0, align 8, !dbg !22
  %1 = bitcast %DI* %_6 to %SpecificDisplayItem*, !dbg !23
  %2 = bitcast %SpecificDisplayItem* %1 to i8*, !dbg !23
  %3 = bitcast %SpecificDisplayItem* %item to i8*, !dbg !23
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %2, i8* align 8 %3, i64 184, i1 false), !dbg !23
  call void @_ZN7example7do_item17h87ff3f7e8c231d7dE(%DI* noalias readonly align 8 dereferenceable(184) %_6), !dbg !24
  br label %bb2, !dbg !24

bb2:                                              ; preds = %bb1
  ret void, !dbg !25
}

tgnottingham · 2020-10-12T23:44:10Z

I was only able to reproduce this with -C opt-level set to 2 or 3. Other combinations of -C opt-level and -Z mir-opt-level seemed to produce reasonable enough results:

tgnottingham · 2020-10-14T04:50:32Z

I think I know what the issue is, broadly. The tl;dr is that LLVM doesn't take into account context from multiple basic blocks when applying the memcpy optimization. And certain combinations of opt-level and mir-opt-level generate code where the context required to optimize away the memcpy is in multiple basic blocks.

Disclaimer: this is my first real brush with MIR and LLVM-IR, so some of the following has a good chance of being wrong.

Part of the issue comes from clip.unwrap() being inlined, which introduces a branch and a new basic block. I modified the example code to replace the unwrap with a conditional early return, which ensures the issue occurs without depending on unwrap being inlined:

pub fn g(clip: bool) {
    if clip {
        return;
    }

    let item = SpecificDisplayItem::PopStackingContext;
    do_item(&DI {
            item,
    });
}

Here is the LLVM-IR just before the LLVM optimization pass that would remove the memcpy:

define void @_ZN3lib1g17h19b6423a2d95f243E(i1 zeroext %clip) unnamed_addr #0 {
start:
  %item.sroa.2 = alloca [22 x i64], align 8
  %_5 = alloca %DI, align 8
  br i1 %clip, label %bb4, label %bb1

bb1:                                              ; preds = %start
  %0 = bitcast %DI* %_5 to i8*
  call void @llvm.lifetime.start.p0i8(i64 184, i8* nonnull %0)
  %item.sroa.0.0..sroa_idx = getelementptr inbounds %DI, %DI* %_5, i64 0, i32 0, i64 0
  store i64 0, i64* %item.sroa.0.0..sroa_idx, align 8
  %item.sroa.2.0..sroa_idx1 = getelementptr inbounds %DI, %DI* %_5, i64 0, i32 1, i32 2
  %item.sroa.2.0..sroa_cast = bitcast [22 x i64]* %item.sroa.2.0..sroa_idx1 to i8*
  %item.sroa.2.0.sroa_cast = bitcast [22 x i64]* %item.sroa.2 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %item.sroa.2.0..sroa_cast, i8* nonnull align 8 dereferenceable(176) %item.sroa.2.0.sroa_cast, i64 176, i1 false)
  call fastcc void @_ZN3lib7do_item17h4ec77638412d024eE(%DI* noalias nonnull readonly align 8 dereferenceable(184) %_5)
  call void @llvm.lifetime.end.p0i8(i64 184, i8* nonnull %0)
  br label %bb4

bb4:                                              ; preds = %start, %bb1
  ret void
}

The problem is that the source of the memcpy, %item.sroa.2, into the [f64; 22] portion of DI.item (the other portion is the tag) is allocated in a different basic block from the memcpy itself, and there's no context in the memcpy basic block to tell the optimizer that the copy is unnecessary. If you remove the conditional early return, it causes the allocation of %item.sroa.2 to be in the same basic block as the memcpy. The optimizer can see that %item.sroa.2 is never initialized, so the memcpy doesn't accomplish anything useful, and removes it.

You can also tell LLVM that %item.sroa.2 doesn't become live until the memcpy basic block by inserting an llvm.lifetime.start call for it in bb1. That's enough to enable the optimization even with the allocation in a separate block.

Why doesn't the issue occur with -Z mir-opt-level=1? Here's the LLVM-IR just before the memcpy optimization:

define void @_ZN3lib1g17h19b6423a2d95f243E(i1 zeroext %clip) unnamed_addr #0 {
start:
  %_8.sroa.4 = alloca [22 x i64], align 8
  %_7 = alloca %DI, align 8
  %item.sroa.4 = alloca [22 x i64], align 8
  br i1 %clip, label %bb4, label %bb1

bb1:                                              ; preds = %start
  %item.sroa.4.0.sroa_cast = bitcast [22 x i64]* %item.sroa.4 to i8*
  call void @llvm.lifetime.start.p0i8(i64 176, i8* nonnull %item.sroa.4.0.sroa_cast)
  %0 = bitcast %DI* %_7 to i8*
  call void @llvm.lifetime.start.p0i8(i64 184, i8* nonnull %0)
  %_8.sroa.4.0.sroa_cast10 = bitcast [22 x i64]* %_8.sroa.4 to i8*
  call void @llvm.lifetime.start.p0i8(i64 176, i8* nonnull %_8.sroa.4.0.sroa_cast10)
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0.sroa_cast10, i8* nonnull align 8 dereferenceable(176) %item.sroa.4.0.sroa_cast, i64 176, i1 false)
  %_8.sroa.0.0..sroa_idx = getelementptr inbounds %DI, %DI* %_7, i64 0, i32 0, i64 0
  store i64 0, i64* %_8.sroa.0.0..sroa_idx, align 8
  %_8.sroa.4.0..sroa_idx7 = getelementptr inbounds %DI, %DI* %_7, i64 0, i32 1, i32 2
  %_8.sroa.4.0..sroa_cast = bitcast [22 x i64]* %_8.sroa.4.0..sroa_idx7 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0..sroa_cast, i8* nonnull align 8 dereferenceable(176) %_8.sroa.4.0.sroa_cast10, i64 176, i1 false)
  call void @llvm.lifetime.end.p0i8(i64 176, i8* nonnull %_8.sroa.4.0.sroa_cast10)
  call fastcc void @_ZN3lib7do_item17h4ec77638412d024eE(%DI* noalias nonnull readonly align 8 dereferenceable(184) %_7)
  call void @llvm.lifetime.end.p0i8(i64 184, i8* nonnull %0)
  call void @llvm.lifetime.end.p0i8(i64 176, i8* nonnull %item.sroa.4.0.sroa_cast)
  br label %bb4

bb4:                                              ; preds = %start, %bb1
  ret void
}

Here, the sources of the memcpys in bb1 have their lifetimes start in bb1, and are never initialized (or are initialized from something that is never initialized), so the optimizer can see that the memcpys aren't useful, and it optimizes them away.

Possible fixes:

Have LLVM make the optimization using context across multiple basic blocks. There was already work done in this area, but it was reverted. See https://reviews.llvm.org/D38374 and https://reviews.llvm.org/rL321510.
Add precise llvm.lifetime.start and llvm.lifetime.end annotations where we don't already, or at least where it can help with optimizations. I wonder if we're missing out on other optimizations by not doing this. I know we strip some of these out intentionally by removing some StorageLive/Dead annotations in MIR. However, I tried removing the pass that removes those annotations, and it wasn't enough to fix the issue. It's possible the annotations aren't precise enough (that is, their scope is too large), but I haven't investigated further.
Do the optimization in MIR.

1 would be desirable regardless of any other fix. 2 might be good to enable other optimizations. 3 may or may not be worthwhile.

nikic · 2020-10-14T07:58:57Z

Have LLVM make the optimization using context across multiple basic blocks. There was already work done in this area, but it was reverted. See https://reviews.llvm.org/D38374 and https://reviews.llvm.org/rL321510.

FWIW I have a new patch to enabled that at https://reviews.llvm.org/D89207. Will take a while until this is enabled by default though.

tgnottingham · 2020-10-14T22:16:13Z

Looks like we have #44948 tracking the underlying issue.

Also relevant: #56172.

nikic · 2021-03-12T20:05:11Z

Extra memcpy is no longer present on nightly, presumably as a result of #82806.

jrmuizel added the C-bug Category: This is a bug. label Oct 6, 2020

jonas-schievink added A-mir-opt Area: MIR optimizations I-slow Issue: Problems and improvements with respect to performance of generated code. requires-nightly This issue requires a nightly compiler in some way. labels Oct 6, 2020

tgnottingham mentioned this issue Oct 14, 2020

Unnecessary local variable and memcpy not elided during struct initialization #77955

Closed

nikic closed this as completed Mar 12, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

An extra memcpy with -Zmir-opt-level=2 #77613

An extra memcpy with -Zmir-opt-level=2 #77613

jrmuizel commented Oct 6, 2020

MSxDOS commented Oct 6, 2020

jrmuizel commented Oct 6, 2020

tgnottingham commented Oct 12, 2020

tgnottingham commented Oct 14, 2020

nikic commented Oct 14, 2020

tgnottingham commented Oct 14, 2020 •

edited

Loading

nikic commented Mar 12, 2021

An extra memcpy with -Zmir-opt-level=2 #77613

An extra memcpy with -Zmir-opt-level=2 #77613

Comments

jrmuizel commented Oct 6, 2020

MSxDOS commented Oct 6, 2020

jrmuizel commented Oct 6, 2020

tgnottingham commented Oct 12, 2020

tgnottingham commented Oct 14, 2020

nikic commented Oct 14, 2020

tgnottingham commented Oct 14, 2020 • edited Loading

nikic commented Mar 12, 2021

tgnottingham commented Oct 14, 2020 •

edited

Loading