From 2bb65660ae8b9b2e1896b07b881505a4ffc0393b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Jul 2024 21:37:28 +0100 Subject: [PATCH] [LV] Allow re-processing of operands of instrs feeding interleave group Follow up to d216615518 to update dead interleave group pointer detection to allow re-processing of operands of instructions determined to only feed interleave groups. This is needed because instructions feeding interleave group pointers can become dead in any order, as per the newly added test case. --- .../Transforms/Vectorize/LoopVectorize.cpp | 9 +- .../LoopVectorize/X86/interleave-cost.ll | 228 ++++++++++++++++++ 2 files changed, 231 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c276a2995f54c6..40919c944d21fc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7027,7 +7027,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); - SmallVector<Value *, 4> InitialInterleavePointersOps; + SmallVector<Value *, 4> DeadInterleavePointerOps; for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { // Find all stores to invariant variables. Since they are going to sink @@ -7045,13 +7045,10 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { if (Group->getInsertPos() == &I) continue; Value *PointerOp = getLoadStorePointerOperand(&I); - InitialInterleavePointersOps.push_back(PointerOp); + DeadInterleavePointerOps.push_back(PointerOp); } } - SmallSetVector<Value *, 4> DeadInterleavePointerOps( - InitialInterleavePointersOps.rbegin(), - InitialInterleavePointersOps.rend()); // Mark ops feeding interleave group members as free, if they are only used // by other dead computations. 
for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { @@ -7064,7 +7061,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { })) continue; VecValuesToIgnore.insert(Op); - DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end()); + DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); } // Ignore type-promoting instructions we identified during reduction diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 9bba1a90096e6f..b1f7516f3c8dca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -373,7 +373,230 @@ exit: ret void } +define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N) #1 { +; CHECK-LABEL: define void @geps_feeding_interleave_groups_with_reuse2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 28 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 24 +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[TMP4]], [[SCEVGEP]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 
28 +; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 0, [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult ptr [[TMP8]], [[SCEVGEP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW4]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 20 +; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult ptr [[TMP12]], [[SCEVGEP5]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[MUL_OVERFLOW8]] +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 16 +; CHECK-NEXT: [[MUL10:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT11:%.*]] = extractvalue { i64, i1 } [[MUL10]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW12:%.*]] = extractvalue { i64, i1 } [[MUL10]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT11]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SCEVGEP9]], i64 [[MUL_RESULT11]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult ptr [[TMP16]], [[SCEVGEP9]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW12]] +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 12 +; CHECK-NEXT: [[MUL14:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT15:%.*]] = extractvalue { i64, i1 } [[MUL14]], 0 +; CHECK-NEXT: 
[[MUL_OVERFLOW16:%.*]] = extractvalue { i64, i1 } [[MUL14]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = sub i64 0, [[MUL_RESULT15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SCEVGEP13]], i64 [[MUL_RESULT15]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult ptr [[TMP20]], [[SCEVGEP13]] +; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP21]], [[MUL_OVERFLOW16]] +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[MUL18:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT19:%.*]] = extractvalue { i64, i1 } [[MUL18]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW20:%.*]] = extractvalue { i64, i1 } [[MUL18]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 0, [[MUL_RESULT19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SCEVGEP17]], i64 [[MUL_RESULT19]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ult ptr [[TMP24]], [[SCEVGEP17]] +; CHECK-NEXT: [[TMP26:%.*]] = or i1 [[TMP25]], [[MUL_OVERFLOW20]] +; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[MUL22:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT23:%.*]] = extractvalue { i64, i1 } [[MUL22]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW24:%.*]] = extractvalue { i64, i1 } [[MUL22]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = sub i64 0, [[MUL_RESULT23]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SCEVGEP21]], i64 [[MUL_RESULT23]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult ptr [[TMP28]], [[SCEVGEP21]] +; CHECK-NEXT: [[TMP30:%.*]] = or i1 [[TMP29]], [[MUL_OVERFLOW24]] +; CHECK-NEXT: [[MUL25:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]]) +; CHECK-NEXT: [[MUL_RESULT26:%.*]] = extractvalue { i64, i1 } [[MUL25]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW27:%.*]] = extractvalue { i64, i1 } [[MUL25]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = sub i64 0, [[MUL_RESULT26]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[MUL_RESULT26]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ult ptr 
[[TMP32]], [[A]] +; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[MUL_OVERFLOW27]] +; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP6]], [[TMP10]] +; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP14]] +; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP36]], [[TMP18]] +; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[TMP37]], [[TMP22]] +; CHECK-NEXT: [[TMP39:%.*]] = or i1 [[TMP38]], [[TMP26]] +; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP39]], [[TMP30]] +; CHECK-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP34]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP42:%.*]] = lshr i64 [[N]], 3 +; CHECK-NEXT: [[TMP43:%.*]] = shl i64 [[TMP42]], 5 +; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 32 +; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = add nuw nsw i64 [[TMP43]], 4 +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = shl i64 [[TMP42]], 4 +; CHECK-NEXT: [[TMP47:%.*]] = add nuw nsw i64 [[TMP46]], 8 +; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP47]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP29]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND031:%.*]] = icmp ult ptr [[A]], [[SCEVGEP30]] +; CHECK-NEXT: [[BOUND132:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]] +; CHECK-NEXT: [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND031]], [[BOUND132]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT33]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP49:%.*]] = select i1 [[TMP48]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub 
i64 [[TMP1]], [[TMP49]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP51:%.*]] = lshr exact i64 [[TMP50]], 1 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[TMP52]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> , <4 x i32> poison), !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7 +; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <8 x i32> [[TMP58]], <8 x i32> [[TMP59]], <16 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <8 x i32> [[TMP60]], <8 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: 
[[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <32 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[TMP63]], <32 x i32> poison, <32 x i32> +; CHECK-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], ptr [[TMP57]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SHR_1:%.*]] = lshr exact i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr nusw i32, ptr [[B]], i64 [[SHR_1]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[L]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[IV_NEXT:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr exact i64 [[IV_NEXT_1]], 1 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr i32, ptr [[B]], i64 [[SHR_2]] +; CHECK-NEXT: [[TMP65:%.*]] = load i32, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: store i32 [[TMP65]], ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_A_3:%.*]] 
= getelementptr i32, ptr [[A]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_3]], align 4 +; CHECK-NEXT: [[IV_NEXT_3:%.*]] = or disjoint i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_B_4:%.*]] = getelementptr i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[GEP_B_4]], align 4 +; CHECK-NEXT: [[GEP_A_4:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_3]] +; CHECK-NEXT: store i32 [[TMP66]], ptr [[GEP_A_4]], align 4 +; CHECK-NEXT: [[IV_NEXT_4:%.*]] = or disjoint i64 [[IV]], 5 +; CHECK-NEXT: [[GEP_A_5:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_4]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_5]], align 4 +; CHECK-NEXT: [[IV_NEXT_5:%.*]] = or disjoint i64 [[IV]], 6 +; CHECK-NEXT: [[GEP_A_6:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_5]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_6]], align 4 +; CHECK-NEXT: [[IV_NEXT_6:%.*]] = or disjoint i64 [[IV]], 7 +; CHECK-NEXT: [[GEP_A_7:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT_6]] +; CHECK-NEXT: store i32 0, ptr [[GEP_A_7]], align 4 +; CHECK-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next.7, %loop ] + %shr.1 = lshr exact i64 %iv, 1 + %gep.B = getelementptr nusw i32, ptr %B, i64 %shr.1 + %l = load i32, ptr %gep.B, align 4 + %gep.A = getelementptr i32, ptr %A, i64 %iv + store i32 %l, ptr %gep.A, align 4 + %iv.next = or disjoint i64 %iv, 1 + %gep.A.1 = getelementptr i32, ptr %A, i64 %iv.next + store i32 0, ptr %gep.A.1, align 4 + %iv.next.1 = or disjoint i64 %iv, 2 + %shr.2 = lshr exact i64 %iv.next.1, 1 + %gep.B.2 = getelementptr i32, ptr %B, i64 %shr.2 + %1 = load i32, ptr %gep.B.2, align 4 + %gep.A.2 = getelementptr i32, ptr %A, i64 %iv.next.1 + store i32 %1, ptr %gep.A.2, align 4 + %iv.next.2 = or disjoint i64 
%iv, 3 + %gep.A.3 = getelementptr i32, ptr %A, i64 %iv.next.2 + store i32 0, ptr %gep.A.3, align 4 + %iv.next.3 = or disjoint i64 %iv, 4 + %gep.B.4 = getelementptr i32, ptr %B, i64 %iv + %2 = load i32, ptr %gep.B.4, align 4 + %gep.A.4 = getelementptr i32, ptr %A, i64 %iv.next.3 + store i32 %2, ptr %gep.A.4, align 4 + %iv.next.4 = or disjoint i64 %iv, 5 + %gep.A.5 = getelementptr i32, ptr %A, i64 %iv.next.4 + store i32 0, ptr %gep.A.5, align 4 + %iv.next.5 = or disjoint i64 %iv, 6 + %gep.A.6 = getelementptr i32, ptr %A, i64 %iv.next.5 + store i32 0, ptr %gep.A.6, align 4 + %iv.next.6 = or disjoint i64 %iv, 7 + %gep.A.7 = getelementptr i32, ptr %A, i64 %iv.next.6 + store i32 0, ptr %gep.A.7, align 4 + %iv.next.7 = add nuw nsw i64 %iv, 8 + %ec = icmp eq i64 %iv, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { "target-features"="+sse4.2" } +attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -382,4 +605,9 @@ attributes #0 = { "target-features"="+sse4.2" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} ;.