diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index 6f735b955cf..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -1,39 +1,52 @@ -diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/cub/block/block_merge_sort.cuh -+++ b/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; +diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h +index 1ffeef0..5e80800 100644 +--- a/thrust/system/cuda/detail/sort.h ++++ b/thrust/system/cuda/detail/sort.h +@@ -108,7 +108,7 @@ namespace __merge_sort { + key_type key2 = keys_shared[keys2_beg]; + -#pragma unroll +#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: - // - KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + bool p = (keys2_beg < keys2_end) && +@@ -311,10 +311,10 @@ namespace __merge_sort { + void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], + item_type (&items)[ITEMS_PER_THREAD]) { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - target_merged_threads_number *= 2) -diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 +-#pragma unroll ++#pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +-#pragma unroll ++#pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) +@@ -350,7 +350,7 @@ namespace __merge_sort { + // each thread has sorted keys_loc + // merge sort keys_loc in shared memory + // +-#pragma unroll ++#pragma unroll 1 + for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) + { + sync_threadblock(); +@@ -479,7 +479,7 @@ namespace __merge_sort { + // and fill the remainig keys with it + // + key_type max_key = keys_loc[0]; +-#pragma unroll ++#pragma unroll 1 + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) +diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh +index 41eb1d2..f2893b4 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy +@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy /// SM60 (GP100) @@ -42,11 +55,11 @@ index b188c75f..3f36656f 100644 { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 +diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh +index f6aee45..dd64301 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy +@@ -284,7 +284,7 @@ struct DeviceReducePolicy }; /// SM60 @@ -55,11 +68,11 @@ index e0470ccb..6a0c2ed6 100644 { // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) typedef AgentReducePolicy< -diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 +diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh +index c0c6d59..937ee31 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy +@@ -178,7 +178,7 @@ struct DeviceScanPolicy }; /// SM600 @@ -68,20 +81,3 @@ index c2d04588..ac2d10e0 100644 { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread -diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/cub/thread/thread_sort.cuh -+++ b/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], - { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -- #pragma unroll -+ #pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j]))