From 1d2038c7b4c32cf4ff60668bce332721c24ec66d Mon Sep 17 00:00:00 2001
From: Simon Boehm
Date: Sat, 25 Feb 2023 17:55:40 +0000
Subject: [PATCH] Kernel 10: Autotuning

---
 scripts/kernel_10_autotuner.sh | 128 +++++++++++++++++++++++++++++++++
 src/runner.cu                  |  18 +++---
 2 files changed, 137 insertions(+), 9 deletions(-)
 create mode 100755 scripts/kernel_10_autotuner.sh

diff --git a/scripts/kernel_10_autotuner.sh b/scripts/kernel_10_autotuner.sh
new file mode 100755
index 0000000..d6a1ba9
--- /dev/null
+++ b/scripts/kernel_10_autotuner.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+#
+# Brute-force autotuner for kernel 10 (warptiling SGEMM).
+# For every parameter combination that passes the precondition checks below,
+# the K10_* constants in src/runner.cu are rewritten with sed, the project is
+# rebuilt with ninja, and "./sgemm 10" is run under a timeout with its output
+# appended to benchmark_results/kernel_10_autotune_results.txt.
+# runner.cu is edited in place and is left at the last tested configuration.
+
+set -u
+
+# Define the range of values for each parameter
+BK_VALUES=(8 16 32 64)
+BM_VALUES=(64 128 256)
+BN_VALUES=(64 128 256)
+WM_VALUES=(32 64 128 256)
+WN_VALUES=(32 64 128 256)
+WNITER_VALUES=(1 2 4 8)
+TM_VALUES=(4 8 16 32)
+TN_VALUES=(4 8 16 32)
+NUM_THREADS_VALUES=(128 256)
+
+# Every path below is relative to the build directory, so stop if it cannot be
+# entered rather than running sed against the wrong location.
+cd "$(dirname "$0")" || exit 1
+cd "../build" || exit 1
+
+RUNNER="../src/runner.cu"
+OUTPUT="../benchmark_results/kernel_10_autotune_results.txt"
+
+# Clear the output file
+echo "" > "$OUTPUT"
+
+# Set GPU to use
+export DEVICE="0"
+WARPSIZE=32
+
+
+TOTAL_CONFIGS="$(( ${#BK_VALUES[@]} * ${#BM_VALUES[@]} * ${#BN_VALUES[@]} * ${#WM_VALUES[@]} * ${#WN_VALUES[@]} * ${#WNITER_VALUES[@]} * ${#TM_VALUES[@]} * ${#TN_VALUES[@]} * ${#NUM_THREADS_VALUES[@]} ))"
+CONFIG_NUM=0
+
+# Loop through all combinations of parameters
+for BK in "${BK_VALUES[@]}"; do
+for BM in "${BM_VALUES[@]}"; do
+for BN in "${BN_VALUES[@]}"; do
+for WM in "${WM_VALUES[@]}"; do
+for WN in "${WN_VALUES[@]}"; do
+for WN_ITER in "${WNITER_VALUES[@]}"; do
+for TM in "${TM_VALUES[@]}"; do
+for TN in "${TN_VALUES[@]}"; do
+for NUM_THREADS in "${NUM_THREADS_VALUES[@]}"; do
+echo ""
+CONFIG_NUM=$(( CONFIG_NUM + 1 ))
+# skip configurations that don't fulfill preconditions
+NUM_WARPS=$(( NUM_THREADS / WARPSIZE ))
+if ! (( BN % WN == 0 && BM % WM == 0 )); then
+  echo "Error: BN % WN must be 0 and BM % WM must be 0."
+  continue
+fi
+if ! (( (BN / WN) * (BM / WM) == NUM_WARPS )); then
+  echo "Error: (BN / WN) * (BM / WM) must be equal to NUM_WARPS."
+  continue
+fi
+if ! (( (WM * WN) % (WARPSIZE * TM * TN * WN_ITER) == 0 )); then
+  echo "Error: (WM * WN) % (WARPSIZE * TM * TN * WN_ITER) must be 0."
+  continue
+fi
+WM_ITER=$(( (WM * WN) / (WARPSIZE * TM * TN * WN_ITER) ))
+if ! (( WM % WM_ITER == 0 && WN % WN_ITER == 0 )); then
+  echo "Error: WM % WM_ITER must be 0 and WN % WN_ITER must be 0."
+  continue
+fi
+if ! (( (NUM_THREADS * 4) % BK == 0 )); then
+  echo "Error: (NUM_THREADS * 4) % BK must be 0."
+  continue
+fi
+if ! (( (NUM_THREADS * 4) % BN == 0 )); then
+  echo "Error: (NUM_THREADS * 4) % BN must be 0."
+  continue
+fi
+if ! (( BN % (16 * TN) == 0 )); then
+  echo "Error: BN must be a multiple of 16 * TN."
+  continue
+fi
+if ! (( BM % (16 * TM) == 0 )); then
+  echo "Error: BM must be a multiple of 16 * TM."
+  continue
+fi
+if ! (( (BM * BK) % (4 * NUM_THREADS) == 0 )); then
+  echo "Error: (BM * BK) % (4 * NUM_THREADS) must be 0."
+  continue
+fi
+if ! (( (BN * BK) % (4 * NUM_THREADS) == 0 )); then
+  echo "Error: (BN * BK) % (4 * NUM_THREADS) must be 0."
+  continue
+fi
+
+# Update the parameters in the source code
+sed -i "s/const uint K10_NUM_THREADS = .*/const uint K10_NUM_THREADS = $NUM_THREADS;/" "$RUNNER"
+sed -i "s/const uint K10_BN = .*/const uint K10_BN = $BN;/" "$RUNNER"
+sed -i "s/const uint K10_BM = .*/const uint K10_BM = $BM;/" "$RUNNER"
+sed -i "s/const uint K10_BK = .*/const uint K10_BK = $BK;/" "$RUNNER"
+sed -i "s/const uint K10_WM = .*/const uint K10_WM = $WM;/" "$RUNNER"
+sed -i "s/const uint K10_WN = .*/const uint K10_WN = $WN;/" "$RUNNER"
+sed -i "s/const uint K10_WNITER = .*/const uint K10_WNITER = $WN_ITER;/" "$RUNNER"
+sed -i "s/const uint K10_TM = .*/const uint K10_TM = $TM;/" "$RUNNER"
+sed -i "s/const uint K10_TN = .*/const uint K10_TN = $TN;/" "$RUNNER"
+
+# Rebuild the program. A failed build may leave the previous ./sgemm binary in
+# place, so skip this configuration instead of benchmarking a stale executable.
+if ! ninja; then
+  echo "Error: ninja build failed, skipping this configuration."
+  continue
+fi
+
+echo "($CONFIG_NUM/$TOTAL_CONFIGS): BK=$BK BM=$BM BN=$BN WM=$WM WN=$WN WN_ITER=$WN_ITER TM=$TM TN=$TN NUM_THREADS=$NUM_THREADS" |& tee -a "$OUTPUT"
+# Run the benchmark and get the result
+# Kill the program after 8 seconds if it doesn't finish
+timeout -v 8 ./sgemm 10 | tee -a "$OUTPUT"
+done
+done
+done
+done
+done
+done
+done
+done
+done
\ No newline at end of file
diff --git a/src/runner.cu b/src/runner.cu index 5d04bd4..eb05392 100644 --- a/src/runner.cu +++ b/src/runner.cu @@ -325,15 +325,13 @@ void runSgemmWarptiling(int M, int N, int K, float alpha, float *A, float *B, float beta, float *C) { const uint K10_NUM_THREADS = 128; const uint K10_BN = 128; - const uint K10_BM = 128; - const uint K10_BK = 16; + const uint K10_BM = 64; + const uint K10_BK = 8; const uint K10_WN = 64; - const uint K10_WM = 64; - const uint K10_WMITER = 2; - const uint K10_TN = 8; + const uint K10_WM = 32; + const uint K10_WNITER = 2; + const uint K10_TN = 4; const uint K10_TM = 4; - constexpr uint K10_WNITER = - (K10_WM * K10_WN) / (32 * K10_TM * K10_TN * K10_WMITER); dim3 blockDim(K10_NUM_THREADS); constexpr uint NUM_WARPS = K10_NUM_THREADS / 32; @@ -342,11 +340,13 @@ void runSgemmWarptiling(int M, int N, int K, float alpha, float *A, float
*B, static_assert((K10_BN % K10_WN == 0) and (K10_BM % K10_WM == 0)); static_assert((K10_BN / K10_WN) * (K10_BM / K10_WM) == NUM_WARPS); - // warpsubtile in warptile - static_assert((K10_WM % K10_WMITER == 0) and (K10_WN % K10_WNITER == 0)); // threads in warpsubtile static_assert((K10_WM * K10_WN) % (WARPSIZE * K10_TM * K10_TN * K10_WNITER) == 0); + constexpr uint K10_WMITER = + (K10_WM * K10_WN) / (32 * K10_TM * K10_TN * K10_WNITER); + // warpsubtile in warptile + static_assert((K10_WM % K10_WMITER == 0) and (K10_WN % K10_WNITER == 0)); static_assert((K10_NUM_THREADS * 4) % K10_BK == 0, "NUM_THREADS*4 must be multiple of K9_BK to avoid quantization "