diff --git a/scripts/kernel_9_autotuner.sh b/scripts/kernel_9_autotuner.sh index 53a4575..ad3ba9e 100755 --- a/scripts/kernel_9_autotuner.sh +++ b/scripts/kernel_9_autotuner.sh @@ -3,15 +3,11 @@ set -u # Define the range of values for each parameter -BK_VALUES="8 16 32 64" -TM_VALUES="4 8 16 32" -TN_VALUES="4 8 16 32" -BM_VALUES="64 128 256" -BN_VALUES="64 128 256" - -# Keep track of best combination and its result -best_result=0 -best_params="" +BK_VALUES=(8 16 32 64) +TM_VALUES=(4 8 16 32) +TN_VALUES=(4 8 16 32) +BM_VALUES=(64 128 256) +BN_VALUES=(64 128 256) cd "$(dirname "$0")" cd "../build" @@ -19,15 +15,43 @@ cd "../build" RUNNER="../src/runner.cu" OUTPUT="../scripts/kernel_9_autotune_results.txt" +# Clear the output file +echo "" > $OUTPUT + # Set GPU to use export DEVICE="2" +TOTAL_CONFIGS="$(( ${#BK_VALUES[@]} * ${#TM_VALUES[@]} * ${#TN_VALUES[@]} * ${#BM_VALUES[@]} * ${#BN_VALUES[@]} ))" +CONFIG_NUM=0 + # Loop through all combinations of parameters -for bk in $BK_VALUES; do - for tm in $TM_VALUES; do - for tn in $TN_VALUES; do - for bm in $BM_VALUES; do - for bn in $BN_VALUES; do +for bk in ${BK_VALUES[@]}; do + for tm in ${TM_VALUES[@]}; do + for tn in ${TN_VALUES[@]}; do + for bm in ${BM_VALUES[@]}; do + for bn in ${BN_VALUES[@]}; do + echo "" + CONFIG_NUM=$(( $CONFIG_NUM + 1 )) + + # skip configurations that don't fullfil preconditions + config="BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" + if [[ $(( $bn % (16 * $tn ) )) -ne 0 ]]; then + echo "QUANTIZATION: Skipping $config because BN % (16 * TN) = $(( $bn % (16 * $tn ) )) != 0))" + continue + fi + if [[ $(( $bm % (16 * $tm ) )) -ne 0 ]]; then + echo "QUANTIZATION: Skipping $config because BM % (16 * TM) = $(( $bm % (16 * $tm ) )) != 0))" + continue + fi + if [[ $(( ($bm * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then + echo "VECTORIZE: Skipping $config because (BM * BK) % (4 * NUM_THREADS) = $(( ($bm * $bk) % ( 4 * 256 ) )) != 0))" + continue + fi + if [[ $(( ($bn * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then + echo "VECTORIZE: Skipping $config because (BN * BK) % (4 * NUM_THREADS) = $(( ($bn * $bk) % ( 4 * 256 ) )) != 0))" + continue + fi + # Update the parameters in the source code sed -i "s/const uint K9_BK = .*/const uint K9_BK = $bk;/" $RUNNER sed -i "s/const uint K9_TM = .*/const uint K9_TM = $tm;/" $RUNNER @@ -38,7 +62,7 @@ for bk in $BK_VALUES; do # Rebuild the program ninja - echo "BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" | tee -a $OUTPUT + echo "($CONFIG_NUM/$TOTAL_CONFIGS): BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" |& tee -a $OUTPUT # Run the benchmark and get the result # Kill the program after 4 seconds if it doesn't finish timeout -v 4 ./sgemm 9 | tee -a $OUTPUT diff --git a/sgemm.cu b/sgemm.cu index 6849415..77326f8 100644 --- a/sgemm.cu +++ b/sgemm.cu @@ -89,7 +89,7 @@ int main(int argc, char **argv) { cudaCheck(cudaMemcpy(dC_ref, C, sizeof(float) * max_size * max_size, cudaMemcpyHostToDevice)); - int repeat_times = 10; + int repeat_times = 25; for (int size : SIZE) { m = n = k = size; diff --git a/src/runner.cu b/src/runner.cu index 46aa9fb..34dd8cf 100644 --- a/src/runner.cu +++ b/src/runner.cu @@ -288,26 +288,28 @@ void runSgemmResolveBankExtraCol(int M, int N, int K, float alpha, float *A, void runSgemmAutotuned(int M, int N, int K, float alpha, float *A, float *B, float beta, float *C) { + const uint NUM_THREADS = 256; const uint K9_BK = 16; const uint K9_TM = 8; const uint K9_TN = 8; - if (M >= 128 and N >= 128) { - const uint K9_BM = 128; - const uint K9_BN = 128; - dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM)); - dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN)); - sgemmAutotuned - <<>>(M, N, K, alpha, A, B, beta, C); - } else { - // this is a hacky solution to the underlying problem - // of not having proper bounds checking in the kernel - const uint K9_BM = 64; - const uint K9_BN = 64; - dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM)); - dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN)); - sgemmAutotuned - <<>>(M, N, K, alpha, A, B, beta, C); - } + dim3 blockDim(NUM_THREADS); + const uint K9_BM = 128; + const uint K9_BN = 128; + + static_assert( + K9_BN % (16 * K9_TN) == 0, + "K9_BN must be a multiple of 16*K9_TN to avoid quantization effects"); + static_assert( + K9_BM % (16 * K9_TM) == 0, + "K9_BM must be a multiple of 16*K9_TM to avoid quantization effects"); + static_assert((K9_BM * K9_BK) % (4 * NUM_THREADS) == 0, + "K9_BM*K9_BK must be a multiple of 4*256 to vectorize loads"); + static_assert((K9_BN * K9_BK) % (4 * NUM_THREADS) == 0, + "K9_BN*K9_BK must be a multiple of 4*256 to vectorize loads"); + + dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM)); + sgemmAutotuned + <<>>(M, N, K, alpha, A, B, beta, C); } void run_kernel(int kernel_num, int M, int N, int K, float alpha, float *A,