Skip to content

Commit

Permalink
Kernel 10: Autotuning
Browse files Browse the repository at this point in the history
  • Loading branch information
siboehm committed Feb 25, 2023
1 parent 9696ef8 commit 1d2038c
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 9 deletions.
115 changes: 115 additions & 0 deletions scripts/kernel_10_autotuner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
#
# Autotuner for kernel 10.
#
# Sweeps over combinations of the K10_* tiling parameters, patches them into
# ../src/runner.cu, rebuilds with ninja, runs `./sgemm 10` and appends the
# output to the results file under ../benchmark_results/.
#
# Must live one directory below the repository root (next to `build/`), since
# all paths are resolved relative to this script's location.

set -u

# Define the range of values for each parameter
BK_VALUES=(8 16 32 64)
BM_VALUES=(64 128 256)
BN_VALUES=(64 128 256)
WM_VALUES=(32 64 128 256)
WN_VALUES=(32 64 128 256)
WNITER_VALUES=(1 2 4 8)
TM_VALUES=(4 8 16 32)
TN_VALUES=(4 8 16 32)
NUM_THREADS_VALUES=(128 256)

# Everything below uses paths relative to the build directory. Abort if we
# cannot get there: continuing would run sed/ninja in the wrong directory.
cd "$(dirname "$0")" || exit 1
cd "../build" || exit 1

RUNNER="../src/runner.cu"
OUTPUT="../benchmark_results/kernel_10_autotune_results.txt"

# Clear the output file (creating its directory first if it does not exist)
mkdir -p "$(dirname "$OUTPUT")"
echo "" > "$OUTPUT"

# Set GPU to use
export DEVICE="0"
WARPSIZE=32


# Size of the full (unfiltered) search space; used only for progress output.
TOTAL_CONFIGS="$(( ${#BK_VALUES[@]} * ${#BM_VALUES[@]} * ${#BN_VALUES[@]} * ${#WM_VALUES[@]} * ${#WN_VALUES[@]} * ${#WNITER_VALUES[@]} * ${#TM_VALUES[@]} * ${#TN_VALUES[@]} * ${#NUM_THREADS_VALUES[@]} ))"
CONFIG_NUM=0

# Loop through all combinations of parameters

#######################################
# Checks the current parameter combination against the preconditions that
# the script enforces before building. Prints the first violated
# precondition to stdout.
# Globals:
#   BK BM BN WM WN WN_ITER TM TN NUM_THREADS WARPSIZE (read)
#   NUM_WARPS WM_ITER (written)
# Returns:
#   0 if every precondition holds, 1 otherwise.
#######################################
config_is_valid() {
  NUM_WARPS=$(( NUM_THREADS / WARPSIZE ))
  if ! (( BN % WN == 0 && BM % WM == 0 )); then
    echo "Error: BN % WN must be 0 and BM % WM must be 0."
    return 1
  fi
  if ! (( (BN / WN) * (BM / WM) == NUM_WARPS )); then
    echo "Error: (BN / WN) * (BM / WM) must be equal to NUM_WARPS."
    return 1
  fi
  if ! (( (WM * WN) % (WARPSIZE * TM * TN * WN_ITER) == 0 )); then
    echo "Error: (WM * WN) % (WARPSIZE * TM * TN * WN_ITER) must be 0."
    return 1
  fi
  # The divisibility check above guarantees WM_ITER >= 1, so the modulo
  # below cannot divide by zero.
  WM_ITER=$(( (WM * WN) / (WARPSIZE * TM * TN * WN_ITER) ))
  if ! (( WM % WM_ITER == 0 && WN % WN_ITER == 0 )); then
    echo "Error: WM % WM_ITER must be 0 and WN % WN_ITER must be 0."
    return 1
  fi
  if ! (( (NUM_THREADS * 4) % BK == 0 )); then
    echo "Error: (NUM_THREADS * 4) % BK must be 0."
    return 1
  fi
  if ! (( (NUM_THREADS * 4) % BN == 0 )); then
    echo "Error: (NUM_THREADS * 4) % BN must be 0."
    return 1
  fi
  if ! (( BN % (16 * TN) == 0 )); then
    echo "Error: BN must be a multiple of 16 * TN."
    return 1
  fi
  if ! (( BM % (16 * TM) == 0 )); then
    echo "Error: BM must be a multiple of 16 * TM."
    return 1
  fi
  if ! (( (BM * BK) % (4 * NUM_THREADS) == 0 )); then
    echo "Error: (BM * BK) % (4 * NUM_THREADS) must be 0."
    return 1
  fi
  if ! (( (BN * BK) % (4 * NUM_THREADS) == 0 )); then
    echo "Error: (BN * BK) % (4 * NUM_THREADS) must be 0."
    return 1
  fi
  return 0
}

#######################################
# Writes the current parameter combination into the K10_* constants of the
# runner source file, in a single in-place sed pass.
# Globals:
#   RUNNER BK BM BN WM WN WN_ITER TM TN NUM_THREADS (read)
# Returns:
#   sed's exit status.
#######################################
apply_config() {
  sed -i \
    -e "s/const uint K10_NUM_THREADS = .*/const uint K10_NUM_THREADS = $NUM_THREADS;/" \
    -e "s/const uint K10_BN = .*/const uint K10_BN = $BN;/" \
    -e "s/const uint K10_BM = .*/const uint K10_BM = $BM;/" \
    -e "s/const uint K10_BK = .*/const uint K10_BK = $BK;/" \
    -e "s/const uint K10_WM = .*/const uint K10_WM = $WM;/" \
    -e "s/const uint K10_WN = .*/const uint K10_WN = $WN;/" \
    -e "s/const uint K10_WNITER = .*/const uint K10_WNITER = $WN_ITER;/" \
    -e "s/const uint K10_TM = .*/const uint K10_TM = $TM;/" \
    -e "s/const uint K10_TN = .*/const uint K10_TN = $TN;/" \
    "$RUNNER"
}

for BK in "${BK_VALUES[@]}"; do
  for BM in "${BM_VALUES[@]}"; do
    for BN in "${BN_VALUES[@]}"; do
      for WM in "${WM_VALUES[@]}"; do
        for WN in "${WN_VALUES[@]}"; do
          for WN_ITER in "${WNITER_VALUES[@]}"; do
            for TM in "${TM_VALUES[@]}"; do
              for TN in "${TN_VALUES[@]}"; do
                for NUM_THREADS in "${NUM_THREADS_VALUES[@]}"; do
                  echo ""
                  CONFIG_NUM=$(( CONFIG_NUM + 1 ))

                  # Skip configurations that don't fulfill the preconditions
                  config_is_valid || continue

                  # Update the parameters in the source code
                  if ! apply_config; then
                    echo "Error: could not update $RUNNER, skipping configuration." >&2
                    continue
                  fi

                  # Rebuild the program. If the build fails, the previous
                  # binary is still on disk; running it would record stale
                  # numbers under this configuration, so skip instead.
                  if ! ninja; then
                    echo "Error: build failed, skipping configuration." >&2
                    continue
                  fi

                  echo "($CONFIG_NUM/$TOTAL_CONFIGS): BK=$BK BM=$BM BN=$BN WM=$WM WN=$WN WN_ITER=$WN_ITER TM=$TM TN=$TN NUM_THREADS=$NUM_THREADS" | tee -a "$OUTPUT"
                  # Run the benchmark and get the result
                  # Kill the program after 8 seconds if it doesn't finish
                  timeout -v 8 ./sgemm 10 | tee -a "$OUTPUT"
                done
              done
            done
          done
        done
      done
    done
  done
done
18 changes: 9 additions & 9 deletions src/runner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -325,15 +325,13 @@ void runSgemmWarptiling(int M, int N, int K, float alpha, float *A, float *B,
float beta, float *C) {
const uint K10_NUM_THREADS = 128;
const uint K10_BN = 128;
const uint K10_BM = 128;
const uint K10_BK = 16;
const uint K10_BM = 64;
const uint K10_BK = 8;
const uint K10_WN = 64;
const uint K10_WM = 64;
const uint K10_WMITER = 2;
const uint K10_TN = 8;
const uint K10_WM = 32;
const uint K10_WNITER = 2;
const uint K10_TN = 4;
const uint K10_TM = 4;
constexpr uint K10_WNITER =
(K10_WM * K10_WN) / (32 * K10_TM * K10_TN * K10_WMITER);
dim3 blockDim(K10_NUM_THREADS);

constexpr uint NUM_WARPS = K10_NUM_THREADS / 32;
Expand All @@ -342,11 +340,13 @@ void runSgemmWarptiling(int M, int N, int K, float alpha, float *A, float *B,
static_assert((K10_BN % K10_WN == 0) and (K10_BM % K10_WM == 0));
static_assert((K10_BN / K10_WN) * (K10_BM / K10_WM) == NUM_WARPS);

// warpsubtile in warptile
static_assert((K10_WM % K10_WMITER == 0) and (K10_WN % K10_WNITER == 0));
// threads in warpsubtile
static_assert((K10_WM * K10_WN) % (WARPSIZE * K10_TM * K10_TN * K10_WNITER) ==
0);
constexpr uint K10_WMITER =
(K10_WM * K10_WN) / (32 * K10_TM * K10_TN * K10_WNITER);
// warpsubtile in warptile
static_assert((K10_WM % K10_WMITER == 0) and (K10_WN % K10_WNITER == 0));

static_assert((K10_NUM_THREADS * 4) % K10_BK == 0,
"NUM_THREADS*4 must be multiple of K9_BK to avoid quantization "
Expand Down

0 comments on commit 1d2038c

Please sign in to comment.