Skip to content

Commit

Permalink
Autotuner: check preconstraints
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Boehm committed Feb 5, 2023
1 parent 692e3dc commit c02f95d
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 33 deletions.
54 changes: 39 additions & 15 deletions scripts/kernel_9_autotuner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,55 @@
set -u

# Define the range of values for each parameter
BK_VALUES="8 16 32 64"
TM_VALUES="4 8 16 32"
TN_VALUES="4 8 16 32"
BM_VALUES="64 128 256"
BN_VALUES="64 128 256"

# Keep track of best combination and its result
best_result=0
best_params=""
BK_VALUES=(8 16 32 64)
TM_VALUES=(4 8 16 32)
TN_VALUES=(4 8 16 32)
BM_VALUES=(64 128 256)
BN_VALUES=(64 128 256)

cd "$(dirname "$0")"
cd "../build"

RUNNER="../src/runner.cu"
OUTPUT="../scripts/kernel_9_autotune_results.txt"

# Clear the output file
echo "" > $OUTPUT

# Set GPU to use
export DEVICE="2"

TOTAL_CONFIGS="$(( ${#BK_VALUES[@]} * ${#TM_VALUES[@]} * ${#TN_VALUES[@]} * ${#BM_VALUES[@]} * ${#BN_VALUES[@]} ))"
CONFIG_NUM=0

# Loop through all combinations of parameters
for bk in $BK_VALUES; do
for tm in $TM_VALUES; do
for tn in $TN_VALUES; do
for bm in $BM_VALUES; do
for bn in $BN_VALUES; do
for bk in ${BK_VALUES[@]}; do
for tm in ${TM_VALUES[@]}; do
for tn in ${TN_VALUES[@]}; do
for bm in ${BM_VALUES[@]}; do
for bn in ${BN_VALUES[@]}; do
echo ""
CONFIG_NUM=$(( $CONFIG_NUM + 1 ))

# skip configurations that don't fullfil preconditions
config="BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn"
if [[ $(( $bn % (16 * $tn ) )) -ne 0 ]]; then
echo "QUANTIZATION: Skipping $config because BN % (16 * TN) = $(( $bn % (16 * $tn ) )) != 0))"
continue
fi
if [[ $(( $bm % (16 * $tm ) )) -ne 0 ]]; then
echo "QUANTIZATION: Skipping $config because BM % (16 * TM) = $(( $bm % (16 * $tm ) )) != 0))"
continue
fi
if [[ $(( ($bm * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then
echo "VECTORIZE: Skipping $config because (BM * BK) % (4 * NUM_THREADS) = $(( ($bm * $bk) % ( 4 * 256 ) )) != 0))"
continue
fi
if [[ $(( ($bn * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then
echo "VECTORIZE: Skipping $config because (BN * BK) % (4 * NUM_THREADS) = $(( ($bn * $bk) % ( 4 * 256 ) )) != 0))"
continue
fi

# Update the parameters in the source code
sed -i "s/const uint K9_BK = .*/const uint K9_BK = $bk;/" $RUNNER
sed -i "s/const uint K9_TM = .*/const uint K9_TM = $tm;/" $RUNNER
Expand All @@ -38,7 +62,7 @@ for bk in $BK_VALUES; do
# Rebuild the program
ninja

echo "BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" | tee -a $OUTPUT
echo "($CONFIG_NUM/$TOTAL_CONFIGS): BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" |& tee -a $OUTPUT
# Run the benchmark and get the result
# Kill the program after 4 seconds if it doesn't finish
timeout -v 4 ./sgemm 9 | tee -a $OUTPUT
Expand Down
2 changes: 1 addition & 1 deletion sgemm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ int main(int argc, char **argv) {
cudaCheck(cudaMemcpy(dC_ref, C, sizeof(float) * max_size * max_size,
cudaMemcpyHostToDevice));

int repeat_times = 10;
int repeat_times = 25;
for (int size : SIZE) {
m = n = k = size;

Expand Down
36 changes: 19 additions & 17 deletions src/runner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -288,26 +288,28 @@ void runSgemmResolveBankExtraCol(int M, int N, int K, float alpha, float *A,

void runSgemmAutotuned(int M, int N, int K, float alpha, float *A, float *B,
float beta, float *C) {
const uint NUM_THREADS = 256;
const uint K9_BK = 16;
const uint K9_TM = 8;
const uint K9_TN = 8;
if (M >= 128 and N >= 128) {
const uint K9_BM = 128;
const uint K9_BN = 128;
dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN));
sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
} else {
// this is a hacky solution to the underlying problem
// of not having proper bounds checking in the kernel
const uint K9_BM = 64;
const uint K9_BN = 64;
dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN));
sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
}
dim3 blockDim(NUM_THREADS);
const uint K9_BM = 128;
const uint K9_BN = 128;

static_assert(
K9_BN % (16 * K9_TN) == 0,
"K9_BN must be a multiple of 16*K9_TN to avoid quantization effects");
static_assert(
K9_BM % (16 * K9_TM) == 0,
"K9_BM must be a multiple of 16*K9_TM to avoid quantization effects");
static_assert((K9_BM * K9_BK) % (4 * NUM_THREADS) == 0,
"K9_BM*K9_BK must be a multiple of 4*256 to vectorize loads");
static_assert((K9_BN * K9_BK) % (4 * NUM_THREADS) == 0,
"K9_BN*K9_BK must be a multiple of 4*256 to vectorize loads");

dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
}

void run_kernel(int kernel_num, int M, int N, int K, float alpha, float *A,
Expand Down

0 comments on commit c02f95d

Please sign in to comment.