diff --git a/scripts/kernel_9_autotuner.sh b/scripts/kernel_9_autotuner.sh
index 53a4575..ad3ba9e 100755
--- a/scripts/kernel_9_autotuner.sh
+++ b/scripts/kernel_9_autotuner.sh
@@ -3,15 +3,11 @@
 set -u
 
 # Define the range of values for each parameter
-BK_VALUES="8 16 32 64"
-TM_VALUES="4 8 16 32"
-TN_VALUES="4 8 16 32"
-BM_VALUES="64 128 256"
-BN_VALUES="64 128 256"
-
-# Keep track of best combination and its result
-best_result=0
-best_params=""
+BK_VALUES=(8 16 32 64)
+TM_VALUES=(4 8 16 32)
+TN_VALUES=(4 8 16 32)
+BM_VALUES=(64 128 256)
+BN_VALUES=(64 128 256)
 
 cd "$(dirname "$0")"
 cd "../build"
@@ -19,15 +15,43 @@ cd "../build"
 RUNNER="../src/runner.cu"
 OUTPUT="../scripts/kernel_9_autotune_results.txt"
 
+# Clear the output file
+echo "" > $OUTPUT
+
 # Set GPU to use
 export DEVICE="2"
 
+TOTAL_CONFIGS="$(( ${#BK_VALUES[@]} * ${#TM_VALUES[@]} * ${#TN_VALUES[@]} * ${#BM_VALUES[@]} * ${#BN_VALUES[@]} ))"
+CONFIG_NUM=0
+
 # Loop through all combinations of parameters
-for bk in $BK_VALUES; do
-  for tm in $TM_VALUES; do
-    for tn in $TN_VALUES; do
-      for bm in $BM_VALUES; do
-        for bn in $BN_VALUES; do
+for bk in ${BK_VALUES[@]}; do
+  for tm in ${TM_VALUES[@]}; do
+    for tn in ${TN_VALUES[@]}; do
+      for bm in ${BM_VALUES[@]}; do
+        for bn in ${BN_VALUES[@]}; do
+          echo ""
+          CONFIG_NUM=$(( $CONFIG_NUM + 1 ))
+
+          # skip configurations that don't fullfil preconditions
+          config="BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn"
+          if [[ $(( $bn % (16 * $tn ) )) -ne 0 ]]; then
+            echo "QUANTIZATION: Skipping $config because BN % (16 * TN) = $(( $bn % (16 * $tn ) )) != 0))"
+            continue
+          fi
+          if [[ $(( $bm % (16 * $tm ) )) -ne 0 ]]; then
+            echo "QUANTIZATION: Skipping $config because BM % (16 * TM) = $(( $bm % (16 * $tm ) )) != 0))"
+            continue
+          fi
+          if [[ $(( ($bm * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then
+            echo "VECTORIZE: Skipping $config because (BM * BK) % (4 * NUM_THREADS) = $(( ($bm * $bk) % ( 4 * 256 ) )) != 0))"
+            continue
+          fi
+          if [[ $(( ($bn * $bk) % ( 4 * 256 ) )) -ne 0 ]]; then
+            echo "VECTORIZE: Skipping $config because (BN * BK) % (4 * NUM_THREADS) = $(( ($bn * $bk) % ( 4 * 256 ) )) != 0))"
+            continue
+          fi
+
           # Update the parameters in the source code
           sed -i "s/const uint K9_BK = .*/const uint K9_BK = $bk;/" $RUNNER
           sed -i "s/const uint K9_TM = .*/const uint K9_TM = $tm;/" $RUNNER
@@ -38,7 +62,7 @@ for bk in $BK_VALUES; do
           # Rebuild the program
           ninja 
 
-          echo "BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" | tee -a $OUTPUT
+          echo "($CONFIG_NUM/$TOTAL_CONFIGS): BK=$bk TM=$tm TN=$tn BM=$bm BN=$bn" |& tee -a $OUTPUT
           # Run the benchmark and get the result
           # Kill the program after 4 seconds if it doesn't finish
           timeout -v 4 ./sgemm 9 | tee -a $OUTPUT
diff --git a/sgemm.cu b/sgemm.cu
index 6849415..77326f8 100644
--- a/sgemm.cu
+++ b/sgemm.cu
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   cudaCheck(cudaMemcpy(dC_ref, C, sizeof(float) * max_size * max_size,
                        cudaMemcpyHostToDevice));
 
-  int repeat_times = 10;
+  int repeat_times = 25;
   for (int size : SIZE) {
     m = n = k = size;
 
diff --git a/src/runner.cu b/src/runner.cu
index 46aa9fb..34dd8cf 100644
--- a/src/runner.cu
+++ b/src/runner.cu
@@ -288,26 +288,28 @@ void runSgemmResolveBankExtraCol(int M, int N, int K, float alpha, float *A,
 
 void runSgemmAutotuned(int M, int N, int K, float alpha, float *A, float *B,
                        float beta, float *C) {
+  const uint NUM_THREADS = 256;
   const uint K9_BK = 16;
   const uint K9_TM = 8;
   const uint K9_TN = 8;
-  if (M >= 128 and N >= 128) {
-    const uint K9_BM = 128;
-    const uint K9_BN = 128;
-    dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
-    dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN));
-    sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
-  } else {
-    // this is a hacky solution to the underlying problem
-    // of not having proper bounds checking in the kernel
-    const uint K9_BM = 64;
-    const uint K9_BN = 64;
-    dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
-    dim3 blockDim((K9_BM * K9_BN) / (K9_TM * K9_TN));
-    sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
-        <<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
-  }
+  dim3 blockDim(NUM_THREADS);
+  const uint K9_BM = 128;
+  const uint K9_BN = 128;
+
+  static_assert(
+      K9_BN % (16 * K9_TN) == 0,
+      "K9_BN must be a multiple of 16*K9_TN to avoid quantization effects");
+  static_assert(
+      K9_BM % (16 * K9_TM) == 0,
+      "K9_BM must be a multiple of 16*K9_TM to avoid quantization effects");
+  static_assert((K9_BM * K9_BK) % (4 * NUM_THREADS) == 0,
+                "K9_BM*K9_BK must be a multiple of 4*256 to vectorize loads");
+  static_assert((K9_BN * K9_BK) % (4 * NUM_THREADS) == 0,
+                "K9_BN*K9_BK must be a multiple of 4*256 to vectorize loads");
+
+  dim3 gridDim(CEIL_DIV(N, K9_BN), CEIL_DIV(M, K9_BM));
+  sgemmAutotuned<K9_BM, K9_BN, K9_BK, K9_TM, K9_TN>
+      <<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
 }
 
 void run_kernel(int kernel_num, int M, int N, int K, float alpha, float *A,