From 8203533a8c9a1af5c273796a39de4a5a7f8e0626 Mon Sep 17 00:00:00 2001 From: James <78285353+linjames0@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:19:23 -0400 Subject: [PATCH] Update matScale.cu --- matScale.cu | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/matScale.cu b/matScale.cu index 1df698c..6990402 100644 --- a/matScale.cu +++ b/matScale.cu @@ -2,50 +2,50 @@ #include // scale kernel -__global__ void matScale(float *d_A, float *d_B, float scale, int N, int M) { +__global__ void matScale(float *d_A, float *d_B, float scale, int M, int N) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; // scale matrix elements - if (row < N && col < M) { - d_B[row * M + col] = d_A[row * M + col] / scale; + if (row < M && col < N) { + d_B[row * N + col] = d_A[row * N + col] / scale; } } int main() { // var declaration - int N = 3; int M = 3; + int N = 3; float scale = 2.0f; - float *A, *B; + float *h_A, *h_B; float *d_A, *d_B; // allocate host memory - A = (float *)malloc(N * M * sizeof(float)); - B = (float *)malloc(N * M * sizeof(float)); + h_A = (float *)malloc(M * N * sizeof(float)); + h_B = (float *)malloc(M * N * sizeof(float)); // allocate device memory - cudaMalloc(&d_A, N * M * sizeof(float)); - cudaMalloc(&d_B, N * M * sizeof(float)); + cudaMalloc(&d_A, M * N * sizeof(float)); + cudaMalloc(&d_B, M * N * sizeof(float)); // initialize data - for (int i = 0; i < N * M; ++i) { - A[i] = i - 3; - B[i] = i; + for (int i = 0; i < M * N; ++i) { + h_A[i] = i - 3; + h_B[i] = i; } // copy host data to device - cudaMemcpy(d_A, A, N * M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, N * M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, M * N * sizeof(float), cudaMemcpyHostToDevice); // launch kernel instance dim3 blockDim(16, 16); - dim3 gridDim((M + blockDim.x - 
1)/blockDim.x, (N + blockDim.y - 1)/blockDim.y); - matScale<<<gridDim, blockDim>>>(d_A, d_B, scale, N, M); + dim3 gridDim((N + blockDim.x - 1)/blockDim.x, (M + blockDim.y - 1)/blockDim.y); + matScale<<<gridDim, blockDim>>>(d_A, d_B, scale, M, N); // copy result back to host - cudaMemcpy(A, d_A, N * M * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(B, d_B, N * M * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(A, d_A, M * N * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(B, d_B, M * N * sizeof(float), cudaMemcpyDeviceToHost); // display results printf("Matrix A: \n");