diff --git a/matMul.cu b/matMul.cu
index 29ef5a2..3c04173 100644
--- a/matMul.cu
+++ b/matMul.cu
@@ -1,89 +1,89 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-__global__ void matMul(float *d_A, float *d_B, float *d_C, int N, int M, int K) {
+__global__ void matMul(float *d_A, float *d_B, float *d_C, int M, int N, int P) {
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if(row < N && col < K) {
+    if(row < M && col < P) {
         float sum = 0.0f;
 
         // compute the dot product for each row of A and col of B
-        for(int i = 0; i < M; ++i) {
-            sum += d_A[row * M + i] * d_B[i * K + col];
+        for(int i = 0; i < N; ++i) {
+            sum += d_A[row * N + i] * d_B[i * P + col];
         }
-        d_C[row * K + col] = sum;
+        d_C[row * P + col] = sum;
     }
 }
 
 int main() {
     // variable initialization
-    int N = 2;
-    int M = 3;
-    int K = 5;
+    int M = 2;
+    int N = 3;
+    int P = 5;
 
     float *h_A, *h_B, *h_C;
     float *d_A, *d_B, *d_C;
 
     // memory allocation
-    h_A = (float *)malloc(N * M * sizeof(float));
-    h_B = (float *)malloc(M * K * sizeof(float));
-    h_C = (float *)malloc(N * K * sizeof(float));
+    h_A = (float *)malloc(M * N * sizeof(float));
+    h_B = (float *)malloc(N * P * sizeof(float));
+    h_C = (float *)malloc(M * P * sizeof(float));
 
-    cudaMalloc((void**)&d_A, N * M * sizeof(float));
-    cudaMalloc((void**)&d_B, M * K * sizeof(float));
-    cudaMalloc((void**)&d_C, N * K * sizeof(float));
+    cudaMalloc((void**)&d_A, M * N * sizeof(float));
+    cudaMalloc((void**)&d_B, N * P * sizeof(float));
+    cudaMalloc((void**)&d_C, M * P * sizeof(float));
 
     // initial data
-    for(int i = 0; i < N; ++i) {
-        for(int j = 0; j < M; ++j) {
-            h_A[i * M + j] = (float) (rand() % 10 + 1);
+    for(int i = 0; i < M; ++i) {
+        for(int j = 0; j < N; ++j) {
+            h_A[i * N + j] = (float) (rand() % 10 + 1);
         }
     }
 
-    for(int i = 0; i < M; ++i) {
-        for(int j = 0; j < K; ++j) {
-            h_B[i * K + j] = (float) (rand() % 10 + 1);
+    for(int i = 0; i < N; ++i) {
+        for(int j = 0; j < P; ++j) {
+            h_B[i * P + j] = (float) (rand() % 10 + 1);
         }
     }
 
     // copy CPU data to GPU memory blocks
-    cudaMemcpy(d_A, h_A, N * M * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_B, h_B, M * K * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, h_B, N * P * sizeof(float), cudaMemcpyHostToDevice);
 
     // set grid and block dimensions
     dim3 blockDim(16, 16);
-    dim3 gridDim((K + blockDim.x - 1)/blockDim.x, (N + blockDim.y - 1)/blockDim.y);
+    dim3 gridDim((P + blockDim.x - 1)/blockDim.x, (M + blockDim.y - 1)/blockDim.y);
 
     // run matmul
-    matMul<<<gridDim, blockDim>>>(d_A, d_B, d_C, N, M, K);
+    matMul<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, P);
 
     // transfer data from device to host
-    cudaMemcpy(h_C, d_C, N * K * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_C, d_C, M * P * sizeof(float), cudaMemcpyDeviceToHost);
 
     // print statements
     printf("Matrix A:\n--------\n");
-    for(int i = 0; i < N; ++i) {
-        for(int j = 0; j < M; ++j) {
-            printf("%f ", h_A[i * M + j]);
+    for(int i = 0; i < M; ++i) {
+        for(int j = 0; j < N; ++j) {
+            printf("%f ", h_A[i * N + j]);
         }
         printf("\n");
     }
     printf("--------\n");
 
     printf("Matrix B:\n--------\n");
-    for(int i = 0; i < M; ++i) {
-        for(int j = 0; j < K; ++j) {
-            printf("%f ", h_B[i * K + j]);
+    for(int i = 0; i < N; ++i) {
+        for(int j = 0; j < P; ++j) {
+            printf("%f ", h_B[i * P + j]);
         }
         printf("\n");
     }
     printf("--------\n");
 
     printf("Matrix C:\n--------\n");
-    for(int i = 0; i < N; ++i) {
-        for(int j = 0; j < K; ++j) {
-            printf("%f ", h_C[i * K + j]);
+    for(int i = 0; i < M; ++i) {
+        for(int j = 0; j < P; ++j) {
+            printf("%f ", h_C[i * P + j]);
         }
         printf("\n");
     }
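
A note on the renamed dimensions: after this change A is M x N, B is N x P, and C is M x P, all stored row-major, which is why each index now pairs a row stride of N (for A) or P (for B and C) with the matching loop bound. The sketch below is not part of the patch; matMulRef is a hypothetical helper that recomputes the product on the CPU under the same convention, and comparing its output against h_C after the copy back to the host is one way to sanity-check the kernel.

// Hypothetical CPU reference (not in matMul.cu): same row-major
// M x N times N x P = M x P convention as the kernel above.
void matMulRef(const float *A, const float *B, float *C, int M, int N, int P) {
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < P; ++col) {
            float sum = 0.0f;
            for (int i = 0; i < N; ++i) {
                sum += A[row * N + i] * B[i * P + col];
            }
            C[row * P + col] = sum;
        }
    }
}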