CUDA编程之矩阵乘法
生活随笔
收集整理的這篇文章主要介紹了
CUDA编程之矩阵乘法
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
cuda編程矩陣乘法
剛學 所以來個簡單例子
a (n × n) × b (n × n)
a 的一行 × b 的一列(即結果矩陣的一個元素)由一個線程計算
代碼 matMul.cu
//
// Created by smallflyfly on 2021/6/3.
//
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cuda_code.h"  // NOTE(review): CHECK() is presumably defined here - confirm

/*
 * Naive matrix multiplication: c = a * b, every matrix stored row-major.
 *
 *   a : height x width  (input)
 *   b : width  x width  (input)
 *   c : height x width  (output)
 *
 * Launch layout: 2D grid of 2D blocks covering at least width threads in x
 * and height threads in y. The thread with global coordinates (tx, ty)
 * computes the single element c[ty][tx] as the dot product of row ty of a
 * with column tx of b. Threads outside the matrix return immediately.
 * No shared memory is used.
 */
__global__ void matMultiply(float *a, float *b, float *c, int width, int height) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;  // output column
    int ty = threadIdx.y + blockIdx.y * blockDim.y;  // output row
    if (tx >= width || ty >= height) return;

    float mulValue = 0.0f;
    for (int i = 0; i < width; i++) {
        // Row ty of a times COLUMN tx of b. The previous index
        // b[tx * height + i] walked along row tx of b, i.e. computed a * b^T.
        mulValue += a[ty * width + i] * b[i * width + tx];
    }
    c[ty * width + tx] = mulValue;
}

/*
 * Host driver: fills a with 1.0 and b with 2.0, multiplies them on the
 * device and prints the result matrix row by row.
 */
int main() {
    int width = 1 << 2;
    int height = 1 << 2;

    // a and c are height x width; b is width x width (see kernel contract).
    size_t nElems = (size_t)width * height;
    size_t bElems = (size_t)width * width;
    size_t nBytes = nElems * sizeof(float);
    size_t bBytes = bElems * sizeof(float);

    float *ah = (float*)malloc(nBytes);
    float *bh = (float*)malloc(bBytes);
    float *ch = (float*)malloc(nBytes);
    if (ah == NULL || bh == NULL || ch == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(ah);
        free(bh);
        free(ch);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < nElems; i++) {
        ah[i] = 1.0f;
    }
    for (size_t i = 0; i < bElems; i++) {
        bh[i] = 2.0f;
    }

    float *ad, *bd, *cd;
    // malloc device
    CHECK(cudaMalloc((void**)&ad, nBytes));
    CHECK(cudaMalloc((void**)&bd, bBytes));
    CHECK(cudaMalloc((void**)&cd, nBytes));

    // copy host data to device
    // Blocking copies: the host buffers are pageable, so the async variants
    // would give no overlap and only obscure the ordering.
    CHECK(cudaMemcpy(ad, ah, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(bd, bh, bBytes, cudaMemcpyHostToDevice));

    // block grid
    dim3 blockSize(2, 4);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                  (height + blockSize.y - 1) / blockSize.y);

    // run kernel
    matMultiply<<<gridSize, blockSize>>>(ad, bd, cd, width, height);
    CHECK(cudaGetLastError());       // reports launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // reports errors raised while the kernel ran

    // copy result from device to host
    // Blocking copy so ch is fully written before it is read below.
    CHECK(cudaMemcpy(ch, cd, nBytes, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < nElems; i++) {
        printf("%.2f ", ch[i]);
        if ((i + 1) % width == 0) printf("\n");
    }

    // free memory
    CHECK(cudaFree(ad));
    CHECK(cudaFree(bd));
    CHECK(cudaFree(cd));
    free(ah);
    free(bh);
    free(ch);
    CHECK(cudaDeviceReset());
    return 0;
}
https://github.com/Smallflyfly/cuda_basic.git
總結
以上是生活随笔為你收集整理的CUDA编程之矩阵乘法的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Markdown MarkdownPa
- 下一篇: iOS给图片打水印,并将打过水印的图片生