博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
CUDA 矩阵相乘完整代码
阅读量:4960 次
发布时间:2019-06-12

本文共 13753 字,大约阅读时间需要 45 分钟。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>
#include <time.h>
#include "cublas_v2.h"

#define BLOCK_SIZE 16

cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

__global__ void multiKernel(float *c, float *a, float*b, unsigned int aW, unsigned int bW)

{
  //saved in register
  int xBlock = blockIdx.x;
  int yBlock = blockIdx.y;
  int xThread = threadIdx.x;
  int yThread = threadIdx.y;

  unsigned int aWidth = aW;

  unsigned int bWidth = bW;

  float Cvalue= 0;

  for(int i=0; i< aWidth/BLOCK_SIZE; ++i)

  {
    __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

    aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];

    bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

    __syncthreads();

    for(int e=0; e<BLOCK_SIZE; ++e)

    {
      Cvalue += aSub[yThread][e]*bSub[e][xThread];
    }
    __syncthreads();
  }

  int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;

  c[cIndex] = Cvalue;
}

__global__ void multiKernel_NoLoop(float *c, float *a, float*b, unsigned int aW, unsigned int bW)

{
  int xBlock = blockIdx.x;
  int yBlock = blockIdx.y;
  int xThread = threadIdx.x;
  int yThread = threadIdx.y;

  unsigned int aWidth = aW;

  unsigned int bWidth = bW;

  float Cvalue= 0;

  for(int i=0; i< aWidth/BLOCK_SIZE; ++i)

  {
    __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];

    aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];

    bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];

    __syncthreads();

    Cvalue += aSub[yThread][0]*bSub[0][xThread] + aSub[yThread][1]*bSub[1][xThread] + \

          aSub[yThread][2]*bSub[2][xThread] + aSub[yThread][3]*bSub[3][xThread] + \
          aSub[yThread][4]*bSub[4][xThread] + aSub[yThread][5]*bSub[5][xThread] + \
          aSub[yThread][6]*bSub[6][xThread] + aSub[yThread][7]*bSub[7][xThread] + \
          aSub[yThread][8]*bSub[8][xThread] + aSub[yThread][9]*bSub[9][xThread] + \
          aSub[yThread][10]*bSub[10][xThread] + aSub[yThread][11]*bSub[11][xThread] + \
          aSub[yThread][12]*bSub[12][xThread] + aSub[yThread][13]*bSub[13][xThread] + \
          aSub[yThread][14]*bSub[14][xThread] + aSub[yThread][15]*bSub[15][xThread] ;
    __syncthreads();
  }

  int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;

  c[cIndex] = Cvalue;
}

cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);

int main()

{
  const unsigned int aH = 320;
  const unsigned int aW = 320;
  const unsigned int bW = 320;
  const unsigned int bH = aW;
  const unsigned int cH = aH;
  const unsigned int cW = bW;
  float *cpu_a, *cpu_b,*cpu_c;

  cpu_a = (float*)malloc(aH*aW*sizeof(float));

  cpu_b = (float*)malloc(bH*bW*sizeof(float));
  cpu_c = (float*)malloc(cH*cW*sizeof(float));

  for(int y=0; y<aH; ++y)

  {
    for(int x =0; x<aW; ++x)
    {
      int index = y*aW + x;
      cpu_a[index] = (float)(x<y?x:y);
    }
  }

  for(int y=0; y<bH; ++y)

  {
    for(int x =0; x<bW; ++x)
    {
      int index = y*bW + x;
      cpu_b[index] = (float)(x<y?x:y);
    }
  }
  cudaError_t cudaStatus = multiCuda(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "multiCuda failed!");
  return 1;
  }
  /*
  for(int y=0; y<cH; ++y)
  {
    for(int x =0; x<cW; ++x)
    {
      if(x==1&&y==1)
      {
        int index = y*cW + x;
        printf("c(1,1)=%.1f\n",cpu_c[index]);
      }
    }
  }
*/
  cudaStatus = multiWithcuBlase(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "multiCuda failed!");
  return 1;
  }
  /*
  for(int y=0; y<cH; ++y)
  {
    for(int x =0; x<cW; ++x)
    {
      if(x==1&&y==1)
      {
        int index = y*cW + x;
        printf("c(1,1)=%.1f\n",cpu_c[index]);
      }
    }
  }
*/
  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  cudaStatus = cudaDeviceReset();
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceReset failed!");
  return 1;
  }

  float start,end;

  start = clock();
  multiCPU(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
  end = clock();
  printf("CPU runtime is %f msec\n ",end - start);

  free(cpu_a);

  free(cpu_b);
  free(cpu_c);

  getchar();

  return 0;
}

cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)

{
  float *gpu_a = 0;
  float *gpu_b = 0;
  float *gpu_c = 0;
  cudaError_t cudaStatus;

  cudaStatus = cudaSetDevice(0);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  goto Error;
  }
  size_t size_a = aH*aW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_b = bH*bW*sizeof(float);

  cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_c = aH*bW*sizeof(float);

  cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);

  dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);

  cudaEvent_t start;

  cudaStatus = cudaEventCreate(&start);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop;
  cudaStatus = cudaEventCreate(&stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  multiKernel<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

  cudaStatus = cudaEventRecord(stop, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal = 0.0f;

  cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("HS__ GPU runtime is %f msec\n",msecTotal);

/*******************************************************/

  cudaEvent_t start1;
  cudaStatus = cudaEventCreate(&start1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop1;
  cudaStatus = cudaEventCreate(&stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start1, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  multiKernel_NoLoop<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);

  cudaStatus = cudaEventRecord(stop1, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal1 = 0.0f;

  cudaStatus = cudaEventElapsedTime(&msecTotal1, start1, stop1);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("HS__NoLoop GPU runtime is %f msec\n",msecTotal1);

/***********************************************************/

  cudaStatus = cudaGetLastError();

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  // cudaDeviceSynchronize waits for the kernel to finish, and returns
  // any errors encountered during the launch.
  cudaStatus = cudaDeviceSynchronize();
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
  goto Error;
  }

  cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

Error:

  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);
  return cudaStatus;
}

void inline checkError(cublasStatus_t status, const char *msg)

{
  if (status != CUBLAS_STATUS_SUCCESS)
  {
  printf("%s", msg);
  exit(EXIT_FAILURE);
  }
}

cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)

{
  float *gpu_a = 0;
  float *gpu_b = 0;
  float *gpu_c = 0;
  cudaError_t cudaStatus;

  cudaStatus = cudaSetDevice(0);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
  goto Error;
  }
  size_t size_a = aH*aW*sizeof(float);
  cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_b = bH*bW*sizeof(float);

  cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  size_t size_c = aH*bW*sizeof(float);

  cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }

  cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }
  dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
  dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);
  //printf("Computing result using CUBLAS...\n");

  cublasHandle_t handle;

  cublasStatus_t ret;
  ret = cublasCreate(&handle);
  if (ret != CUBLAS_STATUS_SUCCESS)
  {
  printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
  goto Error;
  }
  const float alpha = 1.0f;
  const float beta = 0.0f;

  cudaEvent_t start;

  cudaStatus = cudaEventCreate(&start);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaEvent_t stop;
  cudaStatus = cudaEventCreate(&stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  cudaStatus = cudaEventRecord(start, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aH, bW, aW, &alpha, gpu_a, aH, gpu_b, bH, &beta, gpu_c, aH);

  cudaStatus = cudaEventRecord(stop, NULL);

  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  cudaStatus = cudaEventSynchronize(stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }

  float msecTotal = 0.0f;

  cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
  if (cudaStatus != cudaSuccess){
  fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
  goto Error;
  }
  printf("cuBlas__ GPU runtime is %f msec\n",msecTotal);

  cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);

  if (cudaStatus != cudaSuccess) {
  fprintf(stderr, "cudaMemcpy failed!");
  goto Error;
  }
  checkError(cublasDestroy(handle), "cublasDestroy() error!\n");

Error:

  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);
  return cudaStatus;

}

void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
  for(int y=0; y<aH; ++y)
  {
    for(int x =0; x<bW; ++x)
    {
      int index = y*bW + x;
      c[index] = 0.0f;
      for(int i=0; i<aW; ++i)
      {
        c[index] += a[y*aW+i]*b[i*bW + x];
      }
    }
  }
}

转载于:https://www.cnblogs.com/huangshan/p/3916918.html

你可能感兴趣的文章
C++内联函数
查看>>
LNMP 1.1 php编译安装
查看>>
jw player参数设定(转)
查看>>
mysql 不常用备忘
查看>>
Mybatis自动化生成代码
查看>>
asp.net 动态添加多附件上传.
查看>>
sscanf()函数
查看>>
WEEX学习网站
查看>>
uDig介绍
查看>>
后台调用外部程序的完美实现
查看>>
python随机数random模块
查看>>
03-body标签中相关标签
查看>>
JavaScript:对Object对象的一些常用操作总结
查看>>
node assert.equal()
查看>>
buf.readUIntBE()
查看>>
Beta 冲刺(1/7)
查看>>
【luogu2747】 [USACO5.4]周游加拿大Canada Tour[动态规划]
查看>>
ubuntu安装mysql 时未提示输入密码
查看>>
L1-006 连续因子
查看>>
RabbitMQ入门(4)——路由(Routing)
查看>>