#include <stdio.h>
#include <stdlib.h>#include <time.h>#include "cublas_v2.h"#define BLOCK_SIZE 16
cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
__global__ void multiKernel(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{ //saved in register int xBlock = blockIdx.x; int yBlock = blockIdx.y; int xThread = threadIdx.x; int yThread = threadIdx.y;unsigned int aWidth = aW;
unsigned int bWidth = bW;float Cvalue= 0;
for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
{ __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE]; __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];__syncthreads();
for(int e=0; e<BLOCK_SIZE; ++e)
{ Cvalue += aSub[yThread][e]*bSub[e][xThread]; } __syncthreads(); }int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
c[cIndex] = Cvalue; }__global__ void multiKernel_NoLoop(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{ int xBlock = blockIdx.x; int yBlock = blockIdx.y; int xThread = threadIdx.x; int yThread = threadIdx.y;unsigned int aWidth = aW;
unsigned int bWidth = bW;float Cvalue= 0;
for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
{ __shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE]; __shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];__syncthreads();
Cvalue += aSub[yThread][0]*bSub[0][xThread] + aSub[yThread][1]*bSub[1][xThread] + \
aSub[yThread][2]*bSub[2][xThread] + aSub[yThread][3]*bSub[3][xThread] + \ aSub[yThread][4]*bSub[4][xThread] + aSub[yThread][5]*bSub[5][xThread] + \ aSub[yThread][6]*bSub[6][xThread] + aSub[yThread][7]*bSub[7][xThread] + \ aSub[yThread][8]*bSub[8][xThread] + aSub[yThread][9]*bSub[9][xThread] + \ aSub[yThread][10]*bSub[10][xThread] + aSub[yThread][11]*bSub[11][xThread] + \ aSub[yThread][12]*bSub[12][xThread] + aSub[yThread][13]*bSub[13][xThread] + \ aSub[yThread][14]*bSub[14][xThread] + aSub[yThread][15]*bSub[15][xThread] ; __syncthreads(); }int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
c[cIndex] = Cvalue; }cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
int main()
{ const unsigned int aH = 320; const unsigned int aW = 320; const unsigned int bW = 320; const unsigned int bH = aW; const unsigned int cH = aH; const unsigned int cW = bW; float *cpu_a, *cpu_b,*cpu_c;cpu_a = (float*)malloc(aH*aW*sizeof(float));
cpu_b = (float*)malloc(bH*bW*sizeof(float)); cpu_c = (float*)malloc(cH*cW*sizeof(float));for(int y=0; y<aH; ++y)
{ for(int x =0; x<aW; ++x) { int index = y*aW + x; cpu_a[index] = (float)(x<y?x:y); } }for(int y=0; y<bH; ++y)
{ for(int x =0; x<bW; ++x) { int index = y*bW + x; cpu_b[index] = (float)(x<y?x:y); } } cudaError_t cudaStatus = multiCuda(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW); if (cudaStatus != cudaSuccess) { fprintf(stderr, "multiCuda failed!"); return 1; } /* for(int y=0; y<cH; ++y) { for(int x =0; x<cW; ++x) { if(x==1&&y==1) { int index = y*cW + x; printf("c(1,1)=%.1f\n",cpu_c[index]); } } } */ cudaStatus = multiWithcuBlase(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW); if (cudaStatus != cudaSuccess) { fprintf(stderr, "multiCuda failed!"); return 1; } /* for(int y=0; y<cH; ++y) { for(int x =0; x<cW; ++x) { if(x==1&&y==1) { int index = y*cW + x; printf("c(1,1)=%.1f\n",cpu_c[index]); } } } */ // cudaDeviceReset must be called before exiting in order for profiling and // tracing tools such as Nsight and Visual Profiler to show complete traces. cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceReset failed!"); return 1; }float start,end;
start = clock(); multiCPU(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW); end = clock(); printf("CPU runtime is %f msec\n ",end - start);free(cpu_a);
free(cpu_b); free(cpu_c);getchar();
return 0;}cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{ float *gpu_a = 0; float *gpu_b = 0; float *gpu_c = 0; cudaError_t cudaStatus;cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } size_t size_a = aH*aW*sizeof(float); cudaStatus = cudaMalloc((void**)&gpu_a, size_a); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }size_t size_b = bH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_b, size_b); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }size_t size_c = aH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_c,size_c); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; }cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; }dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);cudaEvent_t start;
cudaStatus = cudaEventCreate(&start); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaEvent_t stop; cudaStatus = cudaEventCreate(&stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }cudaStatus = cudaEventRecord(start, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }multiKernel<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);
cudaStatus = cudaEventRecord(stop, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaStatus = cudaEventSynchronize(stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }float msecTotal = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } printf("HS__ GPU runtime is %f msec\n",msecTotal);/*******************************************************/
cudaEvent_t start1; cudaStatus = cudaEventCreate(&start1); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaEvent_t stop1; cudaStatus = cudaEventCreate(&stop1); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }cudaStatus = cudaEventRecord(start1, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }multiKernel_NoLoop<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);
cudaStatus = cudaEventRecord(stop1, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaStatus = cudaEventSynchronize(stop1); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }float msecTotal1 = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal1, start1, stop1); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } printf("HS__NoLoop GPU runtime is %f msec\n",msecTotal1);/***********************************************************/
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) { fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; } // cudaDeviceSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); goto Error; }cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; }Error:
cudaFree(gpu_a); cudaFree(gpu_b); cudaFree(gpu_c); return cudaStatus;}void inline checkError(cublasStatus_t status, const char *msg)
{ if (status != CUBLAS_STATUS_SUCCESS) { printf("%s", msg); exit(EXIT_FAILURE); }}cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{ float *gpu_a = 0; float *gpu_b = 0; float *gpu_c = 0; cudaError_t cudaStatus;cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } size_t size_a = aH*aW*sizeof(float); cudaStatus = cudaMalloc((void**)&gpu_a, size_a); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }size_t size_b = bH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_b, size_b); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }size_t size_c = aH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_c,size_c); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; }cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; }cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } dim3 blocks(BLOCK_SIZE,BLOCK_SIZE); dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE); //printf("Computing result using CUBLAS...\n");cublasHandle_t handle;
cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); goto Error; } const float alpha = 1.0f; const float beta = 0.0f;cudaEvent_t start;
cudaStatus = cudaEventCreate(&start); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaEvent_t stop; cudaStatus = cudaEventCreate(&stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }cudaStatus = cudaEventRecord(start, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aH, bW, aW, &alpha, gpu_a, aH, gpu_b, bH, &beta, gpu_c, aH);
cudaStatus = cudaEventRecord(stop, NULL);
if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } cudaStatus = cudaEventSynchronize(stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; }float msecTotal = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop); if (cudaStatus != cudaSuccess){ fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus)); goto Error; } printf("cuBlas__ GPU runtime is %f msec\n",msecTotal);cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } checkError(cublasDestroy(handle), "cublasDestroy() error!\n");Error:
cudaFree(gpu_a); cudaFree(gpu_b); cudaFree(gpu_c); return cudaStatus;}
void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW){ for(int y=0; y<aH; ++y) { for(int x =0; x<bW; ++x) { int index = y*bW + x; c[index] = 0.0f; for(int i=0; i<aW; ++i) { c[index] += a[y*aW+i]*b[i*bW + x]; } } }}