您好,欢迎来到叨叨游戏网。
搜索
您的当前位置:首页cuda c语言编程入门,CUDA C初学者编程 (VS2017)

cuda c语言编程入门,CUDA C初学者编程 (VS2017)

来源:叨叨游戏网

打开VS2017后,文件——新建——项目;

找到NVIDIA,有的人说自己的VS中没看见NVIDIA这一项啊,那是因为你没有安装CUDA,或者你在安装CUDA的时候参照某教程将Visual Studio Integration 取消勾选安装,其实后来再重新装上就行。

创建一个文件夹名为 cuda_test 的项目,然后我们发现其实里面已经有 .cu 文件了,如下图所示。

然后,我们像C语言一样生成编译文件,最终结果如下:

接下来,我们修改代码如下,并运行以下代码。

// Minimal CUDA "hello world": the kernel adds an integer offset to each
// character of the host string, turning "Hello " into "World!".
#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

const int N = 16;
const int blocksize = 16;

// Kernel: one thread per character; thread i adds b[i] to a[i].
__global__ void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = { 15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    char *ad;  // device copy of a
    int *bd;   // device copy of b
    const int csize = N * sizeof(char);
    const int isize = N * sizeof(int);

    printf("%s", a);

    cudaMalloc((void**)&ad, csize);
    cudaMalloc((void**)&bd, isize);
    cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice);

    dim3 dimBlock(blocksize, 1);
    dim3 dimGrid(1, 1);
    // Fixed: the launch configuration was lost in the original text
    // ("hello << > >"); a kernel launch needs <<<grid, block>>>.
    hello<<<dimGrid, dimBlock>>>(ad, bd);

    // Blocking copy: also synchronizes with the kernel on the default stream.
    cudaMemcpy(a, ad, csize, cudaMemcpyDeviceToHost);
    cudaFree(ad);
    cudaFree(bd);

    printf("%s\n", a);
    return EXIT_SUCCESS;
}

编译运行成功,最终结果如下:

运行以下代码,查看电脑设备。

/*********************************************/

// Device-query demo: lists the properties of the first CUDA device with
// compute capability >= 1.x.
#include <cstdio>
#include "cuda_runtime.h"

// Print the commonly inspected fields of a cudaDeviceProp.
// Fixed: totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem and
// textureAlignment are size_t, so they are printed with %zu (the original
// %d is a format/argument mismatch — undefined behaviour on 64-bit builds).
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate); // GPU clock frequency, in kHz
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Find and select the first CUDA device with compute capability >= 1.x,
// printing its properties. Returns false when no usable device exists.
bool InitCUDA()
{
    // used to count the device numbers
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the first device with compute capability >= 1.x
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }

    // no suitable device found
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // make the chosen device current
    cudaSetDevice(i);
    return true;
}

int main(int argc, char const *argv[])
{
    if (InitCUDA()) {
        printf("CUDA initialized.\n");
    }
    return 0;
}

程序运行结果:

查看当前系统上安装的GPU设备的详细参数

打开文件夹C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.2\1_Utilities

其中有一个名为deviceQuery的程序,可编译执行,即可打出当前系统上安装的GPU设备的详细参数。结果如下:

CUDA 初始化

/*
 * CUDA initialization demo: succeeds when at least one device with
 * compute capability >= 1.x is present.
 */
#include "cuda_runtime.h"
#include <cstdio>

// Find and select the first CUDA device with compute capability >= 1.x.
// Returns false (with a message on stderr) when no usable device exists.
bool InitCUDA(){
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

int main(){
    if (!InitCUDA()) {
        return 0;
    } // CUDA initialization failed: nothing more to do
    printf("CUDA initialized.\n");
    return 0;
}

运行得到结果:

说明系统上有支持CUDA的装置。

完整的向量点积CUDA程序

/*
 * Dot product of two integer vectors on the GPU:
 * a = [a1,a2,...,an], b = [b1,b2,...,bn]
 * a*b = a1*b1 + a2*b2 + ... + an*bn
 * Launched as one block of N threads: thread i writes one product into
 * shared memory and thread 0 reduces the partial products.
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cstdio>
#include <cstdlib>

#define N 10

// Kernel: temp[] lives in shared memory; __syncthreads() guarantees every
// product is written before thread 0 reads them for the reduction.
__global__ void Dot(int *a, int *b, int *c)
{
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    if (0 == threadIdx.x)
    {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
        printf("sum Calculated on Device:%d\n", *c);
    }
}

// Fill a[0..n) with pseudo-random digits in [0, 9].
void random_ints(int *a, int n)
{
    for (int i = 0; i < n; i++)
        *(a + i) = rand() % 10;
}

int main()
{
    int *a, *b, *c;        // host buffers
    int *d_a, *d_b, *d_c;  // device buffers
    int size = N * sizeof(int);

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, sizeof(int));

    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(sizeof(int));

    printf("Array a[N]:\n");
    for (int i = 0; i < N; i++) printf("%d ", a[i]);
    printf("\n");
    printf("Array b[N]:\n");
    for (int i = 0; i < N; i++) printf("%d ", b[i]);
    printf("\n\n");

    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    Dot<<<1, N>>>(d_a, d_b, d_c); // single block, N threads

    // Blocking copy: synchronizes with the kernel and flushes the
    // device-side printf.
    cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost);

    // CPU reference result for comparison.
    int sumHost = 0;
    for (int i = 0; i < N; i++)
        sumHost += a[i] * b[i];
    printf("sum Calculated on Host=%d\n", sumHost);
    printf("Device to Host: a*b=%d\n", *c);

    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

运行结果如下:

计算随机数向量立方和的CUDA程序

/*
 * Sum of cubes of a vector of random digits, computed on the GPU with a
 * single thread, then verified against a CPU loop.
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cstdio>
#include <cstdlib>

#define DATA_SIZE 1048576 // number of input elements

int data[DATA_SIZE];

// Fill number[0..size) with random digits in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Find and select the first CUDA device with compute capability >= 1.x.
bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: one single thread accumulates the whole sum of cubes.
// Deliberately naive — the later versions in this article parallelize it.
__global__ static void sumOfcubes(int *num, int* result)
{
    int sum = 0;
    int i;
    for (i = 0; i < DATA_SIZE; i++) {
        sum += num[i] * num[i] * num[i];
    }
    *result = sum;
}

int main()
{
    if (!InitCUDA()) {
        return 0;
    }
    GenerateNumbers(data, DATA_SIZE); // generate random input

    int* gpudata, *result; // device buffers
    int gpusum = 0;        // host copy of the GPU result

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    // Copy the input data from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

    sumOfcubes<<<1, 1, 0>>>(gpudata, result); // one block, one thread

    // Blocking copy of the result back to the host.
    cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    printf("GPUsum: %d \n", gpusum);

    // CPU reference computation.
    int sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    return 0;
}

运行结果如下:

计算随机数向量立方和的CUDA程序(加入统计时间的函数)

/*
 * Sum of cubes on the GPU, single thread, with a device-side clock()
 * measurement of the kernel body. Elapsed ticks / clockRate (kHz) gives
 * milliseconds.
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <ctime>    // clock_t
#include <cstdio>
#include <cstdlib>  // rand

#define DATA_SIZE 1048576 // number of input elements

int data[DATA_SIZE];

// Fill number[0..size) with random digits in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Find and select the first CUDA device with compute capability >= 1.x.
bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: a single thread computes the sum of cubes and reports the
// elapsed GPU clock ticks in *time.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time)
{
    int sum = 0;
    int i;
    clock_t start = clock();
    for (i = 0; i < DATA_SIZE; i++) {
        sum += num[i] * num[i] * num[i];
    }
    *result = sum;
    *time = clock() - start;
}

int main()
{
    if (!InitCUDA()) {
        return 0;
    }
    GenerateNumbers(data, DATA_SIZE); // generate random input

    int* gpudata, *result; // device buffers
    int gpusum = 0;        // host copy of the GPU sum
    clock_t* time;         // device buffer for the tick count
    clock_t time_used;     // host copy of the tick count

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    cudaMalloc((void**)&time, sizeof(clock_t));
    // Copy the input data from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

    sumOfcubes<<<1, 1, 0>>>(gpudata, result, time); // one block, one thread

    cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Fixed: clock_t is wider than int on common platforms; cast to long
    // and print with %ld (the original %d mismatched the argument type).
    printf("GPUsum: %d——time:%ld\n", gpusum, (long)time_used);

    // CPU reference computation.
    int sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    return 0;
}

结果如下:

GTX 1050上运行核函数部分的时间约为:

1992949251/ 1493000 = 1334.8 mS 其中1493000是GPU频率。

重要!!!

也就是clock() 统计出来的 (end - start) / GPU频率 = 毫秒级单位的时间,其中利用 cudaGetDeviceProperties 得到的 clockRate 属性获取的GPU频率单位是千赫兹,比如我的GTX1050 的频率是 1493000千赫兹。

进一步优化上述求立方和的程序(利用单block多thread)

/*
 * Sum of cubes on the GPU with one block of THREAD_NUM threads.
 * Each thread sums a contiguous slice of the data; the per-thread partial
 * sums are reduced on the CPU.
 */
#include "cuda_runtime.h"
#include <ctime>    // clock_t
#include <cstdio>
#include <cstdlib>  // rand

#define DATA_SIZE 1048576 // number of input elements
#define THREAD_NUM 256    // threads per block

int data[DATA_SIZE];

// Fill number[0..size) with random digits in [0, 9].
void GenerateNumbers(int *number, int size){
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Find and select the first CUDA device with compute capability >= 1.x.
bool InitCUDA(){
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: thread tid sums the contiguous slice [tid*size, (tid+1)*size)
// and stores its partial sum in result[tid]. Thread 0 records the elapsed
// GPU clock ticks in *time.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time){
    const int tid = threadIdx.x;
    const int size = DATA_SIZE / THREAD_NUM; // elements per thread
    int sum = 0;
    int i;
    clock_t start; // kernel start time, recorded by thread 0 only
    if (tid == 0) start = clock();
    for (i = tid * size; i < (tid + 1) * size; i++){
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    // Timing is done only by thread 0 (threadIdx.x == 0).
    if (tid == 0) *time = clock() - start;
}

int main(){
    if (!InitCUDA()) { return 0; } // CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input

    int* gpudata, *result;  // device buffers
    int gpusum[THREAD_NUM]; // per-thread partial sums copied back to host
    clock_t* time;          // device buffer for the tick count

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM);
    // Copy the input data from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

    // Launch: one block, THREAD_NUM threads, no dynamic shared memory.
    sumOfcubes<<<1, THREAD_NUM, 0>>>(gpudata, result, time);

    clock_t time_use;
    // Copy partial sums and the tick count back to the host.
    cudaMemcpy(gpusum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    int final_sum = 0; /* host-side reduction of the partial sums */
    for (int i = 0; i < THREAD_NUM; i++){
        final_sum += gpusum[i];
    }
    // Fixed: clock_t is wider than int on common platforms; cast to long
    // and print with %ld (the original %d mismatched the argument type).
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)time_use);

    // CPU reference computation.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}

结果如下:

然而,上述程序还能够优化,上述代码:

if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录

for (i = tid * size; i < (tid + 1) * size; i++){

sum += num[i] * num[i] * num[i];

}

result[tid] = sum;

//计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行

if (tid == 0) *time = clock() - start;

}

中核函数并不是连续存取的,读取数字完全是跳跃式的读取,这会非常影响内存的存取效率。因此我们下一步要将取数字的过程变成:thread 0 读取第一个数字,thread 1 读取第二个数字,以此类推......

程序部分修改为:

if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录

for (i = tid; i < DATA_SIZE; i+= THREAD_NUM){

sum += num[i] * num[i] * num[i];

}

result[tid] = sum;

//计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行

if (tid == 0) *time = clock() - start;

实验运行结果:

???问题来了,老师上课的时候明明说,这样的优化会使得程序运行时间缩短很多倍,但是为啥我的运行时间结果没啥大的改变呢。8746726/8365217=1.04,几乎就是没有改进嘛.....难道是程序出现了问题,于是,我就将两个程序放在服务器上运行了。结果显示 1103530/185263=5.96,有近6倍的提升,说明这样的优化程序是正确的,只是在VS2017的编译运行中没有体现出来,可这又是为啥?

经过一番查询资料,发现VS2017中我在编译运行的时候用的是debug模式,其实需要改成Release模式。

debug.png

Release.png

修改完编译器中的模式之后,发现结果果然不一样了,1060761/166149 = 6.38,说明优化后的程序有很大的提升,因此,得出一个结果,VS中的debug模式只能用来修改程序中的错误,但是不能说明真正的程序性能的好坏,谨记,要改为Release模式。

优化前.png

优化后.png

更进一步的优化,多block多thread

CUDA不仅提供了Thread,还提供了Grid和Block以及Share Memory这些非常重要的机制,每个block中Thread极限是1024,但是通过block和Grid,线程的数量还能成倍增长,甚至用几万个线程。

/*
 * Sum of cubes on the GPU with BLOCK_NUM blocks of THREAD_NUM threads
 * (32 * 256 = 8192 threads total). Each thread reads the input in an
 * interleaved (coalesced) pattern; partial sums are reduced on the CPU.
 * Each block records its own start/end clock ticks.
 */
#include "cuda_runtime.h"
#include <ctime>    // clock_t
#include <cstdio>
#include <cstdlib>  // rand

#define DATA_SIZE 1048576 // number of input elements
#define THREAD_NUM 256    // threads per block
#define BLOCK_NUM 32      // number of blocks

int data[DATA_SIZE];

// Fill number[0..size) with random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Find and select the first CUDA device with compute capability >= 1.x.
bool InitCUDA() {
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: global thread (bid, tid) sums elements with stride
// BLOCK_NUM * THREAD_NUM so that consecutive threads read consecutive
// elements (coalesced). Thread 0 of each block records the block's start
// tick in time[bid] and its end tick in time[bid + BLOCK_NUM].
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int sum = 0;
    int i;
    if (tid == 0) time[bid] = clock(); // per-block start time
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        sum += num[i] * num[i] * num[i];
    }
    result[bid * THREAD_NUM + tid] = sum; // one partial sum per thread
    if (tid == 0) time[bid + BLOCK_NUM] = clock(); // per-block end time
}

int main() {
    if (!InitCUDA()) { return 0; } // CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input

    int *gpudata, *result; // device buffers
    clock_t *time;         // device buffer: 2*BLOCK_NUM tick stamps
    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
    cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM*BLOCK_NUM);
    // Copy the input data from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

    // Fixed: the launch configuration was lost in the original text
    // ("sumOfcubes << > >"); the indexing above requires exactly
    // BLOCK_NUM blocks of THREAD_NUM threads.
    sumOfcubes<<<BLOCK_NUM, THREAD_NUM, 0>>>(gpudata, result, time);

    int gpusum[THREAD_NUM*BLOCK_NUM]; // partial sums copied back to host
    clock_t time_use[BLOCK_NUM * 2];
    cudaMemcpy(gpusum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    int final_sum = 0; /* host-side reduction of the partial sums */
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
        final_sum += gpusum[i];
    }

    // Total GPU time = latest end tick minus earliest start tick across
    // all blocks.
    clock_t min_start, max_end;
    min_start = time_use[0];
    max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

    // CPU reference computation.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}

会出现以下错误:

error.png

解决方案:release模式会优化很多东西,但是多重新编译几次,又会成功,真是搞笑,搞不懂这些编译器......... 算了,还是建议去服务器上运行吧,VS中实在有太多未知的错误了。

程序运行结果:

Release_threads256_blocks32.png

后来修改 THREAD_NUM = 1024 , BLOCK_NUM = 128 ,总共 1024 * 128 = 131072 个threads,性能并没有很大提升了。

#define THREAD_NUM 1024 //thread 数量

#define BLOCK_NUM 128 // block 数量

Release_threads1024_blocks128.png

为什么不用极限的1024个线程呢?

如果1024个线程全部用上,那样就是32*1024 = 32768 个线程,难道不是更好吗?从线程运行的原理来看,线程数量达到一定大小后,再一味地增加线程也不会取得性能的提升,反而有可能会让性能下降。线程数达到隐藏各种latency的程度后,之后线程数量的提升就没有太大的作用了。

程序中加和部分是在CPU上进行的,越多的线程意味着越多的结果,而这也意味着CPU上的运算压力会越来越大。

Thread 的同步

一个block内的所有的 thread 可以有共享的内存,也可以进行同步,因此,还可以让每个block 内所有 thread 将自己的计算结果相加起来。代码可以进一步修改:

/*
 * Sum of cubes on the GPU with BLOCK_NUM blocks of THREAD_NUM threads,
 * using shared memory so each block reduces its own partial sums; the CPU
 * only sums BLOCK_NUM per-block results.
 */
#include "cuda_runtime.h"
#include <ctime>    // clock_t
#include <cstdio>
#include <cstdlib>  // rand

#define DATA_SIZE 1048576 // number of input elements
#define THREAD_NUM 256    // threads per block
#define BLOCK_NUM 32      // number of blocks (32*256 = 8192 threads total)

int data[DATA_SIZE];

// Fill number[0..size) with random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int i = 0; i < size; i++) {
        number[i] = rand() % 10;
    }
}

// Find and select the first CUDA device with compute capability >= 1.x.
bool InitCUDA() {
    int count;
    cudaGetDeviceCount(&count); // number of CUDA-capable devices
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Kernel: every thread accumulates an interleaved (coalesced) slice into
// shared[tid]; after the barrier, thread 0 reduces the block's shared
// array and writes the block total to result[bid].
// Requires THREAD_NUM * sizeof(int) bytes of dynamic shared memory.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int i;
    if (tid == 0) time[bid] = clock(); // per-block start time
    shared[tid] = 0;
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i] * num[i];
    }
    __syncthreads(); // all partial sums written before thread 0 reduces
    if (tid == 0) {
        for (i = 1; i < THREAD_NUM; i++) {
            shared[0] += shared[i];
        }
        // Fixed: the block's sum must go to result[bid]; the original
        // wrote result[tid] (always index 0), so every block clobbered
        // slot 0 and gpusum[1..BLOCK_NUM-1] stayed uninitialized.
        result[bid] = shared[0];
    }
    if (tid == 0) time[bid + BLOCK_NUM] = clock(); // per-block end time
}

int main() {
    if (!InitCUDA()) { return 0; } // CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input

    int *gpudata, *result; // device buffers
    clock_t *time;         // device buffer: 2*BLOCK_NUM tick stamps
    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
    cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

    // Fixed: the launch configuration was lost in the original text
    // ("sumOfcubes << > >"). The kernel declares extern __shared__, so
    // the dynamic shared-memory size must be passed as the third launch
    // parameter.
    sumOfcubes<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time);

    int gpusum[BLOCK_NUM]; // per-block sums copied back to host
    clock_t time_use[BLOCK_NUM * 2];
    cudaMemcpy(gpusum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    int final_sum = 0; /* host-side reduction of the per-block sums */
    // Fixed: the loop header was truncated in the original text
    // ("for (int i = 0; i").
    for (int i = 0; i < BLOCK_NUM; i++) {
        final_sum += gpusum[i];
    }

    // Total GPU time = latest end tick minus earliest start tick across
    // all blocks.
    clock_t min_start, max_end;
    min_start = time_use[0];
    max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)(max_end - min_start));

    // CPU reference computation.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}

树状加法

#include "device_functions.h"

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "device_functions.h"

#include

#include

#include

#define BLOCK_DIM 16

/**
 * Computes the squared Euclidean distance matrix between the query points and the reference points.
 *
 * Expects a 2D launch of BLOCK_DIM x BLOCK_DIM threads per block; each block
 * produces one BLOCK_DIM x BLOCK_DIM tile of the distance matrix.
 *
 * @param ref          reference points stored in the global memory
 * @param ref_width    number of reference points
 * @param ref_pitch    pitch of the reference points array in number of columns
 * @param query        query points stored in the global memory
 * @param query_width  number of query points
 * @param query_pitch  pitch of the query points array in number of columns
 * @param height       dimension of points = height of texture `ref` and of the array `query`
 * @param dist         array containing the query_width x ref_width computed distances
 */
__global__ void compute_distances(float * ref,
int ref_width,
int ref_pitch,
float * query,
int query_width,
int query_pitch,
int height,
float * dist) {
// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
// Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step).
// NOTE(review): these are __shared__ scalars that every thread of the
// block writes with the same value below — benign, but plain per-thread
// variables would work too.
__shared__ int begin_A;
__shared__ int begin_B;
__shared__ int step_A;
__shared__ int step_B;
__shared__ int end_A;
// Thread index within the block
int tx = threadIdx.x;
int ty = threadIdx.y;
// Initialization of the SSD (sum of squared differences) for the current thread
float ssd = 0.f;
// Loop parameters: starting offsets and per-iteration strides of the tiles
begin_A = BLOCK_DIM * blockIdx.y;
begin_B = BLOCK_DIM * blockIdx.x;
step_A = BLOCK_DIM * ref_pitch;
step_B = BLOCK_DIM * query_pitch;
end_A = begin_A + (height - 1) * ref_pitch;
// Conditions
int cond0 = (begin_A + tx < ref_width); // used to write in shared memory
int cond1 = (begin_B + tx < query_width); // used to write in shared memory & to computations and to write in output array
int cond2 = (begin_A + ty < ref_width); // used to computations and to write in output matrix
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
if (a / ref_pitch + ty < height) {
shared_A[ty][tx] = (cond0) ? ref[a + ref_pitch * ty + tx] : 0;
shared_B[ty][tx] = (cond1) ? query[b + query_pitch * ty + tx] : 0;
}
else {
// Out-of-range rows are zero-padded so they do not contribute to the SSD
shared_A[ty][tx] = 0;
shared_B[ty][tx] = 0;
}
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
if (cond2 && cond1) {
for (int k = 0; k < BLOCK_DIM; ++k) {
float tmp = shared_A[k][ty] - shared_B[k][tx];
ssd += tmp * tmp;
}
}
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory; each thread writes one element
if (cond2 && cond1) {
dist[(begin_A + ty) * query_pitch + begin_B + tx] = ssd;
}
}

/**
 * Computes the squared Euclidean distance matrix between the query points
 * and the reference points, fetching the reference points through a 2D
 * texture object. One thread computes one (reference, query) distance.
 *
 * @param ref          reference points stored in the texture memory
 * @param ref_width    number of reference points
 * @param query        query points stored in the global memory
 * @param query_width  number of query points
 * @param query_pitch  pitch of the query points array in number of columns
 * @param height       dimension of points = height of texture `ref` and of the array `query`
 * @param dist         array containing the query_width x ref_width computed distances
 */
__global__ void compute_distance_texture(cudaTextureObject_t ref,
                                         int ref_width,
                                         float * query,
                                         int query_width,
                                         int query_pitch,
                                         int height,
                                         float* dist) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < query_width && yIndex < ref_width) {
        float ssd = 0.f;
        for (int i = 0; i < height; i++) {
            // Fixed: fetches from a cudaTextureObject_t require the
            // template form tex2D<float>(...); the non-template tex2D
            // overload only exists for legacy texture references.
            float tmp = tex2D<float>(ref, (float)yIndex, (float)i) - query[i * query_pitch + xIndex];
            ssd += tmp * tmp;
        }
        dist[yIndex * query_pitch + xIndex] = ssd;
    }
}

/**
 * For each reference point (i.e. each column) finds the k-th smallest distances
 * of the distance matrix and their respective indexes and gathers them at the top
 * of the 2 arrays.
 *
 * Since we only need to locate the k smallest distances, sorting the entire array
 * would not be very efficient if k is relatively small. Instead, we perform a
 * simple insertion sort by eventually inserting a given distance in the first
 * k values.
 *
 * @param dist        distance matrix
 * @param dist_pitch  pitch of the distance matrix given in number of columns
 * @param index       index matrix
 * @param index_pitch pitch of the index matrix given in number of columns
 * @param width       width of the distance matrix and of the index matrix
 * @param height      height of the distance matrix
 * @param k           number of values to find
 */
__global__ void modified_insertion_sort(float * dist,
int dist_pitch,
int * index,
int index_pitch,
int width,
int height,
int k) {
// Column position handled by this thread (one thread per column)
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
// Do nothing if we are out of bounds
if (xIndex < width) {
// Pointer shift to the top of this thread's column
float * p_dist = dist + xIndex;
int * p_index = index + xIndex;
// Initialise the first index
p_index[0] = 0;
// Go through all points
for (int i = 1; i < height; ++i) {
// Store current distance and associated index
float curr_dist = p_dist[i*dist_pitch];
int curr_index = i;
// Skip the current value if its index is >= k and it is not smaller
// than the k-th already-sorted smallest value
if (i >= k && curr_dist >= p_dist[(k - 1)*dist_pitch]) {
continue;
}
// Shift values (and indexes) higher than the current distance to the right
int j = min(i, k - 1);
while (j > 0 && p_dist[(j - 1)*dist_pitch] > curr_dist) {
p_dist[j*dist_pitch] = p_dist[(j - 1)*dist_pitch];
p_index[j*index_pitch] = p_index[(j - 1)*index_pitch];
--j;
}
// Write the current distance and index at their position
p_dist[j*dist_pitch] = curr_dist;
p_index[j*index_pitch] = curr_index;
}
}
}

/**
 * Computes the square root of the first k lines of the distance matrix
 * (one thread per element; expects a 2D launch).
 *
 * @param dist  distance matrix
 * @param width width of the distance matrix
 * @param pitch pitch of the distance matrix given in number of columns
 * @param k     number of values to consider
 */
__global__ void compute_sqrt(float * dist, int width, int pitch, int k) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < k)
        // sqrtf keeps the computation in single precision; the original
        // sqrt promoted each element to double and back on the device.
        dist[yIndex*pitch + xIndex] = sqrtf(dist[yIndex*pitch + xIndex]);
}

/**
 * Computes the squared L2 norm of every column of the input array.
 * One thread handles one column: it walks the column from top to bottom,
 * accumulating val*val, and stores the total in `norm`.
 *
 * @param array  input array
 * @param width  number of columns of `array` = number of points
 * @param pitch  pitch of `array` in number of columns
 * @param height number of rows of `array` = dimension of the points
 * @param norm   output array containing the squared norm values
 */
__global__ void compute_squared_norm(float * array, int width, int pitch, int height, float * norm) {
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against threads past the last column.
    if (col >= width) {
        return;
    }
    float acc = 0.f;
    for (int row = 0; row < height; row++) {
        float v = array[row * pitch + col];
        acc += v * v;
    }
    norm[col] = acc;
}

/**
 * Add the reference points norm (column vector) to each column of the input array.
 *
 * NOTE(review): shared_vec has a fixed capacity of 16 and is indexed by
 * threadIdx.y, so this kernel assumes blockDim.y <= 16 (presumably a
 * 16x16 block) — confirm at the launch site.
 *
 * @param array  input array
 * @param width  number of columns of `array` = number of points
 * @param pitch  pitch of `array` in number of columns
 * @param height number of rows of `array` = dimension of the points
 * @param norm   reference points norm stored as a column vector
 */
__global__ void add_reference_points_norm(float * array, int width, int pitch, int height, float * norm) {
unsigned int tx = threadIdx.x;
unsigned int ty = threadIdx.y;
unsigned int xIndex = blockIdx.x * blockDim.x + tx;
unsigned int yIndex = blockIdx.y * blockDim.y + ty;
// The first thread of each row caches that row's norm in shared memory
// so the whole row of threads can read it after the barrier.
__shared__ float shared_vec[16];
if (tx == 0 && yIndex < height)
shared_vec[ty] = norm[yIndex];
__syncthreads();
if (xIndex < width && yIndex < height)
array[yIndex*pitch + xIndex] += shared_vec[ty];
}

/**
 * Adds the query points norm (row vector) to the k first lines of the
 * input array and computes the square root of the resulting values
 * (one thread per element; expects a 2D launch).
 *
 * @param array input array
 * @param width number of columns of `array` = number of points
 * @param pitch pitch of `array` in number of columns
 * @param k     number of neighbors to consider
 * @param norm  query points norm stored as a row vector
 */
__global__ void add_query_points_norm_and_sqrt(float * array, int width, int pitch, int k, float * norm) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < k)
        // sqrtf keeps the computation in single precision; the original
        // sqrt promoted each element to double and back on the device.
        array[yIndex*pitch + xIndex] = sqrtf(array[yIndex*pitch + xIndex] + norm[xIndex]);
}

bool knn_cuda_global(const float * ref,

int ref_nb,

const float * query,

int query_nb,

int dim,

int k,

float * knn_dist,

int * knn_index) {

// Constants

const unsigned int size_of_float = sizeof(float);

const unsigned int size_of_int = sizeof(int);

// Return variables

cudaError_t err0, err1, err2, err3;

// Check that we have at least one CUDA device

int nb_devices;

err0 = cudaGetDeviceCount(&nb_devices);

if (err0 != cudaSuccess || nb_devices == 0) {

printf("ERROR: No CUDA device found\n");

return false;

}

// Select the first CUDA device as default

err0 = cudaSetDevice(0);

if (err0 != cudaSuccess) {

printf("ERROR: Cannot set the chosen CUDA device\n");

return false;

}

// Allocate global memory

float * ref_dev = NULL;

float * query_dev = NULL;

float * dist_dev = NULL;

int * index_dev = NULL;

size_t ref_pitch_in_bytes;

size_t query_pitch_in_bytes;

size_t dist_pitch_in_bytes;

size_t index_pitch_in_bytes;

err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb * size_of_float, dim);

err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);

err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);

err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);

if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess) {

printf("ERROR: Memory allocation error\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Deduce pitch values

size_t ref_pitch = ref_pitch_in_bytes / size_of_float;

size_t query_pitch = query_pitch_in_bytes / size_of_float;

size_t dist_pitch = dist_pitch_in_bytes / size_of_float;

size_t index_pitch = index_pitch_in_bytes / size_of_int;

// Check pitch values

if (query_pitch != dist_pitch || query_pitch != index_pitch) {

printf("ERROR: Invalid pitch value\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Copy reference and query data from the host to the device

err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);

err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);

if (err0 != cudaSuccess || err1 != cudaSuccess) {

printf("ERROR: Unable to copy data from host to device\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Compute the squared Euclidean distances

dim3 block0(BLOCK_DIM, BLOCK_DIM, 1);

dim3 grid0(query_nb / BLOCK_DIM, ref_nb / BLOCK_DIM, 1);

if (query_nb % BLOCK_DIM != 0) grid0.x += 1;

if (ref_nb % BLOCK_DIM != 0) grid0.y += 1;

compute_distances << > > (ref_dev, ref_nb, ref_pitch, query_dev, query_nb, query_pitch, dim, dist_dev);

if (cudaGetLastError() != cudaSuccess) {

printf("ERROR: Unable to execute kernel\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Sort the distances with their respective indexes

dim3 block1(256, 1, 1);

dim3 grid1(query_nb / 256, 1, 1);

if (query_nb % 256 != 0) grid1.x += 1;

modified_insertion_sort << > > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);

if (cudaGetLastError() != cudaSuccess) {

printf("ERROR: Unable to execute kernel\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Compute the square root of the k smallest distances

dim3 block2(16, 16, 1);

dim3 grid2(query_nb / 16, k / 16, 1);

if (query_nb % 16 != 0) grid2.x += 1;

if (k % 16 != 0) grid2.y += 1;

compute_sqrt << > > (dist_dev, query_nb, query_pitch, k);

if (cudaGetLastError() != cudaSuccess) {

printf("ERROR: Unable to execute kernel\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Copy k smallest distances / indexes from the device to the host

err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);

err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);

if (err0 != cudaSuccess || err1 != cudaSuccess) {

printf("ERROR: Unable to copy data from device to host\n");

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return false;

}

// Memory clean-up

cudaFree(ref_dev);

cudaFree(query_dev);

cudaFree(dist_dev);

cudaFree(index_dev);

return true;

}

/**
 * Computes the k-nearest neighbors of each query point, reading the reference
 * points through a CUDA texture object (1D-fetch cache path).
 *
 * Data layout (host and device): row-major with one dimension per row, i.e.
 * ref is dim rows of ref_nb floats, query is dim rows of query_nb floats.
 * Outputs are k rows of query_nb values each.
 *
 * @param ref        reference points, size dim * ref_nb
 * @param ref_nb     number of reference points
 * @param query      query points, size dim * query_nb
 * @param query_nb   number of query points
 * @param dim        dimension of the points
 * @param k          number of neighbors to keep
 * @param knn_dist   output distances, size k * query_nb
 * @param knn_index  output 0-based reference indexes, size k * query_nb
 * @return true on success, false on any CUDA error
 */
bool knn_cuda_texture(const float * ref,
	int ref_nb,
	const float * query,
	int query_nb,
	int dim,
	int k,
	float * knn_dist,
	int * knn_index) {

	// Constants
	unsigned int size_of_float = sizeof(float);
	unsigned int size_of_int = sizeof(int);

	// Return variables
	cudaError_t err0, err1, err2;

	// Check that we have at least one CUDA device
	int nb_devices;
	err0 = cudaGetDeviceCount(&nb_devices);
	if (err0 != cudaSuccess || nb_devices == 0) {
		printf("ERROR: No CUDA device found\n");
		return false;
	}

	// Select the first CUDA device as default
	err0 = cudaSetDevice(0);
	if (err0 != cudaSuccess) {
		printf("ERROR: Cannot set the chosen CUDA device\n");
		return false;
	}

	// Allocate pitched global memory; pitched rows keep each row of the
	// 2D layouts aligned for coalesced accesses.
	float * query_dev = NULL;
	float * dist_dev = NULL;
	int * index_dev = NULL;
	size_t query_pitch_in_bytes;
	size_t dist_pitch_in_bytes;
	size_t index_pitch_in_bytes;
	err0 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
	err1 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
	err2 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
	if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess) {
		printf("ERROR: Memory allocation error (cudaMallocPitch)\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		return false;
	}

	// Deduce pitch values expressed in elements rather than bytes
	size_t query_pitch = query_pitch_in_bytes / size_of_float;
	size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
	size_t index_pitch = index_pitch_in_bytes / size_of_int;

	// The kernels assume all three arrays share the same element pitch;
	// bail out if the allocator returned inconsistent pitches.
	if (query_pitch != dist_pitch || query_pitch != index_pitch) {
		printf("ERROR: Invalid pitch value\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		return false;
	}

	// Copy query data from the host to the device
	err0 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
	if (err0 != cudaSuccess) {
		printf("ERROR: Unable to copy data from host to device\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		return false;
	}

	// Allocate a CUDA array for the reference points (backing store of the texture)
	cudaArray* ref_array_dev = NULL;
	cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	err0 = cudaMallocArray(&ref_array_dev, &channel_desc, ref_nb, dim);
	if (err0 != cudaSuccess) {
		printf("ERROR: Memory allocation error (cudaMallocArray)\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		return false;
	}

	// Copy reference points from host to device.
	// NOTE(review): cudaMemcpyToArray is deprecated since CUDA 10.1; this is
	// the documented 2D replacement (host rows are contiguous, so the source
	// pitch equals the row width).
	err0 = cudaMemcpy2DToArray(ref_array_dev, 0, 0, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
	if (err0 != cudaSuccess) {
		printf("ERROR: Unable to copy data from host to device\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		return false;
	}

	// Resource descriptor: bind the texture to the CUDA array above
	struct cudaResourceDesc res_desc;
	memset(&res_desc, 0, sizeof(res_desc));
	res_desc.resType = cudaResourceTypeArray;
	res_desc.res.array.array = ref_array_dev;

	// Texture descriptor: point sampling, unnormalized integer coordinates
	struct cudaTextureDesc tex_desc;
	memset(&tex_desc, 0, sizeof(tex_desc));
	tex_desc.addressMode[0] = cudaAddressModeClamp;
	tex_desc.addressMode[1] = cudaAddressModeClamp;
	tex_desc.filterMode = cudaFilterModePoint;
	tex_desc.readMode = cudaReadModeElementType;
	tex_desc.normalizedCoords = 0;

	// Create the texture object
	cudaTextureObject_t ref_tex_dev = 0;
	err0 = cudaCreateTextureObject(&ref_tex_dev, &res_desc, &tex_desc, NULL);
	if (err0 != cudaSuccess) {
		printf("ERROR: Unable to create the texture\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		return false;
	}

	// Compute the squared Euclidean distances: one thread per (query, ref) pair
	dim3 block0(16, 16, 1);
	dim3 grid0(query_nb / 16, ref_nb / 16, 1);
	if (query_nb % 16 != 0) grid0.x += 1;  // ceil-div: cover the tail
	if (ref_nb % 16 != 0) grid0.y += 1;
	compute_distance_texture<<<grid0, block0>>>(ref_tex_dev, ref_nb, query_dev, query_nb, query_pitch, dim, dist_dev);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		cudaDestroyTextureObject(ref_tex_dev);
		return false;
	}

	// Sort the distances with their respective indexes: one thread per query
	dim3 block1(256, 1, 1);
	dim3 grid1(query_nb / 256, 1, 1);
	if (query_nb % 256 != 0) grid1.x += 1;
	modified_insertion_sort<<<grid1, block1>>>(dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		cudaDestroyTextureObject(ref_tex_dev);
		return false;
	}

	// Compute the square root of the k smallest distances
	dim3 block2(16, 16, 1);
	dim3 grid2(query_nb / 16, k / 16, 1);
	if (query_nb % 16 != 0) grid2.x += 1;
	if (k % 16 != 0) grid2.y += 1;
	compute_sqrt<<<grid2, block2>>>(dist_dev, query_nb, query_pitch, k);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		cudaDestroyTextureObject(ref_tex_dev);
		return false;
	}

	// Copy k smallest distances / indexes from the device to the host
	err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
	err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
	if (err0 != cudaSuccess || err1 != cudaSuccess) {
		printf("ERROR: Unable to copy data from device to host\n");
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFreeArray(ref_array_dev);
		cudaDestroyTextureObject(ref_tex_dev);
		return false;
	}

	// Memory clean-up
	cudaFree(query_dev);
	cudaFree(dist_dev);
	cudaFree(index_dev);
	cudaFreeArray(ref_array_dev);
	cudaDestroyTextureObject(ref_tex_dev);

	return true;
}

/**
 * Computes the k-nearest neighbors of each query point using cuBLAS.
 *
 * The squared distance is expanded as ||r||^2 - 2*q.r + ||q||^2: the cross
 * term is a single cublasSgemm, the norms are added by small kernels, and a
 * partial insertion sort keeps only the k smallest values per query.
 *
 * Data layout (host and device): row-major with one dimension per row, i.e.
 * ref is dim rows of ref_nb floats, query is dim rows of query_nb floats.
 * Outputs are k rows of query_nb values each.
 *
 * NOTE(review): this uses the legacy cuBLAS API (cublasInit / cublasSgemm /
 * cublasGetError / cublasShutdown), which is deprecated in favor of the v2
 * handle-based API; kept as-is for behavior compatibility.
 *
 * @param ref        reference points, size dim * ref_nb
 * @param ref_nb     number of reference points
 * @param query      query points, size dim * query_nb
 * @param query_nb   number of query points
 * @param dim        dimension of the points
 * @param k          number of neighbors to keep
 * @param knn_dist   output distances, size k * query_nb
 * @param knn_index  output 0-based reference indexes, size k * query_nb
 * @return true on success, false on any CUDA/cuBLAS error
 */
bool knn_cublas(const float * ref,
	int ref_nb,
	const float * query,
	int query_nb,
	int dim,
	int k,
	float * knn_dist,
	int * knn_index) {

	// Constants
	const unsigned int size_of_float = sizeof(float);
	const unsigned int size_of_int = sizeof(int);

	// Return variables
	cudaError_t err0, err1, err2, err3, err4, err5;

	// Check that we have at least one CUDA device
	int nb_devices;
	err0 = cudaGetDeviceCount(&nb_devices);
	if (err0 != cudaSuccess || nb_devices == 0) {
		printf("ERROR: No CUDA device found\n");
		return false;
	}

	// Select the first CUDA device as default
	err0 = cudaSetDevice(0);
	if (err0 != cudaSuccess) {
		printf("ERROR: Cannot set the chosen CUDA device\n");
		return false;
	}

	// Initialize CUBLAS (legacy API)
	cublasInit();

	// Allocate global memory: pitched 2D buffers for the point sets and the
	// distance/index matrices, plus linear buffers for the squared norms.
	float * ref_dev = NULL;
	float * query_dev = NULL;
	float * dist_dev = NULL;
	int * index_dev = NULL;
	float * ref_norm_dev = NULL;
	float * query_norm_dev = NULL;
	size_t ref_pitch_in_bytes;
	size_t query_pitch_in_bytes;
	size_t dist_pitch_in_bytes;
	size_t index_pitch_in_bytes;
	err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb * size_of_float, dim);
	err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
	err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
	err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
	err4 = cudaMalloc((void**)&ref_norm_dev, ref_nb * size_of_float);
	err5 = cudaMalloc((void**)&query_norm_dev, query_nb * size_of_float);
	if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess || err4 != cudaSuccess || err5 != cudaSuccess) {
		printf("ERROR: Memory allocation error\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Deduce pitch values expressed in elements rather than bytes
	size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
	size_t query_pitch = query_pitch_in_bytes / size_of_float;
	size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
	size_t index_pitch = index_pitch_in_bytes / size_of_int;

	// The kernels assume query/dist/index share the same element pitch
	if (query_pitch != dist_pitch || query_pitch != index_pitch) {
		printf("ERROR: Invalid pitch value\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Copy reference and query data from the host to the device
	err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
	err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
	if (err0 != cudaSuccess || err1 != cudaSuccess) {
		printf("ERROR: Unable to copy data from host to device\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Compute the squared norm of the reference points: one thread per point
	dim3 block0(256, 1, 1);
	dim3 grid0(ref_nb / 256, 1, 1);
	if (ref_nb % 256 != 0) grid0.x += 1;  // ceil-div: cover the tail
	compute_squared_norm<<<grid0, block0>>>(ref_dev, ref_nb, ref_pitch, dim, ref_norm_dev);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Compute the squared norm of the query points
	dim3 block1(256, 1, 1);
	dim3 grid1(query_nb / 256, 1, 1);
	if (query_nb % 256 != 0) grid1.x += 1;
	compute_squared_norm<<<grid1, block1>>>(query_dev, query_nb, query_pitch, dim, query_norm_dev);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Computation of query*transpose(reference): dist = -2 * Q^T . R,
	// done on the pitched matrices (cuBLAS is column-major, hence the pitches
	// are passed as leading dimensions).
	cublasSgemm('n', 't', (int)query_pitch, (int)ref_pitch, dim, (float)-2.0, query_dev, query_pitch, ref_dev, ref_pitch, (float)0.0, dist_dev, query_pitch);
	if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
		printf("ERROR: Unable to execute cublasSgemm\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Add reference points norm: dist += ||r||^2, one thread per (query, ref)
	dim3 block2(16, 16, 1);
	dim3 grid2(query_nb / 16, ref_nb / 16, 1);
	if (query_nb % 16 != 0) grid2.x += 1;
	if (ref_nb % 16 != 0) grid2.y += 1;
	add_reference_points_norm<<<grid2, block2>>>(dist_dev, query_nb, dist_pitch, ref_nb, ref_norm_dev);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Sort each column: keep the k smallest distances per query
	// (reuses the one-thread-per-query launch configuration grid1/block1)
	modified_insertion_sort<<<grid1, block1>>>(dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Add query norm and compute the square root of the k first elements
	dim3 block3(16, 16, 1);
	dim3 grid3(query_nb / 16, k / 16, 1);
	if (query_nb % 16 != 0) grid3.x += 1;
	if (k % 16 != 0) grid3.y += 1;
	add_query_points_norm_and_sqrt<<<grid3, block3>>>(dist_dev, query_nb, dist_pitch, k, query_norm_dev);
	if (cudaGetLastError() != cudaSuccess) {
		printf("ERROR: Unable to execute kernel\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Copy k smallest distances / indexes from the device to the host
	err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
	err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
	if (err0 != cudaSuccess || err1 != cudaSuccess) {
		printf("ERROR: Unable to copy data from device to host\n");
		cudaFree(ref_dev);
		cudaFree(query_dev);
		cudaFree(dist_dev);
		cudaFree(index_dev);
		cudaFree(ref_norm_dev);
		cudaFree(query_norm_dev);
		cublasShutdown();
		return false;
	}

	// Memory clean-up and CUBLAS shutdown
	cudaFree(ref_dev);
	cudaFree(query_dev);
	cudaFree(dist_dev);
	cudaFree(index_dev);
	cudaFree(ref_norm_dev);
	cudaFree(query_norm_dev);
	cublasShutdown();

	return true;
}

因篇幅问题不能全部显示,请点此查看更多更全内容

Copyright © 2019- gamedaodao.net 版权所有 湘ICP备2024080961号-6

违法及侵权请联系:TEL:199 18 7713 E-MAIL:2724546146@qq.com

本站由北京市万商天勤律师事务所王兴未律师提供法律服务