VS2019 OpenCL安装和快速入门
本文基于NVIDIA GPU和VS2019講解如何配置OpenCL環境和快速上手編程。
文章目錄
- 1.OpenCL安裝在VS2019 上
- 1.安裝資源準備
- 2.安裝步驟
- 2.OpenCL 快速入門
- 1.原文和翻譯
- 2.代碼改動和調試
- 3.測試通過的代碼
1.OpenCL安裝在VS2019 上
1.安裝資源準備
從NVIDIA 官網下載CUDA 并雙擊運行安裝程序:
https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=7&target_type=exelocal
然后在安裝路徑找到以下資源:
1 .\lib\x64\OpenCL.lib
OpenCL.lib
2.CL 頭文件
CL 頭文件
3.在顯卡的默認驅動路徑找到
OpenCL64.dll
OpenCL64.dll（原文此處為截圖，已省略）
2.安裝步驟
新建OpenCL_inc 和OpenCL_lib 目錄。將上一步找到的資源分別復制到這兩個目錄。
其中OpenCL_inc 用來包含CL 頭文件,OpenCL_lib 目錄用來包含OpenCL.lib以及OpenCL.dll,OpenCL64.dll。
VS2019 環境配置:
1.
項目–> (項目名)屬性 --> C/C++ --> 常規 --> 附加包含目錄 --> F:\OPENCL\code\OpenCL_inc
2.
項目–> (項目名)屬性 -->連接器 --> 常規 --> 附加庫目錄 --> F:\OPENCL\code\OpenCL_lib
3.
項目–> (項目名)屬性 -->連接器 -->輸入–> 附加依賴項 --> OpenCL.lib
2.OpenCL 快速入門
1.原文和翻譯
原文地址
參考翻譯
2.代碼改動和調試
嘗試將vector_add_gpu.cl kernel文件直接以字符串的方式放在代碼中,然后在kernel編譯(即執行 clBuildProgram)后再獲取PROGRAM SOURCE:
error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);發現這樣不使用.cl 文件也是可行的。但是需要注意的是,將編譯后的字符串打印出來時 buffer 長度需要+1(因為 CL_PROGRAM_SOURCE 返回的長度包含字符串結尾的空字符 '\0')。
3.測試通過的代碼
#include <iostream> #include <string>//#include <oclUtils.h> #include "CL\opencl.h" #include <time.h> #include <windows.h>using namespace std; string getPlatformName(const cl_platform_id pid) {size_t param_value_size;clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, ¶m_value_size);char* param_value = new char[param_value_size];clGetPlatformInfo(pid, CL_PLATFORM_NAME, param_value_size, param_value, NULL);return param_value; }size_t shrRoundUp(size_t localWorkSize, size_t numItems) {size_t result = localWorkSize;while (result < numItems)result += localWorkSize;return result; }void vector_add_cpu(const float* const src_a,const float* const src_b,float* const res,const int size) {for (int i = 0; i < size; i++) {res[i] = src_a[i] + src_b[i];} }int main() {cl_uint num_platforms;cl_int error = 0;cl_context context;cl_command_queue queue;cl_device_id device;cl_platform_id platformNvidia;// PlatformclGetPlatformIDs(0, NULL, &num_platforms);cl_platform_id* platforms = new cl_platform_id[num_platforms];clGetPlatformIDs(num_platforms, platforms, NULL);for (cl_uint i = 0; i < num_platforms; i++) {string platname = getPlatformName(platforms[i]);cout << "<" << i << "> " << "Platform name is :" << platname << endl;}platformNvidia = platforms[1];string platname1 = getPlatformName(platformNvidia);cout << "<" << platname1 << "> " << "choose Platform 1 :" << platname1 << endl;//Devicecl_uint num_devices;error = clGetDeviceIDs(platformNvidia, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);if (error != CL_SUCCESS) {cout << "Error getting device ids: " << error << endl;exit(error);}cout << "num of devices is : "<< num_devices << endl;//contextcontext = clCreateContext(0, 1, &device, NULL, NULL, &error);if (error != CL_SUCCESS) {cout << "Error creating context: " << error << endl;exit(error);}// Command-queuequeue = clCreateCommandQueue(context, device, 0, &error);if (error != CL_SUCCESS) {cout << "Error creating command queue: " << error << endl;exit(error);}///memoryconst int size = 
123456;float* src_a_h = new float[size];float* src_b_h = new float[size];float* res_h = new float[size];//cl_mem_ion_host_ptr ion_src_a;// init vectorsfor (int i = 0; i < size; i++){src_a_h[i] = src_b_h[i] = (float) i;}const int mem_size = sizeof(float) * size;//allocate device buffercl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error );cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error );cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);// create the programconst char* programSource ="__kernel void vector_add_gpu(__global const float* src_a, "" __global const float* src_b, "" __global float* res, "" const int num) \n ""{ \n "" /* get_global_id(0) 返回正在執行的這個線程的ID。 \n "" 許多線程會在同一時間開始執行同一個kernel, \n "" 每個線程都會收到一個不同的ID,所以必然會執行一個不同的計算。*/ \n "" const int idx = get_global_id(0); \n "" \n "" /* 每個work-item都會檢查自己的id是否在向量數組的區間內。 \n "" 如果在,work-item就會執行相應的計算。*/ \n "" if (idx < num) \n "" res[idx] = src_a[idx] + src_b[idx]; \n "" } \n ";printf("programSource\n %s \n", programSource);cl_program program = clCreateProgramWithSource(context, 1, reinterpret_cast<const char**>(&programSource), NULL, &error);if (error != CL_SUCCESS) {cout << "Error creating program: " << error << endl;exit(error);}//builds the programerror = clBuildProgram(program, 1, &device, NULL, NULL, NULL);if (error != CL_SUCCESS) {cout << "Error Build program: " << error << endl;exit(error);}// check the kernel codesize_t bufSize = strlen(programSource) +1; //為什么大一個字節char* programBuffer = (char*)malloc(bufSize);size_t program_size_ret;error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);if (error != CL_SUCCESS){cout << "Error clGetProgramInfo:" << error << endl;}printf(" program_size_ret %ld\n", program_size_ret);printf("bufSize = %ld \n", bufSize);printf("Print Program Source:\n");printf("\n %s \n", programBuffer);// shows 
the logchar* build_log;size_t log_size;// 1st get the log_sizeclGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);build_log = new char[log_size + 1];cout << log_size << "log_size" << endl;//2nd get logclGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);build_log[log_size] = '\0';cout << build_log <<"build log" <<endl;delete[] build_log;//extacting the kernelcl_kernel vector_add_k = clCreateKernel(program, "vector_add_gpu", &error);if (error != CL_SUCCESS) {cout << "Error extacting the kernel: " << error << endl;exit(error);}// Enqueuing parameterserror = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);if (error != CL_SUCCESS){cout << "Error Enqueuing 0 parameters:" << error << endl;}error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);if (error != CL_SUCCESS){cout << "Error Enqueuing 1 parameters:" << error << endl;}error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);if (error != CL_SUCCESS){cout << "Error Enqueuing 2 parameters:" << error << endl;}error |= clSetKernelArg(vector_add_k, 3, sizeof(int), &size);if (error != CL_SUCCESS){cout << "Error Enqueuing 3 parameters:" << error << endl;}//launching the kernelconst size_t local_ws = 512; // Number of work-items per work-groupconst size_t global_ws = shrRoundUp(local_ws, size);ULONGLONG t1 = GetTickCount64();error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);ULONGLONG t2 = GetTickCount64();cout << "GPU cost time :" << (t2 - t1) <<endl;if (error != CL_SUCCESS){cout << "Error Enqueuing EnqueueNDRangeKernel:" << error << endl;}float* check = new float[size];clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);// Checking with the CPU results;ULONGLONG t3 = GetTickCount64();vector_add_cpu(src_a_h, src_b_h, res_h, size); ULONGLONG t4 = GetTickCount64();cout << "CPU cost time :" << (t4 - t3) << endl;for (int i = 0; i < size; i++){if (check[i] != 
res_h[i]){printf("calc ERROR GPU value is %f CPU value is %f \n", check[i], res_h[i]);}}cout << "Congratulations, it's working! \n" << endl;// Cleaning updelete[] src_a_h;delete[] src_b_h;delete[] res_h;delete[] check;clReleaseKernel(vector_add_k);clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseMemObject(src_a_d);clReleaseMemObject(src_b_d);clReleaseMemObject(res_d);return 0; }總結
以上是生活随笔為你收集整理的VS2019 OpenCL安装和快速入门的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 电商企业怎样用好大数据
- 下一篇: 删除Autorun.inf的方法