OpenCL, C++: Unexpected results from a simple float vector sum program
This is a simple program that reads two files of float4 vectors and calculates the sum of the corresponding numbers. The result is not what I expected!
the main file:
#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <iostream> #include <iomanip> #include <array> #include <fstream> #include <sstream> #include <string> #include <algorithm> #include <iterator> #ifdef __apple__ #include <opencl/opencl.h> #else #include <cl/cl.h> #include <time.h> #endif const int number_of_points = 16; // number of points in both , b files (number of rows) const int number_of_axis = 4; // number of points axis in both , b files (number of columns) using namespace std; void checkerror(cl_int err, const char *operation) { if (err != cl_success) { fprintf(stderr, "error during operation '%s': %d\n", operation, err); exit(1); } } int main(int argc, char *argv[]) { clock_t tstart = clock(); // create 2 input vectors // working variables int i; ifstream input_filea, input_fileb; // input files string line; // transfer row file array float x; // transfer word file array int row = 0; // number of rows of file a,b (= array) int col = 0; // number of rows of file a,b (= array) // working arrays // working arrays // int mem_size_tempa = number_of_points * number_of_axis * sizeof(cl_float); // int mem_size_tempb = number_of_points * number_of_axis * sizeof(cl_float); float tempaarray[number_of_points][number_of_axis]={{0}}; // array contains file data float tempbarray[number_of_points][number_of_axis]={{0}}; // array contains file b data int mem_size_inputa = number_of_points ; int mem_size_inputb = number_of_points ; int mem_size_output = number_of_points ; float *inputaarray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file data float *inputbarray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file b data float *outputarray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file b data // import input files input_filea.open(argv[1]); input_fileb.open(argv[2]); // transfer input files data array // input file arraya row = 0; while (getline(input_filea, line)) { 
istringstream streama(line); col = 0; while(streama >> x){ tempaarray[row][col] = x; col++; } row++; } // input file b arrayb row = 0; while (getline(input_fileb, line)) { istringstream streamb(line); col = 0; while(streamb >> x){ tempbarray[row][col] = x; col++; } row++; } // switch columns of b array for(int row_of_arrayb = 0; row_of_arrayb < number_of_points; row_of_arrayb++ ) { float temporary = tempbarray[row_of_arrayb][2]; tempbarray[row_of_arrayb][2] = tempbarray[row_of_arrayb][1]; tempbarray[row_of_arrayb][1] = temporary; } // array 3d vectors // (int row_of_array = 0; row_of_array<number_of_points; row_of_array++) // { // inputaarray[row_of_array] = (tempaarray[row_of_array][0], tempaarray[row_of_array][1], tempaarray[row_of_array][2],0); // inputbarray[row_of_array] = (tempbarray[row_of_array][0], tempbarray[row_of_array][1], tempbarray[row_of_array][2],0); // } (int row_of_array=0; row_of_array < number_of_points; row_of_array++) { inputaarray[row_of_array*4+0] = tempaarray[row_of_array][0]; inputaarray[row_of_array*4+1] = tempaarray[row_of_array][1]; inputaarray[row_of_array*4+2] = tempaarray[row_of_array][2]; inputaarray[row_of_array*4+3] = 0.0f; // inputaarray[row_of_array]= float(4) (tempaarray[row_of_array][0], tempaarray[row_of_array][1], tempaarray[row_of_array][2], 0.0f); inputbarray[row_of_array*4+0] = tempbarray[row_of_array][0]; inputbarray[row_of_array*4+1] = tempbarray[row_of_array][1]; inputbarray[row_of_array*4+2] = tempbarray[row_of_array][2]; inputbarray[row_of_array*4+3] = 0.0f; outputarray[row_of_array*4+0] = 0.0f; outputarray[row_of_array*4+1] = 0.0f; outputarray[row_of_array*4+2] = 0.0f; outputarray[row_of_array*4+3] = 0.0f; // inputbarray[row_of_array] = (tempbarray[row_of_array][0], tempbarray[row_of_array][1], tempbarray[row_of_array][2],0); } // (int row_of_array=0; row_of_array < number_of_points; row_of_array++) // { // printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputaarray[row_of_array*number_of_points+0], 
inputaarray[row_of_array*number_of_points+1], // inputaarray[row_of_array*number_of_points+2], inputaarray[row_of_array*number_of_points+3]); // } // close input files input_filea.close(); input_fileb.close(); // load kernel source code array source_str file *fp; char *source_str; size_t source_size; fp = fopen("calculate_bottom_snm_kernel.cl", "r"); if (!fp) { fprintf(stderr, "failed load kernel.\n"); exit(1); } fseek(fp, 0, seek_end); size_t programlength = ftell(fp); rewind(fp); source_str = (char*)malloc(programlength+1); source_size = fread( source_str, 1, programlength, fp); source_str[programlength] = '\0'; fclose( fp ); // platform , device information cl_platform_id platform_id = null; cl_device_id device_id = null; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clgetplatformids(1, &platform_id, &ret_num_platforms); ret = clgetdeviceids( platform_id, cl_device_type_all, 1, &device_id, &ret_num_devices); // create opencl context cl_context context = clcreatecontext( null, 1, &device_id, null, null, &ret); // create command queue cl_command_queue command_queue = clcreatecommandqueue(context, device_id, 0, &ret); // create memory buffers on device each vector cl_mem inputa_mem_obj = clcreatebuffer(context, cl_mem_read_only, mem_size_inputa*sizeof(cl_float4) , null, &ret); cl_mem inputb_mem_obj = clcreatebuffer(context, cl_mem_read_only, mem_size_inputb*sizeof(cl_float4), null, &ret); cl_mem output_mem_obj = clcreatebuffer(context, cl_mem_write_only, mem_size_output*sizeof(cl_float4), null, &ret); // copy lists , b respective memory buffers ret = clenqueuewritebuffer(command_queue, inputa_mem_obj, cl_true, 0, mem_size_inputa*sizeof(cl_float4), inputaarray, 0, null, null); ret = clenqueuewritebuffer(command_queue, inputb_mem_obj, cl_true, 0, mem_size_inputb*sizeof(cl_float4), inputbarray, 0, null, null); // create program kernel source cl_program program = clcreateprogramwithsource(context, 1, (const char **)&source_str, (const size_t 
*)&source_size, &ret); // build program ret = clbuildprogram(program, 1, &device_id, null, null, null); if (ret == cl_build_program_failure) { // size of build log size_t logsize; ret = clgetprogrambuildinfo(program, device_id, cl_program_build_log, 0, null, &logsize); checkerror(ret, "getting build log size"); // build log char log[logsize]; ret = clgetprogrambuildinfo(program, device_id, cl_program_build_log, logsize, log, null); checkerror(ret, "getting build log"); printf("opencl program build log:\n%s\n", log); exit(1); } // create opencl kernel cl_kernel kernel = clcreatekernel(program, "calculate_bottom_snm", &ret); // set arguments of kernel ret = clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj); ret = clsetkernelarg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj); ret = clsetkernelarg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj); // execute opencl kernel on list size_t global_item_size = number_of_points; // process entire lists size_t local_item_size = 4; // process in groups of 64 ret = clenqueuendrangekernel(command_queue, kernel, 1, null, &global_item_size, &local_item_size, 0, null, null); // read memory buffer c on device local variable c // int *c = (int*)malloc(sizeof(int)*number_of_points); // float *c = (float*)malloc(sizeof(float)*number_of_points); ret = clenqueuereadbuffer(command_queue, output_mem_obj, cl_true, 0, mem_size_output, outputarray, 0, null, null); // display result screen // float buttomsnm = 0; for(i = 0; < number_of_points; i++) { printf("%f + %f = %f, \n",inputaarray[i*4+0],inputbarray[i*4+0], outputarray[i*4+0]); } // clean ret = clflush(command_queue); ret = clfinish(command_queue); ret = clreleasekernel(kernel); ret = clreleaseprogram(program); ret = clreleasememobject(inputa_mem_obj); ret = clreleasememobject(inputb_mem_obj); ret = clreleasememobject(output_mem_obj); ret = clreleasecommandqueue(command_queue); ret = clreleasecontext(context); free (inputaarray); free (inputbarray); free 
(outputarray); printf("all time taken: %.2fs\n", (double)(clock() - tstart)/clocks_per_sec); return 0; }
kernel:
// Element-wise sum of two float4 arrays: outputarray[i] = inputaarray[i] + inputbarray[i].
// Each work-item handles the single element selected by its global id in dimension 0.
__kernel void calculate_bottom_snm(__global float4 *inputaarray,
                                   __global float4 *inputbarray,
                                   __global float4 *outputarray)
{
    // index of current element
    int i = get_global_id(0);

    // float4 addition is component-wise, so this sums x, y, z and w at once
    outputarray[i] = inputaarray[i] + inputbarray[i];
}
the first input file a:
0 0.000000e+00 9.998994e-01 1 1.000000e-03 9.998981e-01 2 2.000000e-03 9.998967e-01 3 3.000000e-03 9.998953e-01 4 4.000000e-03 9.998939e-01 5 5.000000e-03 9.998925e-01 6 6.000000e-03 9.998911e-01 7 7.000000e-03 9.998896e-01 8 8.000000e-03 9.998881e-01 9 9.000000e-03 9.998865e-01 10 1.000000e-02 9.998850e-01 11 1.100000e-02 9.998834e-01 12 1.200000e-02 9.998817e-01 13 1.300000e-02 9.998800e-01 14 1.400000e-02 9.998783e-01 15 1.500000e-02 9.998766e-01
the second input file b:
0 0.000000e+00 9.998966e-01 1 1.000000e-03 9.998953e-01 2 2.000000e-03 9.998939e-01 3 3.000000e-03 9.998925e-01 4 4.000000e-03 9.998911e-01 5 5.000000e-03 9.998896e-01 6 6.000000e-03 9.998881e-01 7 7.000000e-03 9.998866e-01 8 8.000000e-03 9.998850e-01 9 9.000000e-03 9.998834e-01 10 1.000000e-02 9.998818e-01 11 1.100000e-02 9.998801e-01 12 1.200000e-02 9.998785e-01 13 1.300000e-02 9.998767e-01 14 1.400000e-02 9.998750e-01 15 1.500000e-02 9.998732e-01
The output should be the sum of the two files above. I printed only the first column; the other columns behave the same way:
the output:
0.000000 + 0.000000 = 0.000000, 1.000000 + 1.000000 = 0.000000, 2.000000 + 2.000000 = 0.000000, 3.000000 + 3.000000 = 0.000000, 4.000000 + 4.000000 = 0.000000, 5.000000 + 5.000000 = 0.000000, 6.000000 + 6.000000 = 0.000000, 7.000000 + 7.000000 = 0.000000, 8.000000 + 8.000000 = 0.000000, 9.000000 + 9.000000 = 0.000000, 10.000000 + 10.000000 = 0.000000, 11.000000 + 11.000000 = 0.000000, 12.000000 + 12.000000 = 0.000000, 13.000000 + 13.000000 = 0.000000, 14.000000 + 14.000000 = 0.000000, 15.000000 + 15.000000 = 0.000000, time taken: 0.07s
thanks in advance,
You are not copying the correct number of bytes from the device to the host:
int mem_size_output = number_of_points ; ... ret = clenqueuereadbuffer(command_queue, output_mem_obj, cl_true, 0, mem_size_output, outputarray, 0, null, null);
The amount of data in the buffer is number_of_points * sizeof(cl_float4), so that is the size (in bytes) that must be passed to the read call.
Comments
Post a Comment