fpcsrc/packages/opencl/examples/basicsample.pas


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

{
The sample is give at
http://developer.apple.com/mac/library/samplecode/OpenCL_Hello_World_Example/index.html

===========================================================================
DESCRIPTION:

A simple "Hello World" compute example showing basic usage of OpenCL which
calculates the mathematical square (X[i] = pow(X[i],2)) for a buffer of
floating point values.

For simplicity, this example is intended to be run from the command line.
If run from within XCode, open the Run Log (Command-Shift-R) to see the
output.  Alternatively, run the applications from within a Terminal.app
session to launch from the command line.

===========================================================================
BUILD REQUIREMENTS:

Mac OS X v10.6 or later
  or
Windows with NVidia OpenCL SDK Installed

===========================================================================
RUNTIME REQUIREMENTS:

Mac OS X v10.6 or later

  To use the GPU as a compute device, use one of the following devices:
  - MacBook Pro w/NVidia GeForce 8600M
  - Mac Pro w/NVidia GeForce 8800GT
  
  If you don't have powerful GPU you can try to use CPU instead: 
  by changing  
    gpu := CL_DEVICE_TYPE_GPU; 
    to
    gpu := CL_DEVICE_TYPE_CPU; 

Windows with NVidia OpenCL SDK Installed and libOpenCLXX.dll available

CL_DEVICE_TYPE_CPU doesn't seem to work for windows

===========================================================================
}
program testcl;

{$mode objfpc}{$H+}

uses
  ctypes, cl;


// Use a static data size for simplicity

const
  DATA_SIZE = 1024;

// Simple compute kernel which computes the square of an input array
const
  KernelSource : PChar = 
  '__kernel void square(                   '#10+
  '   __global float* input,               '#10+
  '   __global float* output,              '#10+
  '   const unsigned int count)            '#10+
  '{                                       '#10+
  '   int i = get_global_id(0);            '#10+
  '   if(i < count)                        '#10+
  '       output[i] = input[i] * input[i]; '#10+
  '} '#0;


var
  err     : Integer; // error code returned from api calls
  data    : array [0..DATA_SIZE-1] of single; // original data set given to device
  results : array [0..DATA_SIZE-1] of single; // results returned from device
  correct : LongWord; // number of correct results returned

  global  : csize_t; // global domain size for our calculation
  local   : csize_t; // local domain size for our calculation

  device_id : cl_device_id;      // compute device id
  context   : cl_context;        // compute context
  commands  : cl_command_queue;  // compute command queue
  prog      : cl_program;        // compute program
  kernel    : cl_kernel;         // compute kernel

  input   : cl_mem; // device memory used for the input array
  output  : cl_mem; // device memory used for the output array

  i     : Integer;
  count : Integer;
  gpu   : cl_device_type;

  tmpd  : single;
begin
  // Fill our data set with random float values
  count := DATA_SIZE;
  for i:=0 to count - 1 do
    data[i]:= random;

  // Connect to a compute device
  // change CL_DEVICE_TYPE_CPU to CL_DEVICE_TYPE_GPU is you have powerful video (GeForce 8800/8600M or higher)
  gpu := CL_DEVICE_TYPE_GPU;

  device_id:=nil;
  err := clGetDeviceIDs(nil, gpu, 1, @device_id, nil);
  writeln('clGetDeviceIDs ', err);
  if (err <> CL_SUCCESS) then begin
    Writeln('Error: Failed to create a device group!');
    Halt($FF);
  end;

  // Create a compute context
  context := clCreateContext(nil, 1, @device_id, nil, nil, err);
  writeln('clCreateContext ', err);
  if context=nil then begin
    Writeln('Error: Failed to create a compute context!');
    Halt($FF);
  end;

  // Create a command commands
  commands := clCreateCommandQueue(context, device_id, 0, err);
  writeln('clCreateCommandQueue ', err);
  if commands=nil then begin
    Writeln('Error: Failed to create a command commands!');
    Halt($FF);
  end;

  // Create the compute program from the source buffer
  prog:= clCreateProgramWithSource(context, 1, PPChar(@KernelSource), nil, err);
  writeln('clCreateProgramWithSource ', err);
  if prog=nil then begin
    writeln('Error: Failed to create compute program! ');
    Halt($FF);
  end;

  // Build the program executable
  err := clBuildProgram(prog, 0, nil, nil, nil, nil);
  writeln('clBuildProgram ', err);
  if (err <> CL_SUCCESS) then begin
    writeln('Error: Failed to build program executable!');
    Halt(1);
  end;

  // Create the compute kernel in the program we wish to run
  kernel := clCreateKernel(prog, 'square', err);
  writeln('clCreateKernel ', err);
  if (kernel=nil) or (err <> CL_SUCCESS) then begin
    writeln('Error: Failed to create compute kernel!');
    Halt(1);
  end;
  
  err := clGetKernelWorkGroupInfo(kernel, device_id,  CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
  writeln('clGetKernelWorkGroupInfo ', err);
  if (err<>CL_SUCCESS) then begin
    writeln('Error: Failed to retrieve kernel work group info!');
    Halt(1);
  end;
  
  
  // Create the input and output arrays in device memory for our calculation
  input := clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(single) * count, nil, err);
  writeln('clCreateBuffer ', err);
  output := clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(single) * count, nil, err);
  writeln('clCreateBuffer ', err);
  if (input=nil) or (output=nil) then begin
    writeln('Error: Failed to allocate device memory!');
    Halt(1);
  end;

  // Write our data set into the input array in device memory
  err := clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(single) * count, @data, 0, nil, nil);
  writeln('clEnqueueWriteBuffer ', err);
  if (err <> CL_SUCCESS) then begin
    writeln('Error: Failed to write to source array!');
    Halt(1);
  end;

  // Set the arguments to our compute kernel
  err := 0;
  err := clSetKernelArg(kernel, 0, sizeof(cl_mem), @input);
  writeln('clSetKernelArg ', err);
  err := err or clSetKernelArg(kernel, 1, sizeof(cl_mem), @output);
  writeln('clSetKernelArg ', err);
  err := err or clSetKernelArg(kernel, 2, sizeof(longword), @count);
  writeln('clSetKernelArg ', err);
  if (err <> CL_SUCCESS) then begin
    writeln('Error: Failed to set kernel arguments! ');
    Halt(1);
  end;

  // Get the maximum work group size for executing the kernel on the device
  err := clGetKernelWorkGroupInfo(kernel, device_id,  CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), @local, nil);
  writeln('clGetKernelWorkGroupInfo ', err);
  if (err<>CL_SUCCESS) then begin
    writeln('Error: Failed to retrieve kernel work group info!');
    Halt(1);
  end;
  
  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  global := count;
  err := clEnqueueNDRangeKernel(commands, kernel, 1, nil, @global, @local, 0, nil, nil);
  writeln('clEnqueueNDRangeKernel ',err);
  if (err<>0) then begin
    writeln('Error: Failed to execute kernel!');
    Halt($FF);
  end;

  // Wait for the command commands to get serviced before reading back results
  err:=clFinish(commands);
  writeln('clFinish ',err);

  // Read back the results from the device to verify the output
  err := clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(single) * count, @results, 0, nil, nil);
  writeln('clEnqueueReadBuffer ',err);
  if (err <> CL_SUCCESS) then begin
    writeln('Error: Failed to read output array! ', err);
    Halt(1);
  end;

  // Validate our results
  correct := 0;
  for i:= 0 to count - 1 do begin
    // FPU warning:
    //
    // the following check (as in original C sample)
    // if results[i] = data[i] * data[i] then
    //
    // return the incorrect result (FP accuracy?),
    // must store the result to single type variable first,
    // and then compare:
    tmpd:=data[i] * data[i];
    if results[i] = tmpd then inc(correct);
  end;

  // Print a brief summary detailing the results
  writeln('Computed ', correct, '/', count,' correct values!');

  // Shutdown and cleanup
  clReleaseMemObject(input);
  clReleaseMemObject(output);
  clReleaseProgram(prog);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);
end.