From Documentation
Jump to: navigation, search
(Determining which OpenCL devices are available: update)
Line 66: Line 66:
  
 
int main(int argc, char** argv) {
 
int main(int argc, char** argv) {
 +
  int MAX_PLATFORMS=10;
 +
  int MAX_DEVICES=10;
 
   char dname[500];
 
   char dname[500];
   cl_device_id devices[10];
+
   cl_device_id devices[MAX_DEVICES];
 
   cl_uint num_devices,entries;
 
   cl_uint num_devices,entries;
 
   cl_ulong long_entries;
 
   cl_ulong long_entries;
   int d;
+
   int d,ip;
 
   cl_int err;
 
   cl_int err;
   cl_platform_id platform_id = NULL;
+
  cl_uint num_platforms;
 +
   cl_platform_id platform_id[MAX_PLATFORMS];
 
   size_t p_size;
 
   size_t p_size;
  
 
/* obtain list of platforms available */
 
/* obtain list of platforms available */
   err = clGetPlatformIDs(1, &platform_id,NULL);
+
   err = clGetPlatformIDs(2, platform_id,&num_platforms);
 
   if (err != CL_SUCCESS)
 
   if (err != CL_SUCCESS)
 
   {
 
   {
Line 82: Line 85:
 
       return 0;
 
       return 0;
 
   }
 
   }
 +
  printf("Found %d platforms \n", num_platforms);
  
 +
    for (ip=0;ip<num_platforms;ip++){
 
/* obtain information about platform */
 
/* obtain information about platform */
  clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,500,dname,NULL);
+
        clGetPlatformInfo(platform_id[ip],CL_PLATFORM_NAME,500,dname,NULL);
  printf("CL_PLATFORM_NAME = %s\n", dname);
+
        printf("CL_PLATFORM_NAME = %s\n", dname);
  clGetPlatformInfo(platform_id,CL_PLATFORM_VERSION,500,dname,NULL);
+
        clGetPlatformInfo(platform_id[ip],CL_PLATFORM_VERSION,500,dname,NULL);
  printf("CL_PLATFORM_VERSION = %s\n", dname);
+
        printf("CL_PLATFORM_VERSION = %s\n", dname);
  
 
/* obtain list of devices available on platform */
 
/* obtain list of devices available on platform */
  clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 10, devices, &num_devices);
+
        clGetDeviceIDs(platform_id[ip], CL_DEVICE_TYPE_ALL, 10, devices, &num_devices);
  printf("%d devices found\n", num_devices);
+
        printf("%d devices found\n", num_devices);
  
 
/* query devices for information */
 
/* query devices for information */
  for (d = 0; d < num_devices; ++d) {
 
      clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 500, dname,NULL);
 
      printf("Device #%d name = %s\n", d, dname);
 
      clGetDeviceInfo(devices[d],CL_DRIVER_VERSION, 500, dname,NULL);
 
      printf("\tDriver version = %s\n", dname);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 
      printf("\tGlobal Memory (MB):\t%llu\n",long_entries/1024/1024);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 
      printf("\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 
      printf("\tLocal Memory (KB):\t%llu\n",long_entries/1024);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
 
      printf("\tMax clock (MHz) :\t%llu\n",long_entries);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
 
      printf("\tMax Work Group Size:\t%d\n",p_size);
 
      clGetDeviceInfo(devices[d],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
 
      printf("\tNumber of parallel compute cores:\t%d\n",entries);
 
  }
 
  
  return 0;
+
        for (d = 0; d < num_devices; ++d) {
 +
            clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 500, dname,NULL);
 +
            printf("Device #%d name = %s\n", d, dname);
 +
            clGetDeviceInfo(devices[d],CL_DRIVER_VERSION, 500, dname,NULL);
 +
            printf("\tDriver version = %s\n", dname);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 +
            printf("\tGlobal Memory (MB):\t%llu\n",long_entries/1024/1024);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 +
            printf("\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
 +
            printf("\tLocal Memory (KB):\t%llu\n",long_entries/1024);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
 +
            printf("\tMax clock (MHz) :\t%llu\n",long_entries);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
 +
            printf("\tMax Work Group Size:\t%d\n",p_size);
 +
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
 +
            printf("\tNumber of parallel compute cores or multiprocessors:\t%d\n",entries);
 +
        }
 +
    }
 +
    return 0;
 
}
 
}
 
</source>
 
</source>
Line 119: Line 125:
 
===Output===
 
===Output===
  
Output of this program is shown below.  Note the useful information, including the version of OpenCL installed, and the capabilities of hardware visible to OpenCL.  On machines with NVIDIA cards using NVIDIA's OpenCL only the GPUs are visible.  On ATI cards both the GPUs and CPUs will be visible.
+
Output of this program running on a graham GPU node is shown below.  Note the useful information, including the version of OpenCL installed, and the capabilities of hardware visible to OpenCL.  Note that two versions of OpenCL are present: one from Intel that uses the CPU and detects it as a device, and one from NVIDIA that uses a GPU (and detects 2 GPUs as devices in this case).  A program needs to make a choice which of the available devices to use.
  
The output of this program on viz4-uoguelph.sharcnet.ca  machine is:
+
<source lang="C">
  
CL_PLATFORM_NAME = NVIDIA CUDA
+
Found 2 platforms
CL_PLATFORM_VERSION = OpenCL 1.0 CUDA 4.0.1
+
CL_PLATFORM_NAME = Intel(R) OpenCL
1 devices found
+
CL_PLATFORM_VERSION = OpenCL 2.0 LINUX
Device #0 name = GeForce GTX 480
+
1 devices found
        Driver version = 270.41.19
+
Device #0 name = Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz
        Global Memory (MB):     1535
+
Driver version = 1.2.0.37
        Global Memory Cache (MB):       0
+
Global Memory (MB): 128540
        Local Memory (KB):     48
+
Global Memory Cache (MB): 0
        Max clock (MHz) :       1401
+
Local Memory (KB): 32
        Max Work Group Size:   1024
+
Max clock (MHz) : 2100
        Number of parallel compute cores:       15
+
Max Work Group Size: 8192
 +
Number of parallel compute cores or multiprocessors: 32
 +
CL_PLATFORM_NAME = NVIDIA CUDA
 +
CL_PLATFORM_VERSION = OpenCL 1.2 CUDA 10.0.141
 +
2 devices found
 +
Device #0 name = Tesla P100-PCIE-12GB
 +
Driver version = 410.48
 +
Global Memory (MB): 12198
 +
Global Memory Cache (MB): 0
 +
Local Memory (KB): 48
 +
Max clock (MHz) : 1328
 +
Max Work Group Size: 1024
 +
Number of parallel compute cores or multiprocessors: 56
 +
Device #1 name = Tesla P100-PCIE-12GB
 +
Driver version = 410.48
 +
Global Memory (MB): 12198
 +
Global Memory Cache (MB): 0
 +
Local Memory (KB): 48
 +
Max clock (MHz) : 1328
 +
Max Work Group Size: 1024
 +
Number of parallel compute cores or multiprocessors: 56
  
The output on the viz10-uwo.sharcnet.ca machine is:
+
</source>
 
+
CL_PLATFORM_NAME = AMD Accelerated Parallel Processing
+
CL_PLATFORM_VERSION = OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
+
2 devices found
+
Device #0 name = Cypress
+
        Driver version = CAL 1.4.900
+
        Global Memory (MB):    2048
+
        Global Memory Cache (MB):      0
+
        Local Memory (KB):      32
+
        Max clock (MHz) :      0
+
        Max Work Group Size:    256
+
        Number of parallel compute cores:      20
+
Device #1 name = Intel(R) Xeon(R) CPU          X5560  @ 2.80GHz
+
        Driver version = 2.0
+
        Global Memory (MB):    48444
+
        Global Memory Cache (MB):      0
+
        Local Memory (KB):      32
+
        Max clock (MHz) :      1596
+
        Max Work Group Size:    1024
+
        Number of parallel compute cores:      8
+
  
 
==Using OpenCL in Python with PyOpenCL==
 
==Using OpenCL in Python with PyOpenCL==

Revision as of 13:39, 14 May 2019

OpenCL is the first open standard for writing programs that can execute across heterogeneous platforms, most importantly both CPUs and GPUs. OpenCL includes a language (based on C) for writing kernels (functions which can be executed on OpenCL devices), plus APIs to access and control the devices.

The best place to obtain authoritative information about OpenCL is the website of the Khronos Group consortium which maintains the OpenCL standard.

Compiling OpenCL programs

Compile as you would any other C code, but link to the OpenCL library. On some clusters you also need to specify the path to the include files.

To compile on the graham cluster with the Intel compiler, run:

module load cuda
icc -o test.x code.c -lOpenCL

To compile on the graham cluster with the GCC compiler, run:

module load cuda
gcc -o test.x code.c -lOpenCL

Compiling OpenCL examples from CUDA SDK on monk

Starting with CUDA 5.0, NVIDIA no longer ships OpenCL samples bundled with CUDA. They can still be downloaded from NVIDIA developer OpenCL page.

To run the OpenCL software development kit examples that come with CUDA 4.1, you need to first switch to that CUDA version with

module switch cuda/4.1

then you should copy its files to some location in your user space (here it will be /work/$USER/my_sdk_work but you can choose something else), following the steps below. Here $USER is an environment variable that is set to your username. You can replace $USER with your actual username in the commands below if you prefer.

cd /work/$USER/
mkdir my_sdk_work
cd my_sdk_work
cp -rp /opt/sharcnet/cuda/4.1/sdk/* .
cd OpenCL
make

This will create the binary executable files in

/work/$USER/my_sdk_work/OpenCL/bin/linux/release

You can then experiment with changes in the source code, located in:

/work/$USER/my_sdk_work/OpenCL/src

The examples you compiled will not run on the monk login node which does not have a GPU. They will run if you submit them as jobs to the gpu queue. You can also log into mon54 node of monk which has been set aside as a development node. To do that, do "ssh mon54" once you have logged into monk. It has two GPUs accessible to users in interactive mode, i.e. you don't have to use sqsub to run, and can just run executables that use the GPU from the command line.

Determining which OpenCL devices are available

Since OpenCL is designed to run on many platforms, it is particularly important for an OpenCL program to determine the characteristics of the hardware it is running on. The OpenCL standard provides a rich set of routines which can provide detailed information about the capabilities of the system and the OpenCL devices available.

Below is an example program which lists some system information and devices available. Feel free to use it to determine the capabilities of the system you are on.

To provide useful information it should be run on a compute node which has OpenCL devices (GPUs) connected.

#include <stdio.h>
#include <CL/cl.h>
 
int main(int argc, char** argv) {
   int MAX_PLATFORMS=10;
   int MAX_DEVICES=10;
   char dname[500];
   cl_device_id devices[MAX_DEVICES];
   cl_uint num_devices,entries;
   cl_ulong long_entries;
   int d,ip;
   cl_int err;
   cl_uint num_platforms;
   cl_platform_id platform_id[MAX_PLATFORMS];
   size_t p_size;
 
/* obtain list of platforms available */
   err = clGetPlatformIDs(2, platform_id,&num_platforms);
   if (err != CL_SUCCESS)
   {
       printf("Error: Failure in clGetPlatformIDs,error code=%d \n",err);
       return 0;
   }
   printf("Found %d platforms \n", num_platforms);
 
    for (ip=0;ip<num_platforms;ip++){
/* obtain information about platform */
        clGetPlatformInfo(platform_id[ip],CL_PLATFORM_NAME,500,dname,NULL);
        printf("CL_PLATFORM_NAME = %s\n", dname);
        clGetPlatformInfo(platform_id[ip],CL_PLATFORM_VERSION,500,dname,NULL);
        printf("CL_PLATFORM_VERSION = %s\n", dname);
 
/* obtain list of devices available on platform */
        clGetDeviceIDs(platform_id[ip], CL_DEVICE_TYPE_ALL, 10, devices, &num_devices);
        printf("%d devices found\n", num_devices);
 
/* query devices for information */
 
        for (d = 0; d < num_devices; ++d) {
            clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 500, dname,NULL);
            printf("Device #%d name = %s\n", d, dname);
            clGetDeviceInfo(devices[d],CL_DRIVER_VERSION, 500, dname,NULL);
            printf("\tDriver version = %s\n", dname);
            clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
            printf("\tGlobal Memory (MB):\t%llu\n",long_entries/1024/1024);
            clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
            printf("\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);
            clGetDeviceInfo(devices[d],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
            printf("\tLocal Memory (KB):\t%llu\n",long_entries/1024);
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
            printf("\tMax clock (MHz) :\t%llu\n",long_entries);
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
            printf("\tMax Work Group Size:\t%d\n",p_size);
            clGetDeviceInfo(devices[d],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
            printf("\tNumber of parallel compute cores or multiprocessors:\t%d\n",entries);
        }
    }
    return 0;
}

Output

Output of this program running on a graham GPU node is shown below. Note the useful information, including the version of OpenCL installed, and the capabilities of hardware visible to OpenCL. Note that two versions of OpenCL are present: one from Intel that uses the CPU and detects it as a device, and one from NVIDIA that uses a GPU (and detects 2 GPUs as devices in this case). A program needs to choose which of the available devices to use.

Found 2 platforms
CL_PLATFORM_NAME = Intel(R) OpenCL
CL_PLATFORM_VERSION = OpenCL 2.0 LINUX
1 devices found
Device #0 name = Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz
	Driver version = 1.2.0.37
	Global Memory (MB):	128540
	Global Memory Cache (MB):	0
	Local Memory (KB):	32
	Max clock (MHz) :	2100
	Max Work Group Size:	8192
	Number of parallel compute cores or multiprocessors:	32
CL_PLATFORM_NAME = NVIDIA CUDA
CL_PLATFORM_VERSION = OpenCL 1.2 CUDA 10.0.141
2 devices found
Device #0 name = Tesla P100-PCIE-12GB
	Driver version = 410.48
	Global Memory (MB):	12198
	Global Memory Cache (MB):	0
	Local Memory (KB):	48
	Max clock (MHz) :	1328
	Max Work Group Size:	1024
	Number of parallel compute cores or multiprocessors:	56
Device #1 name = Tesla P100-PCIE-12GB
	Driver version = 410.48
	Global Memory (MB):	12198
	Global Memory Cache (MB):	0
	Local Memory (KB):	48
	Max clock (MHz) :	1328
	Max Work Group Size:	1024
	Number of parallel compute cores or multiprocessors:	56

Using OpenCL in Python with PyOpenCL

Detailed instructions provided on separate PyOpenCL page.

References

o Website of Khronos Group Consortium Which Manages the OpenCL Standard
http://www.khronos.org/opencl

o NVIDIA OpenCL Website
http://www.nvidia.com/object/cuda_opencl_new.html