Source code for scilpy.gpuparallel.opencl_utils

# -*- coding: utf-8 -*-
import numpy as np
import logging
import inspect
import os
import scilpy

from dipy.utils.optpkg import optional_package
cl, have_opencl, _ = optional_package('pyopencl')


def cl_device_type(device_type_str):
    """
    Convert a device type string ('cpu' or 'gpu') to the corresponding
    pyopencl device type, or -1 if the string is not recognized.
    """
    if device_type_str == 'cpu':
        return cl.device_type.CPU
    if device_type_str == 'gpu':
        return cl.device_type.GPU
    return -1


class CLManager(object):
    """
    Class for managing an OpenCL program.

    Wraps a subset of pyopencl functions to simplify its integration
    with python.

    The OpenCL program can be run on the CPU or on the GPU, provided the
    appropriate drivers are installed. When multiple CPUs or GPUs are
    available, the one that comes up first in the list of available
    devices is selected.

    Parameters
    ----------
    cl_kernel: CLKernel object
        The CLKernel containing the OpenCL program to manage.
    device_type: string
        The device onto which to run the program. One of 'cpu', 'gpu'.
    """
    def __init__(self, cl_kernel, device_type='gpu'):
        if not have_opencl:
            raise RuntimeError('pyopencl is not installed. '
                               'Cannot create CLManager instance.')

        # Reduce the verbosity of pyopencl logging
        logging.getLogger('pytools.persistent_dict').setLevel(
            logging.CRITICAL)
        logging.getLogger('pyopencl').setLevel(logging.CRITICAL)

        self.input_buffers = []
        self.output_buffers = []

        # maps key to index in buffers list
        self.inputs_mapping = {}
        self.outputs_mapping = {}

        # Find the best device for running GPU tasks
        platforms = cl.get_platforms()
        best_device = None
        for p in platforms:
            devices = p.get_devices()
            for d in devices:
                d_type = d.get_info(cl.device_info.TYPE)
                if d_type == cl_device_type(device_type) \
                        and best_device is None:
                    # take the first device of the right type
                    best_device = d

        if best_device is None:
            raise ValueError('No device of type {} found'
                             .format(device_type))

        self.context = cl.Context(devices=[best_device])
        self.queue = cl.CommandQueue(self.context)
        program = cl.Program(self.context, cl_kernel.code_string).build()
        self.kernel = cl.Kernel(program, cl_kernel.entry_point)
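
    # A minimal sketch of the device discovery above, assuming pyopencl
    # is installed (illustration only, not part of the original class):
    #
    #   for p in cl.get_platforms():
    #       for d in p.get_devices():
    #           print(d.name, cl.device_type.to_string(d.type))
    #
    # The first device whose type matches `device_type` is kept.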

    class OutBuffer(object):
        """
        Structure containing output buffer information.

        Parameters
        ----------
        buf: cl.Buffer
            The cl.Buffer object containing the output.
        shape: tuple
            Shape for the output array.
        dtype: dtype
            Datatype for output.
        """
        def __init__(self, buf, shape, dtype):
            self.buf = buf
            self.shape = shape
            self.dtype = dtype

    def add_input_buffer(self, key, arr=None, dtype=np.float32):
        """
        Add an input buffer to the kernel program. Input buffers must be
        added in the same order as they are declared inside the kernel
        code (.cl file).

        Parameters
        ----------
        key: string
            Name of the buffer in the input buffers list. Used for
            referencing when updating buffers.
        arr: numpy ndarray
            Data array.
        dtype: dtype, optional
            Optional type for array data. It is recommended to use float32
            whenever possible to avoid unexpected behaviours.

        Note
        ----
        The array is reordered as a Fortran array and then flattened. This
        is important to keep in mind when writing kernel code. For example,
        for a 3-dimensional array of shape (X, Y, Z), the flat index for
        position i, j, k is idx = i + j * X + k * X * Y.
        """
        buf = None
        if arr is not None:
            # convert to a Fortran-ordered array of the requested dtype
            arr = np.asfortranarray(arr, dtype=dtype)
            buf = cl.Buffer(self.context,
                            cl.mem_flags.READ_ONLY |
                            cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=arr)

        if key in self.inputs_mapping.keys():
            raise ValueError('Key {} is already in use for an input buffer!'
                             .format(key))
        self.inputs_mapping[key] = len(self.input_buffers)
        self.input_buffers.append(buf)
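
    # Worked example for the Note above (a sketch, not part of the
    # original API): with X, Y, Z = 2, 3, 4, the flat Fortran index of
    # position (i, j, k) = (1, 2, 3) is 1 + 2 * 2 + 3 * 2 * 3 = 23.
    #
    #   arr = np.arange(24).reshape((2, 3, 4))
    #   flat = arr.flatten(order='F')
    #   assert flat[1 + 2 * 2 + 3 * 2 * 3] == arr[1, 2, 3]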

    def update_input_buffer(self, key, arr, dtype=np.float32):
        """
        Update an input buffer. Input buffers must first be added to the
        program using `add_input_buffer`.

        Parameters
        ----------
        key: string
            Name of the buffer in the input buffers list.
        arr: numpy ndarray
            Data array.
        dtype: dtype, optional
            Optional type for array data. It is recommended to use float32
            whenever possible to avoid unexpected behaviours.
        """
        if key not in self.inputs_mapping.keys():
            raise ValueError('Invalid key for buffer!')
        argpos = self.inputs_mapping[key]

        arr = np.asfortranarray(arr, dtype=dtype)
        buf = cl.Buffer(self.context,
                        cl.mem_flags.READ_ONLY |
                        cl.mem_flags.COPY_HOST_PTR,
                        hostbuf=arr)
        self.input_buffers[argpos] = buf

    def add_output_buffer(self, key, shape=None, dtype=np.float32):
        """
        Add an output buffer to the kernel program. Output buffers must be
        added in the same order as they are declared inside the kernel
        code (.cl file).

        Parameters
        ----------
        key: string
            Name of the buffer in the output buffers list. Used for
            referencing when updating buffers.
        shape: tuple
            Shape of the output array.
        dtype: dtype, optional
            Optional type for array data. It is recommended to use float32
            whenever possible to avoid unexpected behaviours.
        """
        if key in self.outputs_mapping.keys():
            raise ValueError('Key {} is already in use for an output '
                             'buffer!'.format(key))

        buf = None
        if shape is not None:
            buf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY,
                            np.prod(shape) * np.dtype(dtype).itemsize)

        self.outputs_mapping[key] = len(self.output_buffers)
        self.output_buffers.append(self.OutBuffer(buf, shape, dtype))

    def update_output_buffer(self, key, shape, dtype=np.float32):
        """
        Update an output buffer. Output buffers must first be added to the
        program using `add_output_buffer`.

        Parameters
        ----------
        key: string
            Name of the buffer in the output buffers list.
        shape: tuple
            New shape of the output array.
        dtype: dtype, optional
            Optional type for array data. It is recommended to use float32
            whenever possible to avoid unexpected behaviours.
        """
        if key not in self.outputs_mapping.keys():
            raise ValueError('Invalid key for buffer!')
        argpos = self.outputs_mapping[key]

        buf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY,
                        np.prod(shape) * np.dtype(dtype).itemsize)
        out_buf = self.OutBuffer(buf, shape, dtype)
        self.output_buffers[argpos] = out_buf

    def run(self, global_size, local_size=None):
        """
        Execute the kernel code on the GPU.

        Parameters
        ----------
        global_size: tuple
            Tuple of between 1 and 3 entries representing the shape of the
            grid used for GPU computing. OpenCL uses global_size to
            generate a unique id for each kernel execution, which can be
            queried using get_global_id(axis) with axis between 0 and 2.
        local_size: tuple, optional
            Dimensions of local groups. Must divide global_size exactly,
            element-wise. If None, an implementation-defined local
            workgroup size is used. Memory allocated in the __local
            address space on the GPU is shared between elements in the
            same workgroup.

        Returns
        -------
        outputs: list of ndarrays
            List of outputs produced by the program.
        """
        wait_event = self.kernel(self.queue,
                                 global_size,
                                 local_size,
                                 *self.input_buffers,
                                 *[out.buf for out in self.output_buffers])
        outputs = []
        for output in self.output_buffers:
            out_arr = np.empty(output.shape, dtype=output.dtype, order='F')
            cl.enqueue_copy(self.queue, out_arr, output.buf,
                            wait_for=[wait_event])
            outputs.append(out_arr)
        return outputs
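

# A minimal usage sketch for CLManager. The module name, kernel file and
# its 'compute' entry point below are hypothetical, not part of scilpy:
#
#   kernel = CLKernel('compute', 'denoise', 'my_kernel.cl')
#   manager = CLManager(kernel, device_type='gpu')
#   manager.add_input_buffer('in_data', data)            # declaration
#   manager.add_output_buffer('out_data', data.shape)    # order matters
#   out_data, = manager.run(data.shape[:3])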


class CLKernel(object):
    """
    Wrapper for OpenCL kernel/program code.

    Parameters
    ----------
    entrypoint: string
        Name of the __kernel function in the .cl file.
    module: string
        Scilpy module in which the kernel code is located.
    filename: string
        Name of the file containing the kernel code.
    """
    def __init__(self, entrypoint, module, filename):
        path_to_kernel = self._get_kernel_path(module, filename)
        try:
            # the with statement guarantees the file handle is closed
            with open(path_to_kernel, 'r') as f:
                self.code = f.readlines()
        except IOError:
            raise ValueError('OpenCL file not found in {}'
                             .format(path_to_kernel))

        self.entrypoint = entrypoint

    def _get_kernel_path(self, module, filename):
        """
        Get the full path for the OpenCL kernel located in scilpy module
        `module` with filename `filename`.
        """
        module_path = inspect.getfile(scilpy)
        kernel_path = os.path.join(os.path.dirname(module_path),
                                   module, filename)
        return kernel_path

    def set_define(self, def_name, value):
        """
        Set the value for a compiler definition in the kernel code. This
        method will overwrite the previous value for this definition.

        Parameters
        ----------
        def_name: string
            Name of the definition. By convention, #define names should be
            in upper case. Therefore, this value will also be converted to
            upper case.
        value: string
            The value for the define. Will be replaced directly in the
            kernel code.

        Note
        ----
        Be careful! #define instructions are not typed and therefore prone
        to compilation errors. They are however faster to access than
        const variables. Moreover, they do not take additional space on
        the GPU.
        """
        def_name = def_name.upper()
        to_find = '#define {}'.format(def_name)
        def_line = None
        for i, line in enumerate(self.code):
            if line.find(to_find) != -1:
                def_line = i
                break
        if def_line is None:
            raise ValueError('Definition {0} not found in kernel code'
                             .format(def_name))

        self.code[def_line] = '#define {0} {1}\n'.format(def_name, value)
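
    # Illustration (a sketch): if the kernel code contains the line
    #   #define N_DIRS 0
    # then calling set_define('n_dirs', '100') rewrites that line to
    #   #define N_DIRS 100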

    @property
    def entry_point(self):
        return self.entrypoint

    @property
    def code_string(self):
        code_str = ''.join(self.code)
        return code_str