Source code for pyfr.backends.cuda.base

# -*- coding: utf-8 -*-

import os
import re

from pyfr.backends.base import BaseBackend
from pyfr.mpiutil import get_local_rank
from pyfr.template import DottedTemplateLookup


class CUDABackend(BaseBackend):
    name = 'cuda'

    def __init__(self, cfg):
        super().__init__(cfg)

        # Get the desired CUDA device
        devid = cfg.get('backend-cuda', 'device-id', 'round-robin')
        if not re.match(r'(round-robin|local-rank|\d+)$', devid):
            raise ValueError('Invalid device-id')

        # Handle the local-rank case
        if devid == 'local-rank':
            devid = str(get_local_rank())

        # In the non round-robin case set CUDA_DEVICE to be the desired
        # CUDA device number (used by pycuda.autoinit)
        os.environ.pop('CUDA_DEVICE', None)
        if devid != 'round-robin':
            os.environ['CUDA_DEVICE'] = devid

        # Create a CUDA context
        from pycuda.autoinit import context
        import pycuda.driver as cuda

        # Take the required alignment to be 128 bytes
        self.alignb = 128

        # Some CUDA devices share L1 cache and shared memory; on these
        # devices CUDA allows us to specify a preference between L1
        # cache and shared memory.  For the sake of CUBLAS (which
        # benefits greatly from more shared memory but fails to
        # declare its preference) we set the global default to
        # PREFER_SHARED.
        context.set_cache_config(cuda.func_cache.PREFER_SHARED)

        from pyfr.backends.cuda import (blasext, cublas, gimmik, packing,
                                        provider, types)

        # Register our data types
        self.base_matrix_cls = types.CUDAMatrixBase
        self.const_matrix_cls = types.CUDAConstMatrix
        self.matrix_cls = types.CUDAMatrix
        self.matrix_bank_cls = types.CUDAMatrixBank
        self.matrix_rslice_cls = types.CUDAMatrixRSlice
        self.queue_cls = types.CUDAQueue
        self.view_cls = types.CUDAView
        self.xchg_matrix_cls = types.CUDAXchgMatrix
        self.xchg_view_cls = types.CUDAXchgView

        # Template lookup
        self.lookup = DottedTemplateLookup(
            'pyfr.backends.cuda.kernels',
            fpdtype=self.fpdtype, alignb=self.alignb
        )

        # Instantiate the base kernel providers
        kprovs = [provider.CUDAPointwiseKernelProvider,
                  blasext.CUDABlasExtKernels,
                  packing.CUDAPackingKernels,
                  gimmik.CUDAGiMMiKKernels,
                  cublas.CUDACUBLASKernels]
        self._providers = [k(self) for k in kprovs]

        # Pointwise kernels
        self.pointwise = self._providers[0]
    def _malloc_impl(self, nbytes):
        import pycuda.driver as cuda

        # Allocate
        data = cuda.mem_alloc(nbytes)

        # Zero
        cuda.memset_d32(data, 0, nbytes // 4)

        return data
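
The device-id regex in __init__ admits exactly three forms: the string round-robin, the string local-rank, or a bare integer. A minimal sketch (not PyFR code; the sample values are hypothetical) of how each form is classified:

    import re

    for devid in ('round-robin', 'local-rank', '2', 'gpu0'):
        ok = re.match(r'(round-robin|local-rank|\d+)$', devid) is not None
        print(f'{devid!r} -> {"valid" if ok else "raises ValueError"}')

A valid 'local-rank' is then rewritten to str(get_local_rank()), and any resulting numeric id is exported as CUDA_DEVICE for pycuda.autoinit to honour when it creates the context.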
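
In _malloc_impl the buffer is zeroed with memset_d32, which writes nbytes // 4 32-bit words; this covers the whole allocation only if nbytes is a multiple of four. A short sketch of the arithmetic, assuming (as the 128-byte alignb declared above suggests) that allocation sizes are padded up to the alignment boundary before reaching _malloc_impl:

    # Hypothetical request size; padding up to alignb = 128 guarantees
    # the total is a multiple of 4, so memset_d32 leaves no tail bytes
    nbytes, alignb = 1000, 128
    padded = ((nbytes + alignb - 1) // alignb) * alignb  # 1024
    assert padded % 4 == 0
    nwords = padded // 4                                 # 256 words zeroed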
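
For context, a hedged usage sketch of constructing this backend directly from an in-memory config. It assumes pyfr.inifile.Inifile accepts an INI-format string and that pycuda and a CUDA-capable device are available; in practice backends are normally obtained through the pyfr.backends factory rather than instantiated by hand.

    from pyfr.inifile import Inifile
    from pyfr.backends.cuda.base import CUDABackend

    # Hypothetical config: pin each MPI rank to the GPU matching its
    # node-local rank
    cfg = Inifile('[backend-cuda]\ndevice-id = local-rank\n')
    backend = CUDABackend(cfg)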