Casper O. da Costa-Luis
github/casperdcl
ctypes: https://docs.python.org/3/library/ctypes.html
# ctypes example
#
# Loads a shared library and calls its C function
# `void myfunc(float*, size_t)` on the memory of a NumPy float32 array
# (no copy is made: the C code receives a pointer to the array's buffer).
import ctypes

import numpy as np

lib = ctypes.CDLL("./libmy_experiment.so")
# (re)define function signature for `void myfunc(float*, size_t)`
# (argtypes mirror the C prototype so ctypes rejects mismatched pointers)
lib.myfunc.argtypes = [ctypes.POINTER(ctypes.c_float), ctypes.c_size_t]
lib.myfunc.restype = None

input_array = np.array([1, 2, 3], dtype=np.float32)
# call function
# (the plain `int` length is converted to `size_t` by ctypes via `argtypes`)
lib.myfunc(
    input_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    len(input_array),
)
# https://docs.python.org/3/c-api/intro.html
#include <Python.h>
#include <numpy/arrayobject.h>

/*
 * myfunc(arr) -> None
 *
 * Multiplies every element of `arr` by 2, in place.
 * `arr` is converted to a float32, aligned, writeable, C-contiguous array.
 * If the conversion needs a temporary copy, the result is written back into
 * the original object before returning.
 * Returns NULL with a Python exception set on failure.
 */
static PyObject *myfunc(PyObject *self, PyObject *args) {
  PyObject *arr = NULL;
  if (!PyArg_ParseTuple(args, "O", &arr)) return NULL;

  /* INOUT_ARRAY2 adds WRITEBACKIFCOPY so changes reach `arr` even when a
   * temporary copy had to be made for the requested dtype/layout. */
  PyArrayObject *np_arr = (PyArrayObject *)PyArray_FROM_OTF(
      arr, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
  if (np_arr == NULL) return NULL; /* conversion failed; exception is set */

  float *arr_ptr = (float *)PyArray_DATA(np_arr);
  /* total element count: valid for 0-d and N-d arrays, not only 1-D */
  npy_intp size = PyArray_SIZE(np_arr);
  for (npy_intp i = 0; i < size; ++i) arr_ptr[i] *= 2;

  /* copy the temporary (if any) back into the caller's array */
  if (PyArray_ResolveWritebackIfCopy(np_arr) < 0) {
    Py_DECREF(np_arr);
    return NULL;
  }
  Py_DECREF(np_arr); /* release the reference created by PyArray_FROM_OTF */
  Py_RETURN_NONE;
}

/* Method table: defined before the module so the module can reference it. */
static PyMethodDef my_methods[] = {
    {"myfunc", myfunc, METH_VARARGS, "In-place modifies ndarray[float]"},
    {NULL, NULL, 0, NULL} // Sentinel
};

static struct PyModuleDef my_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "my_experiment",
    .m_size = -1,            /* no per-module state */
    .m_methods = my_methods, /* without this, `myfunc` is never exposed */
};

/* The import system looks up `PyInit_<module name>`, so the suffix must
 * match the `my_experiment.{so,dll}` file name. */
PyMODINIT_FUNC PyInit_my_experiment(void) {
  import_array(); /* initialise the NumPy C-API; returns NULL on failure */
  return PyModule_Create(&my_module);
}
// with my_experiment.{so,dll} in PYTHONPATH:
https://docs.python.org/3/c-api/buffer.html
struct to expose arrays
float *data
size_t ndim
size_t shape[]
https://github.com/AMYPAD/CuVec/blob/main/cuvec/include/cuvec_cpython.cuh#L128
/*
 * Buffer-protocol exporter (`bf_getbuffer` slot): fills `view` so that
 * consumers such as `numpy.asarray(obj)` can access the float data of `obj`
 * without copying.
 *
 * The MY_GET_ARR_* macros are not defined in this snippet; they are presumably
 * accessors for the wrapped array's pointer, element count, rank, shape and
 * strides (confirm against their definitions).
 * `flags` is not inspected: every field is always filled in and the buffer is
 * always exported as writable.
 *
 * Returns 0 on success, or -1 with BufferError set if `view` is NULL.
 */
int my_buffer(PyObject *obj, Py_buffer *view, int flags) {
  if (view == NULL) {
    PyErr_SetString(PyExc_BufferError, "NULL view in getbuffer");
    return -1;
  }
  view->buf = (void *)MY_GET_ARR_PTR(obj);
  view->obj = obj;
  view->len = MY_GET_ARR_SIZE(obj) * sizeof(float);
  view->readonly = 0;
  view->itemsize = sizeof(float);
  view->format = (char *)"f"; /* struct-module code for C float; cast keeps C++ happy */
  view->ndim = MY_GET_ARR_NDIM(obj);
  view->shape = MY_GET_ARR_SHAPE(obj);
  view->strides = MY_GET_ARR_STRIDES(obj);
  view->suboffsets = NULL;
  view->internal = NULL;
  /* the view holds a reference to the exporter; PyBuffer_Release() drops it */
  Py_INCREF(view->obj);
  return 0;
}
// pybind11, etc)
// https://pybind11.readthedocs.io/en/stable/
#include <pybind11/pybind11.h>

#include <stdexcept>

/// Multiplies every element of a 1-D contiguous float32 buffer by 2, in place.
///
/// @param view  any object exporting the Python buffer protocol
///              (e.g. a `numpy.ndarray`).
/// @throws std::runtime_error if the buffer is not 1-D, not float32, or not
///         contiguous; requesting a writable view also fails for read-only
///         buffers.
void myfunc(pybind11::buffer view) {
  // ask for a writable view since the data is modified in place
  pybind11::buffer_info arr = view.request(/*writable=*/true);
  if (arr.ndim != 1) throw std::runtime_error("expected 1D array");
  // the raw pointer below is only meaningful if the elements really are floats
  if (arr.format != pybind11::format_descriptor<float>::format())
    throw std::runtime_error("expected float32 array");
  // plain `ptr[i]` indexing assumes unit stride
  if (arr.strides[0] != static_cast<pybind11::ssize_t>(sizeof(float)))
    throw std::runtime_error("expected contiguous array");

  // `buffer_info::ptr` is `void *`: C++ requires an explicit cast
  float *ptr = static_cast<float *>(arr.ptr);
  for (pybind11::ssize_t i = 0; i < arr.size; ++i) ptr[i] *= 2;
}

using namespace pybind11::literals;  // provides the `_a` argument-name suffix
PYBIND11_MODULE(my_examples, m) {
  m.def("myfunc", &myfunc, "input_array"_a, "In-place modifies ndarray[float]");
}
// __cuda_array_interface__
// https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
__dlpack__: https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
https://dmlc.github.io/dlpack/latest/python_spec.html#reference-implementations
__cuda_array_interface__ but also handles ownership & multiple devices/streams

@property
def __dlpack__(self, copy=False, max_version: tuple[int]=None, stream: int=None, dl_device: tuple[int]=None ):
    dl_tensor = DLTensor(data=self.addr, device=dl_device, ndim=self.ndim, dtype=DLDataType.from_dtype(self.dtype), shape=ctypes.cast(self.shape, ctypes.POINTER(ctypes.c_int64)), strides=None, byte_offset=0)
    managed_tensor = DLManagedTensor(dl_tensor=dl_tensor, manager_ctx=0, deleter=DLTensorDeleter(lambda addr: None))
    return pythonapi.PyCapsule_New(ctypes.byref(managed_tensor), b'dltensor', None)

#include <cuda_runtime.h>
int N = ...;
float cpu_data[N];
for (int i = 0; i < N; ++i) data[i] = ...;
float *gpu_data;
cudaMalloc(&gpu_data, sizeof(float) * N);
cudaMemcpy(gpu_data, cpu_data, sizeof(float) * N, cudaMemcpyHostToDevice);
mykernel<<<1, N>>>(gpu_data, N);
cudaDeviceSynchronize();
cudaMemcpy(cpu_data, gpu_data, sizeof(float) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaFree(gpu_data);

https://amypad.github.io/CuVec/
std::vector<T, {malloc, free}> → std::vector<T, {cudaMallocManaged, cudaFree}>
std::vector<T>::data()
std::vector<T>::size()
std::vector<T>::resize()
numpy.ndarray
https://amypad.github.io/NumCu/
Minimal Python/C++/CUDA library using CuVec’s CPython buffer protocol.
pip install
pyproject.toml::build-system
requires = [cmake, scikit-build-core, pybind11, ...]
cmake
*.{py,so,dll}