bitsandbytes-foundation
diff --git a/‎.github/scripts/build-rocm.sh‎
Lines changed: 21 additions & 0 deletions b/‎.github/scripts/build-rocm.sh‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 47 additions & 0 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 75 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 75 additions & 2 deletions
diff --git a/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 22 additions & 5 deletions b/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 22 additions & 5 deletions
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Build the bitsandbytes ROCm (HIP) shared library inside a ROCm development
+# container and stage the resulting library under output/<build_os>/<build_arch>.
+#
+# Expected environment variables:
+#   build_arch    Docker platform architecture, used as linux/<build_arch>.
+#   build_os      OS label; only values starting with "ubuntu" are supported.
+#   rocm_version  ROCm release, used in the image tag
+#                 rocm/dev-ubuntu-22.04:<rocm_version>-complete.
+declare build_arch
+declare build_os
+declare rocm_version
+
+set -xeuo pipefail
+
+# GPU architectures handed to CMake through -DBNB_ROCM_ARCH.
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+
+if [[ "${build_os}" == ubuntu* ]]; then
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+    echo "Using image $image"
+    # The double-quoted script is expanded by this shell, so ${bnb_rocm_arch} is
+    # substituted here and reaches the container's sh as a quoted literal.
+    docker run --rm --platform "linux/$build_arch" -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
+        && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+        && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
+        && cmake --build ."
+else
+    # Fail fast with a clear message: without a build there is nothing valid to
+    # stage below, and a stale library must not be copied by accident.
+    echo "Unsupported build_os for the ROCm build: ${build_os}" >&2
+    exit 1
+fi
+
+# Stage whatever shared libraries the build produced; nullglob drops the
+# patterns that match nothing instead of passing them to cp literally.
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
@@ -102,10 +102,55 @@ jobs:
  path: output/*
  retention-days: 7
 
+  # Builds the ROCm (HIP) shared library once per ROCm release in the matrix and
+  # uploads the staged output directory as a short-lived artifact.
+  build-shared-libs-rocm:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-22.04
+        arch:
+          - x86_64
+        rocm_version:
+          - "6.1.2"
+          - "6.2.4"
+          - "6.3.2"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker multiarch
+        uses: docker/setup-qemu-action@v3
+      # Remove preinstalled toolchains from the runner before the build step.
+      - name: Clean up disk space
+        run: |
+          sudo rm -rf \
+            /usr/share/dotnet \
+            /opt/ghc \
+            "/usr/local/share/boost" \
+            "$AGENT_TOOLSDIRECTORY" \
+            /opt/hostedtoolcache \
+            /opt/google/chrome \
+            /opt/microsoft/msedge \
+            /opt/microsoft/powershell \
+            /opt/pipx \
+            /usr/lib/mono \
+            /usr/local/julia* \
+            /usr/local/lib/android \
+            /usr/local/lib/node_modules \
+            /usr/local/share/chromium \
+            /usr/local/share/powershell \
+            /usr/share/swift
+      # The build script reads its configuration from these environment variables.
+      - name: Build C++
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
+        run: bash .github/scripts/build-rocm.sh
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+          path: output/*
+          retention-days: 7
+
  build-wheels:
  needs:
  - build-shared-libs
  - build-shared-libs-cuda
+ - build-shared-libs-rocm
  strategy:
  matrix:
  os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
@@ -173,6 +218,7 @@ jobs:
  merge-multiple: true
 
  - name: Inspect tmp directory after downloading artifacts
+
  run: |
  ls -alFR tmp/
  WHEEL_COUNT=$(find tmp/ -type f -name "*.whl" | wc -l)
@@ -210,6 +256,7 @@ jobs:
  - uses: actions/checkout@v4
  with:
  path: repo
+
  - name: Delete old pre-release (if exists)
  run: |
  cd repo && gh release delete continuous-release_main --cleanup-tag -y
 
@@ -25,13 +25,14 @@ endif()
 # Define included source files
 set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
 set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)
@@ -47,15 +48,25 @@ if(${COMPUTE_BACKEND} STREQUAL "cuda")
  message(FATAL_ERROR "CUDA is not supported on macOS" )
  endif()
  set(BUILD_CUDA ON)
+ set(BUILD_HIP OFF)
+ set(BUILD_MPS OFF)
+elseif(${COMPUTE_BACKEND} STREQUAL "hip")
+ if(APPLE)
+ message(FATAL_ERROR "HIP is not supported on macOS" )
+ endif()
+ set(BUILD_CUDA OFF)
+ set(BUILD_HIP ON)
  set(BUILD_MPS OFF)
 elseif(${COMPUTE_BACKEND} STREQUAL "mps")
  if(NOT APPLE)
  message(FATAL_ERROR "MPS is only supported on macOS" )
  endif()
  set(BUILD_CUDA OFF)
+ set(BUILD_HIP OFF)
  set(BUILD_MPS ON)
 else()
  set(BUILD_CUDA OFF)
+ set(BUILD_HIP OFF)
  set(BUILD_MPS OFF)
 endif()
 
@@ -160,6 +171,33 @@ if(BUILD_CUDA)
 
  string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
  add_compile_definitions(BUILD_CUDA)
+elseif(BUILD_HIP)
+    # ROCm/HIP backend: enable the HIP language, pick the GPU architectures,
+    # add the HIP sources and append "_rocm<major><minor>" to BNB_OUTPUT_NAME.
+    enable_language(HIP)
+    message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
+
+    # Architecture selection, in order of precedence:
+    #   1. BNB_ROCM_ARCH, when defined
+    #   2. an already-set CMAKE_HIP_ARCHITECTURES
+    #   3. AMDGPU_TARGETS
+    #   4. the built-in default list
+    # NOTE(review): enable_language(HIP) may already have initialized
+    # CMAKE_HIP_ARCHITECTURES, in which case cases 3 and 4 never run -- confirm.
+    if(DEFINED BNB_ROCM_ARCH)
+        set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
+    else()
+        if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
+        elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+        endif()
+    endif()
+    message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
+
+    list(APPEND SRC_FILES ${HIP_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_rocm")
+
+    # Get the HIP version as "<major>.<minor>" from `hipconfig --version`.
+    execute_process(
+        COMMAND hipconfig --version
+        OUTPUT_VARIABLE HIP_CONFIG_VERSION
+        RESULT_VARIABLE HIP_CONFIG_RESULT
+    )
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+    if("${HIP_VERSION}" STREQUAL "")
+        # Surface the problem instead of silently producing an unversioned
+        # library name and an empty HIP_VERSION.
+        message(WARNING
+            "Could not determine the HIP version from 'hipconfig --version' "
+            "(result: ${HIP_CONFIG_RESULT}); the output name will carry no version suffix.")
+    endif()
+    # "6.3" -> "63"
+    string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
+
+    string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
+    add_compile_definitions(__HIP_PLATFORM_AMD__)
+    add_compile_definitions(__HIP_PLATFORM_HCC__)
+    add_compile_definitions(BUILD_HIP)
 elseif(BUILD_MPS)
  if(NOT APPLE)
  message(FATAL_ERROR "MPS is only supported on macOS" )
@@ -208,6 +246,41 @@ if(BUILD_CUDA)
  CUDA_SEPARABLE_COMPILATION ON
  )
 endif()
+if(BUILD_HIP)
+    # Locate the ROCm math libraries, link them into the bitsandbytes target and
+    # compile the HIP sources with the HIP language.
+
+    # ROCm install prefix: the ROCM_PATH environment variable, else /opt/rocm.
+    if(NOT DEFINED ENV{ROCM_PATH})
+        set(ROCM_PATH /opt/rocm)
+    else()
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+
+    # find_package() wrapper that also prints <PACKAGE_NAME>_VERSION.
+    # Extra arguments (e.g. REQUIRED) are forwarded to find_package().
+    # Defined as a macro, so variables set by find_package() stay visible to the caller.
+    macro(find_package_and_print_version PACKAGE_NAME)
+        find_package("${PACKAGE_NAME}" ${ARGN})
+        message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
+    endmacro()
+    find_package_and_print_version(hipblas REQUIRED)
+    find_package_and_print_version(hiprand REQUIRED)
+    find_package_and_print_version(hipsparse REQUIRED)
+
+    ## hacky way of excluding hip::amdhip64 (with it linked many tests unexpectedly fail e.g. adam8bit because of inaccuracies)
+    set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
+    set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
+    set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
+
+    # NOTE(review): the bare /include and /lib entries look like leftovers of an
+    # empty variable expansion -- confirm whether they are still needed.
+    target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
+    target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
+    target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
+
+    target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)
+    set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP)
+    set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)
+
+    # hipBLASLt is linked only for HIP >= 6.1; older versions build with
+    # NO_HIPBLASLT defined instead.
+    if(HIP_VERSION VERSION_LESS "6.1")
+        target_compile_definitions(bitsandbytes PUBLIC NO_HIPBLASLT)
+    else()
+        # REQUIRED: roc::hipblaslt is linked unconditionally below, so fail here
+        # with a clear message rather than later on a missing target.
+        find_package(hipblaslt REQUIRED)
+        target_link_libraries(bitsandbytes PUBLIC roc::hipblaslt)
+    endif()
+endif()
 if(BUILD_MPS)
  add_dependencies(bitsandbytes metallib)
  target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 
@@ -8,7 +8,7 @@
 from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
 
 from ..._ops import register_kernel
-from ...cextension import lib
+from ...cextension import HIP_ENVIRONMENT, lib
 
 
 @register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,7 +210,12 @@ def _get_col_absmax(
 @register_kernel("bitsandbytes::quantize_blockwise", "cuda")
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
  torch._check_is_size(blocksize)
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
  torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
 
  n = A.numel()
@@ -264,7 +269,11 @@ def _(
 def _dequantize_blockwise_impl(
  A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
 ) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
  torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
  torch._check(
  dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -294,7 +303,11 @@ def _dequantize_blockwise_impl(
 def _(
  A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
  torch._check(quant_type in ["fp4", "nf4"])
  torch._check(
  A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -372,7 +385,11 @@ def _dequantize_4bit_impl(
  dtype: torch.dtype,
  out: torch.Tensor,
 ) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
  torch._check(quant_type in ["fp4", "nf4"])
  torch._check(
  dtype in [torch.bfloat16, torch.float16, torch.float32],