- Notifications
You must be signed in to change notification settings - Fork 29
E2E Linux Example
This is a minimal guide to executing an E2E test of llvm-aie on an AIE2 device (all environment specifics will be described below).
- Working AIE2 device with a driver and runtime installed (more info at https://github.com/amd/xdna-driver);
- This doc was written against the following system configuration:
```
System Configuration
  OS Name              : Linux
  Release              : 6.8.8
  Version              : #2 SMP PREEMPT_DYNAMIC Fri May 3 14:13:56 CDT 2024
  Machine              : x86_64
  CPU Cores            : 16
  Memory               : 94278 MB
  Distribution         : Ubuntu 22.04.3 LTS
  GLIBC                : 2.35
  Model                : F7BSC
  BIOS vendor          : American Megatrends International, LLC.
  BIOS version         : 1.04

XRT
  Version              : 2.18.0
  Branch               : HEAD
  Hash                 : c678a9469f9b20fcb9a04bbedb5c51f8473faec0
  Hash Date            : 2024-05-24 18:16:53
  XOCL                 : unknown, unknown
  XCLMGMT              : unknown, unknown
WARNING: xclmgmt version is unknown. Is xclmgmt driver loaded? Or is MSD/MPD running?
  AMDXDNA              : 2.18.0_20240524, 4ef6d95ad37a2de0aa22264c950dec8ec1bd9f52
  Firmware Version     : N/A

Devices present
BDF             :  Name
---------------------------------
[0000:c5:00.1]  :  RyzenAI-npu1
```
- This doc was written against the following system configuration:
- Distro install of llvm-aie (tested on commit 70703e80a6ecf8f8cf3fa724191dd1f36951dea3);
  - Here is a plausible CMake configure:
-C $LLVM_AIE_REPO_ROOT/clang/cmake/caches/Peano-AIE.cmake \ -DCMAKE_INSTALL_PREFIX=$LLVM_AIE_REPO_ROOT/install
- Here is a plausible CMake configure:
- A Python environment with xaiepy installed;
  - A one-liner:
pip install xaiepy==0.0.1 -f https://github.com/nod-ai/prototype-aie-toolchain/releases/expanded_assets/release
- A one-liner:
All programs/scripts are "attached" below.
The example program is very simple and does exactly one thing:
```cpp
#include "aiev2_locks.h"

#define ACQ_LOCK 48
#define REL_LOCK 49

extern float _anonymous0[1];

int main() {
  acquire_greater_equal(ACQ_LOCK, 1);
  _anonymous0[0] = 5 * 3.14159;
  release(REL_LOCK, 1);
  return 0;
}
```

i.e., it stores 5 * 3.14159 == 15.70795 to a global array. To go along with this brilliant program you will need the following linker script:
```
MEMORY
{
  program (RX) : ORIGIN = 0, LENGTH = 0x0020000
  data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
  . = 0x0;
  .text : {
    *me_basic.o(.text)
    . = 0x200;
    _ctors_start = .;
    _init_array_start = .;
    KEEP(SORT(*.init_array))
    _ctors_end = .;
    _init_array_end = .;
    _dtors_start = .;
    _dtors_end = .;
    *(.text)
  } > program
  .data : {
    *(.data*);
    *(.rodata*)
  } > data
  . = 0x70000;
  _sp_start_value_DM_stack = .;
  . += 0x400; /* stack */
  . = 0x40000;
  . += 0x10000;
  . = 0x50000;
  . += 0x10000;
  . = 0x70400;
  _anonymous0 = .;
  . += 0x4;
  .bss : { *(.bss) } > data
  .bss.DMb.4 : { *(.bss.DMb.4) } > data
}
PROVIDE(_main = main);
```

Writing this is beyond the scope of this intro.
Get all of your ducks in a row (turn the above code into a main.cpp, find Peano and set PEANO_INSTALL_DIR=...) and then incant the following magical incantations:
```
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
    -S --target=aie2-none-unknown-elf main.cpp -emit-llvm
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
    -ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
    -o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf
```

If everything went according to plan you will see roughly the following as verification that your elf file is fully baked:
```
There are 8 section headers, starting at offset 0x1544:

Section Headers:
  [Nr] Name              Type     Address  Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL     00000000 000000 000000 00      0   0  0
  [ 1] .text             PROGBITS 00000000 001000 000260 00  AX  0   0 16
  [ 2] .text._Exit       PROGBITS 00000260 001260 000020 00  AX  0   0 16
  [ 3] .text._main_init  PROGBITS 00000280 001280 000050 00  AX  0   0 16
  [ 4] .comment          PROGBITS 00000000 0012d0 00007f 01  MS  0   0  1
  [ 5] .symtab           SYMTAB   00000000 001350 000100 10      7   3  4
  [ 6] .shstrtab         STRTAB   00000000 001450 000047 00      0   0  1
  [ 7] .strtab           STRTAB   00000000 001497 0000ad 00      0   0  1

Symbol table '.symtab' contains 16 entries:
   Num:    Value  Size Type    Bind   Vis     Ndx Name
     0: 00000000     0 NOTYPE  LOCAL  DEFAULT UND
     1: 00000000     0 FILE    LOCAL  DEFAULT ABS main.cpp
     2: 00000000     0 FILE    LOCAL  DEFAULT ABS crt1.cc
     3: 00000200    64 FUNC    GLOBAL DEFAULT   1 main
     4: 00070400     0 NOTYPE  GLOBAL DEFAULT   1 _anonymous0
     5: 00000240     0 FUNC    GLOBAL DEFAULT   1 __start
     6: 00070000     0 NOTYPE  GLOBAL DEFAULT   1 _sp_start_value_DM_stack
     7: 00000280    80 FUNC    GLOBAL DEFAULT   3 _main_init
     8: 00000260    32 FUNC    GLOBAL DEFAULT   2 _Exit
     9: 00000200     0 FUNC    GLOBAL DEFAULT   1 _main
    10: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _ctors_start
    11: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _init_array_start
    12: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _ctors_end
    13: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _init_array_end
    14: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _dtors_start
    15: 00000200     0 NOTYPE  GLOBAL DEFAULT   1 _dtors_end
```

There are two example scripts in xaiepy that demonstrate how to configure the Phoenix device and run the program using XRT APIs:
- gen_example.py, which generates an .xclbin that can be loaded/run by XRT;
- xrt.py, which loads and runs the aforementioned .xclbin.
Run both of these et voila you should see something resembling 15.70795.
In the next episode we'll explain what all of these things actually do...
/* main.cpp: the example kernel. */
#include "aiev2_locks.h"

#define ACQ_LOCK 48
#define REL_LOCK 49

/* Not defined in C: the linker script below assigns this symbol an address. */
extern float _anonymous0[1];

/* Acquire ACQ_LOCK, store 5 * 3.14159 into _anonymous0[0], release REL_LOCK. */
int main() {
  acquire_greater_equal(ACQ_LOCK, 1);
  _anonymous0[0] = 5 * 3.14159;
  release(REL_LOCK, 1);
  return 0;
}

/* main.ld.script: the linker script passed to clang via -Wl,-T below. */
MEMORY
{
  program (RX) : ORIGIN = 0, LENGTH = 0x0020000
  data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
  . = 0x0;
  .text : {
    /* the _main_init symbol from me_basic.o has to come at address zero. */
    *me_basic.o(.text)
    . = 0x200;
    _ctors_start = .;
    _init_array_start = .;
    KEEP(SORT(*.init_array))
    _ctors_end = .;
    _init_array_end = .;
    _dtors_start = .;
    _dtors_end = .;
    *(.text)
  } > program
  .data : {
    *(.data*);
    *(.rodata*)
  } > data
  . = 0x70000;
  _sp_start_value_DM_stack = .;
  . += 0x400; /* stack */
  /* No tile with memory exists to the south. */
  . = 0x40000;
  . += 0x10000;
  /* No tile with memory exists to the west. */
  . = 0x50000;
  . += 0x10000;
  /* _anonymous0 is pinned at 0x70400 and given 4 bytes (one float). */
  . = 0x70400;
  _anonymous0 = .;
  . += 0x4;
  .bss : { *(.bss) } > data
  .bss.DMb.4 : { *(.bss.DMb.4) } > data
}
PROVIDE(_main = main);

PEANO_INSTALL_DIR=<fill me in.................>
$PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
    -S --target=aie2-none-unknown-elf main.cpp -emit-llvm
$PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
    -ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
    -o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf

#! 
/usr/bin/env python import argparse import json import logging import platform from pathlib import Path from xaiepy import bootgen, xclbinutil from xaiepy.cdo import ( startCDOFileStream, FileHeader, configureHeader, endCurrentCDOFileStream, EnAXIdebug, setEndianness, Little_Endian, ) logging.basicConfig( level=logging.DEBUG, format="%(message)s", datefmt="%H:%M:%S", ) from xaiepy import ( XAie_Config, XAie_BackendType, XAie_PartitionProp, XAie_DevInst, XAie_CfgInitialize, XAie_LocType, XAie_LoadElf, XAie_SetupPartitionConfig, XAie_UpdateNpiAddr, XAie_CoreReset, XAie_CoreUnreset, XAie_LockSetValue, XAie_Lock, XAie_DmaDescInit, XAie_DmaSetAddrLen, XAie_DmaEnableBd, XAie_DmaWriteBd, XAie_DmaChannelSetStartQueue, XAie_DmaChannelEnable, XAie_StrmConnCctEnable, XAie_CoreEnable, StrmSwPortType, XAie_EnableAieToShimDmaStrmPort, XAie_DmaDesc, ) if platform.system() != "Windows": from xaiepy import XAie_ErrorHandlingInit XAIE_DEV_GEN_AIEML = 2 XAIE_BASE_ADDR = 0x40000000 XAIE_COL_SHIFT = 25 XAIE_ROW_SHIFT = 20 XAIE_SHIM_ROW = 0 XAIE_MEM_TILE_ROW_START = 1 XAIE_PARTITION_BASE_ADDR = 0x0 XAIE_TRANSACTION_DISABLE_AUTO_FLUSH = 0b0 DDR_AIE_ADDR_OFFSET = 0x80000000 col = 0 def build_cdo(which_pi): tile_0_0 = XAie_LocType(0, col) tile_0_1 = XAie_LocType(1, col) tile_0_2 = XAie_LocType(2, col) configPtr = XAie_Config( XAIE_DEV_GEN_AIEML, XAIE_BASE_ADDR, XAIE_COL_SHIFT, XAIE_ROW_SHIFT, 6, 5, XAIE_SHIM_ROW, XAIE_MEM_TILE_ROW_START, 1, (XAIE_MEM_TILE_ROW_START + 1), (6 - 1 - 1), XAie_PartitionProp(), XAie_BackendType.XAIE_IO_BACKEND_CDO, ) devInst = XAie_DevInst() XAie_SetupPartitionConfig(devInst, 0, 1, 1) XAie_CfgInitialize(devInst, configPtr) XAie_UpdateNpiAddr(devInst, 0) EnAXIdebug() setEndianness(Little_Endian) cdo_fp = Path(__file__).parent.absolute() / f"{which_pi}_cdo.bin" startCDOFileStream(str(cdo_fp)) FileHeader() if platform.system() != "Windows": XAie_ErrorHandlingInit(devInst) elf_path = Path(__file__).parent.absolute() / f"{which_pi}.elf" assert elf_path.exists() 
XAie_LoadElf(devInst, tile_0_2, str(elf_path), False) XAie_CoreReset(devInst, tile_0_2) XAie_CoreUnreset(devInst, tile_0_2) XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(0, 1)) XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(1, 0)) dmaTileBd = XAie_DmaDesc() XAie_DmaDescInit(devInst, dmaTileBd, tile_0_2) dmaTileBd.DmaMod.contents.SetLock( dmaTileBd, XAie_Lock(1, -1), XAie_Lock(0, 1), 1, 0 ) XAie_DmaSetAddrLen(dmaTileBd, 1024, 4) XAie_DmaEnableBd(dmaTileBd) XAie_DmaWriteBd(devInst, dmaTileBd, tile_0_2, 0) XAie_DmaChannelSetStartQueue(devInst, tile_0_2, 0, 1, 0, 1, 0) XAie_DmaChannelEnable(devInst, tile_0_2, 0, 1) XAie_StrmConnCctEnable( devInst, tile_0_0, StrmSwPortType.CTRL, 0, StrmSwPortType.SOUTH, 0 ) XAie_StrmConnCctEnable( devInst, tile_0_0, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 2 ) XAie_StrmConnCctEnable( devInst, tile_0_1, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 0 ) XAie_StrmConnCctEnable( devInst, tile_0_2, StrmSwPortType.DMA, 0, StrmSwPortType.SOUTH, 0 ) XAie_EnableAieToShimDmaStrmPort(devInst, tile_0_0, 2) XAie_CoreEnable(devInst, tile_0_2) configureHeader() endCurrentCDOFileStream() bif_fp = Path(__file__).parent.absolute() / f"{which_pi}.bif" with open(bif_fp, "w") as f: f.write(bootgen.emit_design_bif([cdo_fp])) pdi_fp = Path(__file__).parent.absolute() / f"{which_pi}.pdi" bootgen.make_design_pdi(str(bif_fp), str(pdi_fp)) mem_top_json_fp = Path(__file__).parent.absolute() / f"{which_pi}_mem_topology.json" with open(mem_top_json_fp, "w") as f: json.dump(xclbinutil.mem_topology, f, indent=2) aie_part_json_fp = ( Path(__file__).parent.absolute() / f"{which_pi}_aie_partition.json" ) kernel_id = "0x902" if "two" in which_pi else "0x901" pdi_spec = xclbinutil.pdi_spec(pdi_fp, kernel_ids=[kernel_id]) with open(aie_part_json_fp, "w") as f: json.dump(xclbinutil.emit_partition([pdi_spec], num_cols=1), f, indent=2) kernels_json_fp = Path(__file__).parent.absolute() / f"{which_pi}_kernel.json" kernel_spec = xclbinutil.kernel_spec( 
kernel_name=which_pi, kernel_id=kernel_id, buffer_args=["c0"] ) with open(kernels_json_fp, "w") as f: json.dump(xclbinutil.emit_design_kernel_json([kernel_spec]), f, indent=2) pi_xclbin_fp = Path(__file__).parent.absolute() / f"{which_pi}.xclbin" xclbinutil.make_xclbin( str(mem_top_json_fp), str(aie_part_json_fp), str(kernels_json_fp), str(pi_xclbin_fp), ) if __name__ == "__main__": build_cdo("fivepi")from pathlib import Path import numpy as np from xaiepy import pyxrt from xaiepy.pyxrt import ert_cmd_state def init_xrt_load_kernel(xclbin: Path): device = pyxrt.device(0) xclbin = pyxrt.xclbin(str(xclbin)) device.register_xclbin(xclbin) return device, xclbin _PROLOG = [ 0x00000011, 0x01000405, 0x01000100, 0x0B590100, 0x000055FF, 0x00000001, 0x00000010, 0x314E5A5F, 0x635F5F31, 0x676E696C, 0x39354E5F, 0x6E693131, 0x5F727473, 0x64726F77, 0x00004573, 0x07BD9630, 0x000055FF, ] shim_instr_v = [ 0x06000100, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x80000000, 0x00000000, 0x00000000, 0x02000000, 0x02000000, 0x0001D204, 0x80000000, 0x03000000, 0x00010100, ] whichpi = "fivepi" instr_v = _PROLOG + shim_instr_v instr_v = np.array(instr_v, dtype=np.uint32) inout0 = np.zeros((1,), dtype=np.float32) device, xclbin = init_xrt_load_kernel(Path(__file__).parent.absolute() / f"{whichpi}.xclbin") def go(): context = pyxrt.hw_context(device, xclbin.get_uuid()) xkernel = next(k for k in xclbin.get_kernels() if k.get_name() == whichpi) kernel = pyxrt.kernel(context, xkernel.get_name()) bo_instr = pyxrt.bo( device, len(instr_v) * 4, pyxrt.bo.cacheable, kernel.group_id(0) ) bo_inout0 = pyxrt.bo(device, 1 * 4, pyxrt.bo.host_only, kernel.group_id(2)) bo_instr.write(instr_v, 0) bo_inout0.write(inout0, 0) bo_instr.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) h = kernel(bo_instr, len(instr_v), bo_inout0) assert h.wait() == ert_cmd_state.ERT_CMD_STATE_COMPLETED 
bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) entire_buffer = bo_inout0.read(4, 0).view(np.float32) print(entire_buffer[0]) v = entire_buffer[0].item() assert isinstance(v, float) assert np.isclose(v, 3.14) go()