Thank you for replying.
I used `nvcc -ptx` to get the PTX code of `l1_cache_size_test_kernel`.
With `index_record[i] = j` uncommented, the result is:
```
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-34714021
// Cuda compilation tools, release 12.6, V12.6.68
// Based on NVVM 7.0.1
//

.version 8.5
.target sm_52
.address_size 64

    // .globl _Z25l1_cache_size_test_kernelPjS_S_iPi

.visible .entry _Z25l1_cache_size_test_kernelPjS_S_iPi(
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_0,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_1,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_2,
    .param .u32 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_3,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_4
)
{
    .reg .pred %p<11>;
    .reg .b32 %r<143>;
    .reg .b64 %rd<85>;

    ld.param.u64 %rd28, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_0];
    ld.param.u64 %rd29, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_1];
    ld.param.u64 %rd30, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_2];
    ld.param.u64 %rd31, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_4];
    ld.param.u32 %r33, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_3];
    cvta.to.global.u64 %rd1, %rd30;
    cvta.to.global.u64 %rd2, %rd29;
    cvta.to.global.u64 %rd3, %rd28;
    mov.u32 %r34, 0;
    mov.u32 %r142, %r34;
    cvta.to.global.u64 %rd5, %rd31;
    st.global.u32 [%rd5], %r34;
    mov.u32 %r142, %r34;
    shr.u32 %r35, %r33, 31;
    add.s32 %r36, %r33, %r35;
    shr.s32 %r1, %r36, 1;
    setp.lt.s32 %p1, %r33, 2;
    mov.u32 %r141, %r34;
    @%p1 bra $L__BB0_7;

    add.s32 %r40, %r1, -1;
    and.b32 %r131, %r1, 3;
    setp.lt.u32 %p2, %r40, 3;
    mov.u32 %r141, 0;
    mov.u32 %r129, %r141;
    @%p2 bra $L__BB0_4;

    sub.s32 %r126, %r1, %r131;
    mov.u32 %r141, 0;
    mov.u64 %rd77, %rd1;
    mov.u64 %rd78, %rd2;

$L__BB0_3:
    // begin inline asm
    mov.u64 %rd33, %clock64;
    // end inline asm
    cvt.u32.u64 %r43, %rd33;
    mov.u32 %r44, %r142;
    mul.wide.s32 %rd41, %r44, 4;
    add.s64 %rd42, %rd3, %rd41;
    ld.global.u32 %r45, [%rd42];
    mov.u32 %r142, %r45;
    mov.u32 %r46, %r142;
    add.s32 %r47, %r46, %r141;
    // begin inline asm
    mov.u64 %rd34, %clock64;
    // end inline asm
    cvt.u32.u64 %r48, %rd34;
    sub.s32 %r49, %r48, %r43;
    st.global.u32 [%rd78], %r49;
    mov.u32 %r50, %r142;
    st.global.u32 [%rd77], %r50;
    // begin inline asm
    mov.u64 %rd35, %clock64;
    // end inline asm
    cvt.u32.u64 %r51, %rd35;
    mov.u32 %r52, %r142;
    mul.wide.s32 %rd43, %r52, 4;
    add.s64 %rd44, %rd3, %rd43;
    ld.global.u32 %r53, [%rd44];
    mov.u32 %r142, %r53;
    mov.u32 %r54, %r142;
    add.s32 %r55, %r54, %r47;
    // begin inline asm
    mov.u64 %rd36, %clock64;
    // end inline asm
    cvt.u32.u64 %r56, %rd36;
    sub.s32 %r57, %r56, %r51;
    st.global.u32 [%rd78+4], %r57;
    mov.u32 %r58, %r142;
    st.global.u32 [%rd77+4], %r58;
    // begin inline asm
    mov.u64 %rd37, %clock64;
    // end inline asm
    cvt.u32.u64 %r59, %rd37;
    mov.u32 %r60, %r142;
    mul.wide.s32 %rd45, %r60, 4;
    add.s64 %rd46, %rd3, %rd45;
    ld.global.u32 %r61, [%rd46];
    mov.u32 %r142, %r61;
    mov.u32 %r62, %r142;
    add.s32 %r63, %r62, %r55;
    // begin inline asm
    mov.u64 %rd38, %clock64;
    // end inline asm
    cvt.u32.u64 %r64, %rd38;
    sub.s32 %r65, %r64, %r59;
    st.global.u32 [%rd78+8], %r65;
    mov.u32 %r66, %r142;
    st.global.u32 [%rd77+8], %r66;
    // begin inline asm
    mov.u64 %rd39, %clock64;
    // end inline asm
    cvt.u32.u64 %r67, %rd39;
    mov.u32 %r68, %r142;
    mul.wide.s32 %rd47, %r68, 4;
    add.s64 %rd48, %rd3, %rd47;
    ld.global.u32 %r69, [%rd48];
    mov.u32 %r142, %r69;
    mov.u32 %r70, %r142;
    add.s32 %r141, %r70, %r63;
    // begin inline asm
    mov.u64 %rd40, %clock64;
    // end inline asm
    cvt.u32.u64 %r71, %rd40;
    sub.s32 %r72, %r71, %r67;
    st.global.u32 [%rd78+12], %r72;
    mov.u32 %r73, %r142;
    st.global.u32 [%rd77+12], %r73;
    add.s32 %r129, %r129, 4;
    add.s64 %rd78, %rd78, 16;
    add.s64 %rd77, %rd77, 16;
    add.s32 %r126, %r126, -4;
    setp.ne.s32 %p3, %r126, 0;
    @%p3 bra $L__BB0_3;

$L__BB0_4:
    setp.eq.s32 %p4, %r131, 0;
    @%p4 bra $L__BB0_7;

    mul.wide.s32 %rd49, %r129, 4;
    add.s64 %rd80, %rd1, %rd49;
    add.s64 %rd79, %rd2, %rd49;

$L__BB0_6:
    .pragma "nounroll";
    // begin inline asm
    mov.u64 %rd50, %clock64;
    // end inline asm
    cvt.u32.u64 %r74, %rd50;
    mov.u32 %r75, %r142;
    mul.wide.s32 %rd52, %r75, 4;
    add.s64 %rd53, %rd3, %rd52;
    ld.global.u32 %r76, [%rd53];
    mov.u32 %r142, %r76;
    mov.u32 %r77, %r142;
    add.s32 %r141, %r77, %r141;
    // begin inline asm
    mov.u64 %rd51, %clock64;
    // end inline asm
    cvt.u32.u64 %r78, %rd51;
    sub.s32 %r79, %r78, %r74;
    st.global.u32 [%rd79], %r79;
    mov.u32 %r80, %r142;
    st.global.u32 [%rd80], %r80;
    add.s64 %rd80, %rd80, 4;
    add.s64 %rd79, %rd79, 4;
    add.s32 %r131, %r131, -1;
    setp.ne.s32 %p5, %r131, 0;
    @%p5 bra $L__BB0_6;

$L__BB0_7:
    membar.gl;
    mov.u32 %r142, %r34;
    setp.ge.s32 %p6, %r1, %r33;
    @%p6 bra $L__BB0_14;

    sub.s32 %r83, %r33, %r1;
    and.b32 %r135, %r83, 3;
    setp.eq.s32 %p7, %r135, 0;
    mov.u32 %r137, %r1;
    @%p7 bra $L__BB0_11;

    mul.wide.s32 %rd54, %r1, 4;
    add.s64 %rd82, %rd1, %rd54;
    add.s64 %rd81, %rd2, %rd54;
    mov.u32 %r137, %r1;

$L__BB0_10:
    .pragma "nounroll";
    // begin inline asm
    mov.u64 %rd55, %clock64;
    // end inline asm
    cvt.u32.u64 %r84, %rd55;
    mov.u32 %r85, %r142;
    mul.wide.s32 %rd57, %r85, 4;
    add.s64 %rd58, %rd3, %rd57;
    ld.global.u32 %r86, [%rd58];
    mov.u32 %r142, %r86;
    mov.u32 %r87, %r142;
    add.s32 %r141, %r87, %r141;
    // begin inline asm
    mov.u64 %rd56, %clock64;
    // end inline asm
    cvt.u32.u64 %r88, %rd56;
    sub.s32 %r89, %r88, %r84;
    st.global.u32 [%rd81], %r89;
    mov.u32 %r90, %r142;
    st.global.u32 [%rd82], %r90;
    add.s32 %r137, %r137, 1;
    add.s64 %rd82, %rd82, 4;
    add.s64 %rd81, %rd81, 4;
    add.s32 %r135, %r135, -1;
    setp.ne.s32 %p8, %r135, 0;
    @%p8 bra $L__BB0_10;

$L__BB0_11:
    not.b32 %r91, %r1;
    add.s32 %r92, %r91, %r33;
    setp.lt.u32 %p9, %r92, 3;
    @%p9 bra $L__BB0_14;

    mul.wide.s32 %rd59, %r137, 4;
    add.s64 %rd60, %rd59, 8;
    add.s64 %rd84, %rd2, %rd60;
    add.s64 %rd83, %rd1, %rd60;

$L__BB0_13:
    // begin inline asm
    mov.u64 %rd61, %clock64;
    // end inline asm
    cvt.u32.u64 %r93, %rd61;
    mov.u32 %r94, %r142;
    mul.wide.s32 %rd69, %r94, 4;
    add.s64 %rd70, %rd3, %rd69;
    ld.global.u32 %r95, [%rd70];
    mov.u32 %r142, %r95;
    mov.u32 %r96, %r142;
    add.s32 %r97, %r96, %r141;
    // begin inline asm
    mov.u64 %rd62, %clock64;
    // end inline asm
    cvt.u32.u64 %r98, %rd62;
    sub.s32 %r99, %r98, %r93;
    st.global.u32 [%rd84+-8], %r99;
    mov.u32 %r100, %r142;
    st.global.u32 [%rd83+-8], %r100;
    // begin inline asm
    mov.u64 %rd63, %clock64;
    // end inline asm
    cvt.u32.u64 %r101, %rd63;
    mov.u32 %r102, %r142;
    mul.wide.s32 %rd71, %r102, 4;
    add.s64 %rd72, %rd3, %rd71;
    ld.global.u32 %r103, [%rd72];
    mov.u32 %r142, %r103;
    mov.u32 %r104, %r142;
    add.s32 %r105, %r104, %r97;
    // begin inline asm
    mov.u64 %rd64, %clock64;
    // end inline asm
    cvt.u32.u64 %r106, %rd64;
    sub.s32 %r107, %r106, %r101;
    st.global.u32 [%rd84+-4], %r107;
    mov.u32 %r108, %r142;
    st.global.u32 [%rd83+-4], %r108;
    // begin inline asm
    mov.u64 %rd65, %clock64;
    // end inline asm
    cvt.u32.u64 %r109, %rd65;
    mov.u32 %r110, %r142;
    mul.wide.s32 %rd73, %r110, 4;
    add.s64 %rd74, %rd3, %rd73;
    ld.global.u32 %r111, [%rd74];
    mov.u32 %r142, %r111;
    mov.u32 %r112, %r142;
    add.s32 %r113, %r112, %r105;
    // begin inline asm
    mov.u64 %rd66, %clock64;
    // end inline asm
    cvt.u32.u64 %r114, %rd66;
    sub.s32 %r115, %r114, %r109;
    st.global.u32 [%rd84], %r115;
    mov.u32 %r116, %r142;
    st.global.u32 [%rd83], %r116;
    // begin inline asm
    mov.u64 %rd67, %clock64;
    // end inline asm
    cvt.u32.u64 %r117, %rd67;
    mov.u32 %r118, %r142;
    mul.wide.s32 %rd75, %r118, 4;
    add.s64 %rd76, %rd3, %rd75;
    ld.global.u32 %r119, [%rd76];
    mov.u32 %r142, %r119;
    mov.u32 %r120, %r142;
    add.s32 %r141, %r120, %r113;
    // begin inline asm
    mov.u64 %rd68, %clock64;
    // end inline asm
    cvt.u32.u64 %r121, %rd68;
    sub.s32 %r122, %r121, %r117;
    st.global.u32 [%rd84+4], %r122;
    mov.u32 %r123, %r142;
    st.global.u32 [%rd83+4], %r123;
    add.s64 %rd84, %rd84, 16;
    add.s64 %rd83, %rd83, 16;
    add.s32 %r137, %r137, 4;
    setp.lt.s32 %p10, %r137, %r33;
    @%p10 bra $L__BB0_13;

$L__BB0_14:
    st.global.u32 [%rd5], %r141;
    ret;
}
```
And with `index_record[i] = j` commented out, the result is:
```
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-34714021
// Cuda compilation tools, release 12.6, V12.6.68
// Based on NVVM 7.0.1
//

.version 8.5
.target sm_52
.address_size 64

    // .globl _Z25l1_cache_size_test_kernelPjS_S_iPi

.visible .entry _Z25l1_cache_size_test_kernelPjS_S_iPi(
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_0,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_1,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_2,
    .param .u32 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_3,
    .param .u64 _Z25l1_cache_size_test_kernelPjS_S_iPi_param_4
)
{
    .reg .pred %p<11>;
    .reg .b32 %r<133>;
    .reg .b64 %rd<68>;

    ld.param.u64 %rd16, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_0];
    ld.param.u64 %rd17, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_1];
    ld.param.u64 %rd18, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_4];
    ld.param.u32 %r33, [_Z25l1_cache_size_test_kernelPjS_S_iPi_param_3];
    cvta.to.global.u64 %rd1, %rd17;
    cvta.to.global.u64 %rd2, %rd16;
    mov.u32 %r34, 0;
    mov.u32 %r132, %r34;
    cvta.to.global.u64 %rd4, %rd18;
    st.global.u32 [%rd4], %r34;
    mov.u32 %r132, %r34;
    shr.u32 %r35, %r33, 31;
    add.s32 %r36, %r33, %r35;
    shr.s32 %r1, %r36, 1;
    setp.lt.s32 %p1, %r33, 2;
    mov.u32 %r131, %r34;
    @%p1 bra $L__BB0_7;

    add.s32 %r40, %r1, -1;
    and.b32 %r121, %r1, 3;
    setp.lt.u32 %p2, %r40, 3;
    mov.u32 %r131, 0;
    mov.u32 %r119, %r131;
    @%p2 bra $L__BB0_4;

    sub.s32 %r116, %r1, %r121;
    mov.u32 %r131, 0;
    mov.u64 %rd64, %rd1;

$L__BB0_3:
    // begin inline asm
    mov.u64 %rd20, %clock64;
    // end inline asm
    cvt.u32.u64 %r43, %rd20;
    mov.u32 %r44, %r132;
    mul.wide.s32 %rd28, %r44, 4;
    add.s64 %rd29, %rd2, %rd28;
    ld.global.u32 %r45, [%rd29];
    mov.u32 %r132, %r45;
    mov.u32 %r46, %r132;
    add.s32 %r47, %r46, %r131;
    // begin inline asm
    mov.u64 %rd21, %clock64;
    // end inline asm
    cvt.u32.u64 %r48, %rd21;
    sub.s32 %r49, %r48, %r43;
    st.global.u32 [%rd64], %r49;
    // begin inline asm
    mov.u64 %rd22, %clock64;
    // end inline asm
    cvt.u32.u64 %r50, %rd22;
    mov.u32 %r51, %r132;
    mul.wide.s32 %rd30, %r51, 4;
    add.s64 %rd31, %rd2, %rd30;
    ld.global.u32 %r52, [%rd31];
    mov.u32 %r132, %r52;
    mov.u32 %r53, %r132;
    add.s32 %r54, %r53, %r47;
    // begin inline asm
    mov.u64 %rd23, %clock64;
    // end inline asm
    cvt.u32.u64 %r55, %rd23;
    sub.s32 %r56, %r55, %r50;
    st.global.u32 [%rd64+4], %r56;
    // begin inline asm
    mov.u64 %rd24, %clock64;
    // end inline asm
    cvt.u32.u64 %r57, %rd24;
    mov.u32 %r58, %r132;
    mul.wide.s32 %rd32, %r58, 4;
    add.s64 %rd33, %rd2, %rd32;
    ld.global.u32 %r59, [%rd33];
    mov.u32 %r132, %r59;
    mov.u32 %r60, %r132;
    add.s32 %r61, %r60, %r54;
    // begin inline asm
    mov.u64 %rd25, %clock64;
    // end inline asm
    cvt.u32.u64 %r62, %rd25;
    sub.s32 %r63, %r62, %r57;
    st.global.u32 [%rd64+8], %r63;
    // begin inline asm
    mov.u64 %rd26, %clock64;
    // end inline asm
    cvt.u32.u64 %r64, %rd26;
    mov.u32 %r65, %r132;
    mul.wide.s32 %rd34, %r65, 4;
    add.s64 %rd35, %rd2, %rd34;
    ld.global.u32 %r66, [%rd35];
    mov.u32 %r132, %r66;
    mov.u32 %r67, %r132;
    add.s32 %r131, %r67, %r61;
    // begin inline asm
    mov.u64 %rd27, %clock64;
    // end inline asm
    cvt.u32.u64 %r68, %rd27;
    sub.s32 %r69, %r68, %r64;
    st.global.u32 [%rd64+12], %r69;
    add.s32 %r119, %r119, 4;
    add.s64 %rd64, %rd64, 16;
    add.s32 %r116, %r116, -4;
    setp.ne.s32 %p3, %r116, 0;
    @%p3 bra $L__BB0_3;

$L__BB0_4:
    setp.eq.s32 %p4, %r121, 0;
    @%p4 bra $L__BB0_7;

    mul.wide.s32 %rd36, %r119, 4;
    add.s64 %rd65, %rd1, %rd36;

$L__BB0_6:
    .pragma "nounroll";
    // begin inline asm
    mov.u64 %rd37, %clock64;
    // end inline asm
    cvt.u32.u64 %r70, %rd37;
    mov.u32 %r71, %r132;
    mul.wide.s32 %rd39, %r71, 4;
    add.s64 %rd40, %rd2, %rd39;
    ld.global.u32 %r72, [%rd40];
    mov.u32 %r132, %r72;
    mov.u32 %r73, %r132;
    add.s32 %r131, %r73, %r131;
    // begin inline asm
    mov.u64 %rd38, %clock64;
    // end inline asm
    cvt.u32.u64 %r74, %rd38;
    sub.s32 %r75, %r74, %r70;
    st.global.u32 [%rd65], %r75;
    add.s64 %rd65, %rd65, 4;
    add.s32 %r121, %r121, -1;
    setp.ne.s32 %p5, %r121, 0;
    @%p5 bra $L__BB0_6;

$L__BB0_7:
    membar.gl;
    mov.u32 %r132, %r34;
    setp.ge.s32 %p6, %r1, %r33;
    @%p6 bra $L__BB0_14;

    sub.s32 %r78, %r33, %r1;
    and.b32 %r125, %r78, 3;
    setp.eq.s32 %p7, %r125, 0;
    mov.u32 %r127, %r1;
    @%p7 bra $L__BB0_11;

    mul.wide.s32 %rd41, %r1, 4;
    add.s64 %rd66, %rd1, %rd41;
    mov.u32 %r127, %r1;

$L__BB0_10:
    .pragma "nounroll";
    // begin inline asm
    mov.u64 %rd42, %clock64;
    // end inline asm
    cvt.u32.u64 %r79, %rd42;
    mov.u32 %r80, %r132;
    mul.wide.s32 %rd44, %r80, 4;
    add.s64 %rd45, %rd2, %rd44;
    ld.global.u32 %r81, [%rd45];
    mov.u32 %r132, %r81;
    mov.u32 %r82, %r132;
    add.s32 %r131, %r82, %r131;
    // begin inline asm
    mov.u64 %rd43, %clock64;
    // end inline asm
    cvt.u32.u64 %r83, %rd43;
    sub.s32 %r84, %r83, %r79;
    st.global.u32 [%rd66], %r84;
    add.s32 %r127, %r127, 1;
    add.s64 %rd66, %rd66, 4;
    add.s32 %r125, %r125, -1;
    setp.ne.s32 %p8, %r125, 0;
    @%p8 bra $L__BB0_10;

$L__BB0_11:
    not.b32 %r85, %r1;
    add.s32 %r86, %r85, %r33;
    setp.lt.u32 %p9, %r86, 3;
    @%p9 bra $L__BB0_14;

    mul.wide.s32 %rd46, %r127, 4;
    add.s64 %rd47, %rd1, %rd46;
    add.s64 %rd67, %rd47, 8;

$L__BB0_13:
    // begin inline asm
    mov.u64 %rd48, %clock64;
    // end inline asm
    cvt.u32.u64 %r87, %rd48;
    mov.u32 %r88, %r132;
    mul.wide.s32 %rd56, %r88, 4;
    add.s64 %rd57, %rd2, %rd56;
    ld.global.u32 %r89, [%rd57];
    mov.u32 %r132, %r89;
    mov.u32 %r90, %r132;
    add.s32 %r91, %r90, %r131;
    // begin inline asm
    mov.u64 %rd49, %clock64;
    // end inline asm
    cvt.u32.u64 %r92, %rd49;
    sub.s32 %r93, %r92, %r87;
    st.global.u32 [%rd67+-8], %r93;
    // begin inline asm
    mov.u64 %rd50, %clock64;
    // end inline asm
    cvt.u32.u64 %r94, %rd50;
    mov.u32 %r95, %r132;
    mul.wide.s32 %rd58, %r95, 4;
    add.s64 %rd59, %rd2, %rd58;
    ld.global.u32 %r96, [%rd59];
    mov.u32 %r132, %r96;
    mov.u32 %r97, %r132;
    add.s32 %r98, %r97, %r91;
    // begin inline asm
    mov.u64 %rd51, %clock64;
    // end inline asm
    cvt.u32.u64 %r99, %rd51;
    sub.s32 %r100, %r99, %r94;
    st.global.u32 [%rd67+-4], %r100;
    // begin inline asm
    mov.u64 %rd52, %clock64;
    // end inline asm
    cvt.u32.u64 %r101, %rd52;
    mov.u32 %r102, %r132;
    mul.wide.s32 %rd60, %r102, 4;
    add.s64 %rd61, %rd2, %rd60;
    ld.global.u32 %r103, [%rd61];
    mov.u32 %r132, %r103;
    mov.u32 %r104, %r132;
    add.s32 %r105, %r104, %r98;
    // begin inline asm
    mov.u64 %rd53, %clock64;
    // end inline asm
    cvt.u32.u64 %r106, %rd53;
    sub.s32 %r107, %r106, %r101;
    st.global.u32 [%rd67], %r107;
    // begin inline asm
    mov.u64 %rd54, %clock64;
    // end inline asm
    cvt.u32.u64 %r108, %rd54;
    mov.u32 %r109, %r132;
    mul.wide.s32 %rd62, %r109, 4;
    add.s64 %rd63, %rd2, %rd62;
    ld.global.u32 %r110, [%rd63];
    mov.u32 %r132, %r110;
    mov.u32 %r111, %r132;
    add.s32 %r131, %r111, %r105;
    // begin inline asm
    mov.u64 %rd55, %clock64;
    // end inline asm
    cvt.u32.u64 %r112, %rd55;
    sub.s32 %r113, %r112, %r108;
    st.global.u32 [%rd67+4], %r113;
    add.s64 %rd67, %rd67, 16;
    add.s32 %r127, %r127, 4;
    setp.lt.s32 %p10, %r127, %r33;
    @%p10 bra $L__BB0_13;

$L__BB0_14:
    st.global.u32 [%rd4], %r131;
    ret;
}
```
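For reference, here is my reconstruction of the kernel that produces this PTX (a sketch from reading the listings, not the exact source; the parameter names are my guesses). The split at `n / 2`, the `membar.gl` from `__threadfence()`, and the reset of `j` between the two passes all match the control flow above:

```cpp
// Reconstruction (assumed names): pointer-chasing L1 test kernel.
__global__ void l1_cache_size_test_kernel(unsigned int *data,
                                          unsigned int *time_record,
                                          unsigned int *index_record,
                                          int n, int *sum_out)
{
    unsigned int j = 0;
    int sum = 0;
    *sum_out = 0;
    // First pass: walk the first half of the chase to warm the cache.
    for (int i = 0; i < n / 2; ++i) {
        unsigned long long t0, t1;
        asm volatile("mov.u64 %0, %%clock64;" : "=l"(t0));
        j = data[j];                      // dependent (pointer-chasing) load
        sum += j;                         // first use of the loaded value
        asm volatile("mov.u64 %0, %%clock64;" : "=l"(t1));
        time_record[i] = (unsigned int)(t1 - t0);
        index_record[i] = j;              // the line that gets commented out
    }
    __threadfence();                      // membar.gl in the PTX
    j = 0;
    // Second pass: timed re-accesses that should now hit in cache.
    for (int i = n / 2; i < n; ++i) {
        unsigned long long t0, t1;
        asm volatile("mov.u64 %0, %%clock64;" : "=l"(t0));
        j = data[j];
        sum += j;
        asm volatile("mov.u64 %0, %%clock64;" : "=l"(t1));
        time_record[i] = (unsigned int)(t1 - t0);
        index_record[i] = j;
    }
    *sum_out = sum;                       // keep the chase live
}
```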
There is no S2UR in either of the two PTX listings. (Or is S2UR a SASS-level instruction that would never appear in PTX at all, so I should be looking at the `cuobjdump -sass` output instead?)
Moreover, the PTX with `index_record[i] = j` uncommented just adds a `mov` and a `st.global.u32` in each loop iteration. Why do these two extra instructions make the results different? From your explanation, is the reason that the timestamp read after the LDG is likewise not ordered by any barrier, so it can be taken before the data returned by the LDG is actually used?
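If the per-access timestamp really isn't ordered with the load, one workaround I am considering (my own sketch, not something from this thread) is to time the whole dependent chain once and divide by the number of accesses on the host. Every load address depends on the previous load's data, so the chain is serialized, and even if the final `clock64()` is not forced to wait for the last load, that error is bounded by one load latency out of `n`:

```cpp
// Hypothetical alternative (assumed names): time the entire pointer chase
// once instead of timing each access individually.
__global__ void chase_chain_timed(unsigned int *data, int n,
                                  unsigned long long *total_cycles,
                                  unsigned int *last_index)
{
    unsigned int j = 0;
    unsigned long long t0 = clock64();
    for (int i = 0; i < n; ++i)
        j = data[j];                 // serialized chain of dependent loads
    unsigned long long t1 = clock64();
    *last_index = j;                 // consume j so the loads stay live
    *total_cycles = t1 - t0;         // divide by n on the host for avg latency
}
```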