My source:
__device__ float2 get_vec() {
return make_float2(1, 2);
}
__global__ void kernel(float2* p) {
*p = get_vec();
}
My optimized IR:
; Function Attrs: noinline nounwind
define void @_Z6kernelP6float4Pf(%struct.float4* nocapture readonly, float* nocapture) local_unnamed_addr #1 {
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%4 = zext i32 %3 to i64
%.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 0
%.sroa.0.0.copyload = load float, float* %.sroa.0.0..sroa_idx, align 4
%.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 1
%.sroa.2.0.copyload = load float, float* %.sroa.2.0..sroa_idx1, align 4
%.sroa.3.0..sroa_idx2 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 2
%.sroa.3.0.copyload = load float, float* %.sroa.3.0..sroa_idx2, align 4
%.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 3
%.sroa.4.0.copyload = load float, float* %.sroa.4.0..sroa_idx3, align 4
%5 = fmul float %.sroa.0.0.copyload, %.sroa.0.0.copyload
%6 = fmul float %.sroa.2.0.copyload, %.sroa.2.0.copyload
%7 = fadd float %5, %6
%8 = fmul float %.sroa.3.0.copyload, %.sroa.3.0.copyload
%9 = fadd float %8, %7
%10 = fmul float %.sroa.4.0.copyload, %.sroa.4.0.copyload
%11 = fadd float %10, %9
%12 = getelementptr inbounds float, float* %1, i64 %4
store float %11, float* %12, align 4, !tbaa !7
ret void
}
My optimized PTX. This uses the same PassManagerBuilder settings as above,
but with a TargetMachine::addPassesToEmitFile CGFT_AssemblyFile to target
PTX instead of createPrintModulePass to target IR. See how all the ld/st
instructions are broken up into .u8 transfers and bit shifting is used to
reassemble the words. When I build with -O0 for PassManagerBuilder, the
PTX is not broken up, but also the ld/st instructions aren't targetted to
the right address space (they are st.f32 instead of st.global.f32 as
desired).
// .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf
.visible .entry _Z6kernelP6float4Pf(
.param .u64 _Z6kernelP6float4Pf_param_0,
.param .u64 _Z6kernelP6float4Pf_param_1
) // @_Z6kernelP6float4Pf
{
.reg .f32 %f<12>;
.reg .b32 %r<42>;
.reg .b64 %rd<7>;
// %bb.0:
ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0];
ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd3, %r1, 16;
add.s64 %rd4, %rd1, %rd3;
ld.u8 %r2, [%rd4+12];
ld.u8 %r3, [%rd4+13];
shl.b32 %r4, %r3, 8;
or.b32 %r5, %r4, %r2;
ld.u8 %r6, [%rd4+14];
ld.u8 %r7, [%rd4+15];
shl.b32 %r8, %r7, 8;
or.b32 %r9, %r8, %r6;
shl.b32 %r10, %r9, 16;
or.b32 %r11, %r10, %r5;
mov.b32 %f1, %r11;
ld.u8 %r12, [%rd4+8];
ld.u8 %r13, [%rd4+9];
shl.b32 %r14, %r13, 8;
or.b32 %r15, %r14, %r12;
ld.u8 %r16, [%rd4+10];
ld.u8 %r17, [%rd4+11];
shl.b32 %r18, %r17, 8;
or.b32 %r19, %r18, %r16;
shl.b32 %r20, %r19, 16;
or.b32 %r21, %r20, %r15;
mov.b32 %f2, %r21;
ld.u8 %r22, [%rd4+4];
ld.u8 %r23, [%rd4+5];
shl.b32 %r24, %r23, 8;
or.b32 %r25, %r24, %r22;
ld.u8 %r26, [%rd4+6];
ld.u8 %r27, [%rd4+7];
shl.b32 %r28, %r27, 8;
or.b32 %r29, %r28, %r26;
shl.b32 %r30, %r29, 16;
or.b32 %r31, %r30, %r25;
mov.b32 %f3, %r31;
ld.u8 %r32, [%rd4];
ld.u8 %r33, [%rd4+1];
shl.b32 %r34, %r33, 8;
or.b32 %r35, %r34, %r32;
ld.u8 %r36, [%rd4+2];
ld.u8 %r37, [%rd4+3];
shl.b32 %r38, %r37, 8;
or.b32 %r39, %r38, %r36;
shl.b32 %r40, %r39, 16;
or.b32 %r41, %r40, %r35;
mov.b32 %f4, %r41;
mul.rn.f32 %f5, %f4, %f4;
mul.rn.f32 %f6, %f3, %f3;
add.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f2, %f2;
add.rn.f32 %f9, %f8, %f7;
mul.rn.f32 %f10, %f1, %f1;
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd2, %rd5;
st.f32 [%rd6], %f11;
ret;
// -- End function
}
Now when I take the IR my program generates and pass it to llc-8
with -mcpu sm_52, I get the expected PTX output:
// .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf
.visible .entry _Z6kernelP6float4Pf(
.param .u64 _Z6kernelP6float4Pf_param_0,
.param .u64 _Z6kernelP6float4Pf_param_1
) // @_Z6kernelP6float4Pf
{
.reg .f32 %f<12>;
.reg .b32 %r<2>;
.reg .b64 %rd<9>;
// %bb.0:
ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0];
ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 16;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
ld.global.f32 %f2, [%rd6+4];
ld.global.f32 %f3, [%rd6+8];
ld.global.f32 %f4, [%rd6+12];
mul.rn.f32 %f5, %f1, %f1;
mul.rn.f32 %f6, %f2, %f2;
add.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f3, %f3;
add.rn.f32 %f9, %f8, %f7;
mul.rn.f32 %f10, %f4, %f4;
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd3, %rd7;
st.global.f32 [%rd8], %f11;
ret;
// -- End function
}