My source __device__ float2 get_vec return make_float2 __global__ void

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
My source:
__device__ float2 get_vec() {
return make_float2(1, 2);
}
__global__ void kernel(float2* p) {
*p = get_vec();
}
My optimized IR:
; Function Attrs: noinline nounwind
define void @_Z6kernelP6float4Pf(%struct.float4* nocapture readonly, float* nocapture) local_unnamed_addr #1 {
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
%4 = zext i32 %3 to i64
%.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 0
%.sroa.0.0.copyload = load float, float* %.sroa.0.0..sroa_idx, align 4
%.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 1
%.sroa.2.0.copyload = load float, float* %.sroa.2.0..sroa_idx1, align 4
%.sroa.3.0..sroa_idx2 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 2
%.sroa.3.0.copyload = load float, float* %.sroa.3.0..sroa_idx2, align 4
%.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 3
%.sroa.4.0.copyload = load float, float* %.sroa.4.0..sroa_idx3, align 4
%5 = fmul float %.sroa.0.0.copyload, %.sroa.0.0.copyload
%6 = fmul float %.sroa.2.0.copyload, %.sroa.2.0.copyload
%7 = fadd float %5, %6
%8 = fmul float %.sroa.3.0.copyload, %.sroa.3.0.copyload
%9 = fadd float %8, %7
%10 = fmul float %.sroa.4.0.copyload, %.sroa.4.0.copyload
%11 = fadd float %10, %9
%12 = getelementptr inbounds float, float* %1, i64 %4
store float %11, float* %12, align 4, !tbaa !7
ret void
}
My optimized PTX. This uses the same PassManagerBuilder settings as above,
but with a TargetMachine::addPassesToEmitFile CGFT_AssemblyFile to target
PTX instead of createPrintModulePass to target IR. See how all the ld/st
instructions are broken up into .u8 transfers and bit shifting is used to
reassemble the words. When I build with -O0 for PassManagerBuilder, the
PTX is not broken up, but also the ld/st instructions aren't targetted to
the right address space (they are st.f32 instead of st.global.f32 as
desired).
// .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf
.visible .entry _Z6kernelP6float4Pf(
.param .u64 _Z6kernelP6float4Pf_param_0,
.param .u64 _Z6kernelP6float4Pf_param_1
) // @_Z6kernelP6float4Pf
{
.reg .f32 %f<12>;
.reg .b32 %r<42>;
.reg .b64 %rd<7>;
// %bb.0:
ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0];
ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1];
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd3, %r1, 16;
add.s64 %rd4, %rd1, %rd3;
ld.u8 %r2, [%rd4+12];
ld.u8 %r3, [%rd4+13];
shl.b32 %r4, %r3, 8;
or.b32 %r5, %r4, %r2;
ld.u8 %r6, [%rd4+14];
ld.u8 %r7, [%rd4+15];
shl.b32 %r8, %r7, 8;
or.b32 %r9, %r8, %r6;
shl.b32 %r10, %r9, 16;
or.b32 %r11, %r10, %r5;
mov.b32 %f1, %r11;
ld.u8 %r12, [%rd4+8];
ld.u8 %r13, [%rd4+9];
shl.b32 %r14, %r13, 8;
or.b32 %r15, %r14, %r12;
ld.u8 %r16, [%rd4+10];
ld.u8 %r17, [%rd4+11];
shl.b32 %r18, %r17, 8;
or.b32 %r19, %r18, %r16;
shl.b32 %r20, %r19, 16;
or.b32 %r21, %r20, %r15;
mov.b32 %f2, %r21;
ld.u8 %r22, [%rd4+4];
ld.u8 %r23, [%rd4+5];
shl.b32 %r24, %r23, 8;
or.b32 %r25, %r24, %r22;
ld.u8 %r26, [%rd4+6];
ld.u8 %r27, [%rd4+7];
shl.b32 %r28, %r27, 8;
or.b32 %r29, %r28, %r26;
shl.b32 %r30, %r29, 16;
or.b32 %r31, %r30, %r25;
mov.b32 %f3, %r31;
ld.u8 %r32, [%rd4];
ld.u8 %r33, [%rd4+1];
shl.b32 %r34, %r33, 8;
or.b32 %r35, %r34, %r32;
ld.u8 %r36, [%rd4+2];
ld.u8 %r37, [%rd4+3];
shl.b32 %r38, %r37, 8;
or.b32 %r39, %r38, %r36;
shl.b32 %r40, %r39, 16;
or.b32 %r41, %r40, %r35;
mov.b32 %f4, %r41;
mul.rn.f32 %f5, %f4, %f4;
mul.rn.f32 %f6, %f3, %f3;
add.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f2, %f2;
add.rn.f32 %f9, %f8, %f7;
mul.rn.f32 %f10, %f1, %f1;
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd2, %rd5;
st.f32 [%rd6], %f11;
ret;
// -- End function
}
Now when I take the IR my program generates and pass it to llc-8
with -mcpu sm_52, I get the expected PTX output:
// .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf
.visible .entry _Z6kernelP6float4Pf(
.param .u64 _Z6kernelP6float4Pf_param_0,
.param .u64 _Z6kernelP6float4Pf_param_1
) // @_Z6kernelP6float4Pf
{
.reg .f32 %f<12>;
.reg .b32 %r<2>;
.reg .b64 %rd<9>;
// %bb.0:
ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0];
ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 16;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6];
ld.global.f32 %f2, [%rd6+4];
ld.global.f32 %f3, [%rd6+8];
ld.global.f32 %f4, [%rd6+12];
mul.rn.f32 %f5, %f1, %f1;
mul.rn.f32 %f6, %f2, %f2;
add.rn.f32 %f7, %f5, %f6;
mul.rn.f32 %f8, %f3, %f3;
add.rn.f32 %f9, %f8, %f7;
mul.rn.f32 %f10, %f4, %f4;
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd3, %rd7;
st.global.f32 [%rd8], %f11;
ret;
// -- End function
}