My source: __device__ float2 get_vec() { return make_float2(1, 2); } __global__ void kernel(float2* p) { *p = get_vec(); } My optimized IR: ; Function Attrs: noinline nounwind define void @_Z6kernelP6float4Pf(%struct.float4* nocapture readonly, float* nocapture) local_unnamed_addr #1 { %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3 %4 = zext i32 %3 to i64 %.sroa.0.0..sroa_idx = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 0 %.sroa.0.0.copyload = load float, float* %.sroa.0.0..sroa_idx, align 4 %.sroa.2.0..sroa_idx1 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 1 %.sroa.2.0.copyload = load float, float* %.sroa.2.0..sroa_idx1, align 4 %.sroa.3.0..sroa_idx2 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 2 %.sroa.3.0.copyload = load float, float* %.sroa.3.0..sroa_idx2, align 4 %.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.float4, %struct.float4* %0, i64 %4, i32 3 %.sroa.4.0.copyload = load float, float* %.sroa.4.0..sroa_idx3, align 4 %5 = fmul float %.sroa.0.0.copyload, %.sroa.0.0.copyload %6 = fmul float %.sroa.2.0.copyload, %.sroa.2.0.copyload %7 = fadd float %5, %6 %8 = fmul float %.sroa.3.0.copyload, %.sroa.3.0.copyload %9 = fadd float %8, %7 %10 = fmul float %.sroa.4.0.copyload, %.sroa.4.0.copyload %11 = fadd float %10, %9 %12 = getelementptr inbounds float, float* %1, i64 %4 store float %11, float* %12, align 4, !tbaa !7 ret void } My optimized PTX. This uses the same PassManagerBuilder settings as above, but with a TargetMachine::addPassesToEmitFile CGFT_AssemblyFile to target PTX instead of createPrintModulePass to target IR. See how all the ld/st instructions are broken up into .u8 transfers and bit shifting is used to reassemble the words. When I build with -O0 for PassManagerBuilder, the PTX is not broken up, but also the ld/st instructions aren't targetted to the right address space (they are st.f32 instead of st.global.f32 as desired). // .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf .visible .entry _Z6kernelP6float4Pf( .param .u64 _Z6kernelP6float4Pf_param_0, .param .u64 _Z6kernelP6float4Pf_param_1 ) // @_Z6kernelP6float4Pf { .reg .f32 %f<12>; .reg .b32 %r<42>; .reg .b64 %rd<7>; // %bb.0: ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0]; ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1]; mov.u32 %r1, %tid.x; mul.wide.u32 %rd3, %r1, 16; add.s64 %rd4, %rd1, %rd3; ld.u8 %r2, [%rd4+12]; ld.u8 %r3, [%rd4+13]; shl.b32 %r4, %r3, 8; or.b32 %r5, %r4, %r2; ld.u8 %r6, [%rd4+14]; ld.u8 %r7, [%rd4+15]; shl.b32 %r8, %r7, 8; or.b32 %r9, %r8, %r6; shl.b32 %r10, %r9, 16; or.b32 %r11, %r10, %r5; mov.b32 %f1, %r11; ld.u8 %r12, [%rd4+8]; ld.u8 %r13, [%rd4+9]; shl.b32 %r14, %r13, 8; or.b32 %r15, %r14, %r12; ld.u8 %r16, [%rd4+10]; ld.u8 %r17, [%rd4+11]; shl.b32 %r18, %r17, 8; or.b32 %r19, %r18, %r16; shl.b32 %r20, %r19, 16; or.b32 %r21, %r20, %r15; mov.b32 %f2, %r21; ld.u8 %r22, [%rd4+4]; ld.u8 %r23, [%rd4+5]; shl.b32 %r24, %r23, 8; or.b32 %r25, %r24, %r22; ld.u8 %r26, [%rd4+6]; ld.u8 %r27, [%rd4+7]; shl.b32 %r28, %r27, 8; or.b32 %r29, %r28, %r26; shl.b32 %r30, %r29, 16; or.b32 %r31, %r30, %r25; mov.b32 %f3, %r31; ld.u8 %r32, [%rd4]; ld.u8 %r33, [%rd4+1]; shl.b32 %r34, %r33, 8; or.b32 %r35, %r34, %r32; ld.u8 %r36, [%rd4+2]; ld.u8 %r37, [%rd4+3]; shl.b32 %r38, %r37, 8; or.b32 %r39, %r38, %r36; shl.b32 %r40, %r39, 16; or.b32 %r41, %r40, %r35; mov.b32 %f4, %r41; mul.rn.f32 %f5, %f4, %f4; mul.rn.f32 %f6, %f3, %f3; add.rn.f32 %f7, %f5, %f6; mul.rn.f32 %f8, %f2, %f2; add.rn.f32 %f9, %f8, %f7; mul.rn.f32 %f10, %f1, %f1; add.rn.f32 %f11, %f10, %f9; mul.wide.u32 %rd5, %r1, 4; add.s64 %rd6, %rd2, %rd5; st.f32 [%rd6], %f11; ret; // -- End function } Now when I take the IR my program generates and pass it to llc-8 with -mcpu sm_52, I get the expected PTX output: // .globl _Z6kernelP6float4Pf // -- Begin function _Z6kernelP6float4Pf .visible .entry _Z6kernelP6float4Pf( .param .u64 _Z6kernelP6float4Pf_param_0, .param .u64 _Z6kernelP6float4Pf_param_1 ) // @_Z6kernelP6float4Pf { .reg .f32 %f<12>; .reg .b32 %r<2>; .reg .b64 %rd<9>; // %bb.0: ld.param.u64 %rd1, [_Z6kernelP6float4Pf_param_0]; ld.param.u64 %rd2, [_Z6kernelP6float4Pf_param_1]; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mov.u32 %r1, %tid.x; mul.wide.u32 %rd5, %r1, 16; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f1, [%rd6]; ld.global.f32 %f2, [%rd6+4]; ld.global.f32 %f3, [%rd6+8]; ld.global.f32 %f4, [%rd6+12]; mul.rn.f32 %f5, %f1, %f1; mul.rn.f32 %f6, %f2, %f2; add.rn.f32 %f7, %f5, %f6; mul.rn.f32 %f8, %f3, %f3; add.rn.f32 %f9, %f8, %f7; mul.rn.f32 %f10, %f4, %f4; add.rn.f32 %f11, %f10, %f9; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd3, %rd7; st.global.f32 [%rd8], %f11; ret; // -- End function }