my cuda

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
__global__ void NewCycleKernel(double *out1, double *out2, double *A, double *b, double *At, double *AtA, double *AAt, double *Atb)
{
int idx;
double temp;
if (threadIdx.x > 255)
{
idx = threadIdx.x - 256;
}
else
{
idx = threadIdx.x;
}
__shared__ double old_out1[256], old_out2[16];
__shared__ double Aout2[256];
__shared__ double out4[256], out[256], re[128], im[128], out3[128];
for (int i = 0; i < ITER; ++i)
{
//Aout2 = A'*out2;
//out4 = out1;
//out = out1 + Aout2;
if (threadIdx.x < 256)
{
for (int k = 0; k < 16; k++)
{
Aout2[idx] += At[idx * 16 + k] * out2[k];
}
out4[idx] = out1[idx];
out[idx] = out1[idx] + Aout2[idx];
}
//re = out(1:MFFT);
//im = out(MFFT+1:MFFT*2);
if (threadIdx.x < 128)
{
re[idx] = out[idx];
}
else if (threadIdx.x < 256)
{
im[idx-128] = out[idx];
}
__syncthreads();
//out3 =(re.^2 + im.^2).^0.5;
//for j =1:MFFT
// if out3(j)>1
// out(j) = re(j)/out3(j);
// out(j+MFFT) = im(j)/out3(j);
// end
//end
if (threadIdx.x < 128)
{
out3[idx] = sqrt(re[idx] * re[idx] + im[idx] * im[idx]);
if (out3[idx] > 1)
{
out[idx] = re[idx] / out3[idx];
out[idx + 128] = im[idx] / out3[idx];
}
}
__syncthreads();
//Main loop
if (threadIdx.x > 255) //-A*out4 - AAt*out2 + A*out + b
{
temp = 0;
for (int k = 0; k < 256; k++)
{
temp += A[idx * 256 + k] * (out[k] - out4[k]); //- A*out4 + A*out
}
for (int k = 0; k < 16; k++)
{
temp -= AAt[idx * 16 + k] * out2[idx]; //- AAt*out2
}
out2[idx] = out2[idx] + delta_t * (temp + b[idx]); //+ b
}
else //Ab + Aout2 - AA*out1 - out
{
temp = 0;
temp = Atb[idx] + Aout2[idx] - out[idx]; //Ab + Aout2 - out
for (int k = 0; k < 256; k++)
{
temp -= AtA[idx * 256 + k] * out1[k]; //- AA*out1
}
out1[idx] = out1[idx] + delta_t * temp;
}
__syncthreads();
}
}