- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello,
I have got another question concerning OpenCL. My problem: When I execute the same kernel on GPU and on FPGA (pre-compiled binary), I get different results when I read the buffer afterwards. Are there any device specific operations that can result in a different output? The kernel code:# pragma OPENCL EXTENSION cl_amd_printf : enable
struct __attribute__ ((packed)) gm_component {
float w;
float4 m;
float16 P;
};
struct __attribute__ ((packed)) gm_component_survive {
float w;
float4 m;
float16 P;
float2 eta;
float4 S;
float8 K;
};
//4x4 matrix multiplication
float16 matrixMult4x4f(float16 M, float16 N,float4 unit4)
{
//return M*N
float4 a=M.s0123;
float4 b=M.s4567;
float4 c=M.s89ab;
float4 d=M.scdef;
float4 e=N.s048c;
float4 f=N.s159d;
float4 g=N.s26ae;
float4 h=N.s37bf;
float16 tmp = (float16){dot(a*e,unit4),dot(a*f,unit4),dot(a*g,unit4),dot(a*h,unit4),
dot(b*e,unit4),dot(b*f,unit4),dot(b*g,unit4),dot(b*h,unit4),
dot(c*e,unit4),dot(c*f,unit4),dot(c*g,unit4),dot(c*h,unit4),
dot(d*e,unit4),dot(d*f,unit4),dot(d*g,unit4),dot(d*h,unit4)};
return tmp;
}
// OpenCL Kernel to compute multiplication and addition
__kernel void
update(__global struct gm_component_survive * restrict predict_mixture,
float8 Hk, __global float2 *Zk,
__global struct gm_component * restrict update_mixture,int size, float pr_dk, int updateMixtureSize)
{
int zk_index=get_global_id(0);
int survive_index = get_global_id(1);
if(zk_index<size && survive_index<updateMixtureSize){
__global struct gm_component *um = &update_mixture;
__global struct gm_component_survive *pm = &predict_mixture;
float2 unit2={1.f,1.f};
float4 unit4={1.f,1.f,1.f,1.f};
//Multivarate guassian calculation
//calculate miu
float2 miu = Zk -pm->eta;
//Inverse of covarience.. start
float4 inv_covariance = pm->S;
//calculate denominator
float determenent = inv_covariance.s3*inv_covariance.s0 - inv_covariance.s2*inv_covariance.s1;
inv_covariance =(float4){inv_covariance.s3,-inv_covariance.s1,-inv_covariance.s2,inv_covariance.s0};
inv_covariance = inv_covariance/determenent; //inverse calculation end
//multiplication of miu.T*covariance*miu
float2 number = { dot(miu*inv_covariance.even,unit2),dot(miu*inv_covariance.odd,unit2)};
number=number*miu;
//Calculate denominator :pow(2*M_PI, 2)* determenent 39.4784
float denom = 39.4784* determenent;
denom=sqrt(denom);
//calculate weight
um->w = pr_dk*pm->w*native_exp(-0.5f * dot(number,unit2)) / denom;
//calculate mean
number = (float2){dot(Hk.lo*pm->m,unit4) , dot(Hk.hi*pm->m,unit4)};
number = Zk-number;
inv_covariance =(float4){dot(pm->K.lo.lo*number,unit2),dot(pm->K.lo.hi*number,unit2),dot(pm->K.hi.lo*number,unit2),dot(pm->K.hi.hi*number,unit2)};
um->m = pm->m+inv_covariance;
//calculate covarince
float16 temp1 = (float16){1-dot(pm->K.lo.lo*Hk.s04,unit2),-dot(pm->K.lo.lo*Hk.s15,unit2),-dot(pm->K.lo.lo*Hk.s26,unit2),-dot(pm->K.lo.lo*Hk.s37,unit2),
-dot(pm->K.lo.hi*Hk.s04,unit2),1-dot(pm->K.lo.hi*Hk.s15,unit2),-dot(pm->K.lo.hi*Hk.s26,unit2),-dot(pm->K.lo.hi*Hk.s37,unit2),
-dot(pm->K.hi.lo*Hk.s04,unit2),-dot(pm->K.hi.lo*Hk.s15,unit2),1-dot(pm->K.hi.lo*Hk.s26,unit2),-dot(pm->K.hi.lo*Hk.s37,unit2),
-dot(pm->K.hi.hi*Hk.s04,unit2),-dot(pm->K.hi.hi*Hk.s15,unit2),-dot(pm->K.hi.hi*Hk.s26,unit2),1-dot(pm->K.hi.hi*Hk.s37,unit2)};
um->P = matrixMult4x4f(temp1, pm->P,unit4);;
}
}
Any hints will be apprecciated. Tobias
Link Copied
6 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
How different are the results? If the results are only different in the last few digits of the numbers, then it is probably caused by some rounding difference. Note that if you use --fpc or --fp-relaxed for FPGA compilation, output of floating-point operations will be different.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank you for your answer!
The output is somehow very different. As example look at the following outputs obtained by GPU and FPGA: FPGA: w: 0 m: 0.1 51410 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 56227.3 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 95197.5 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 85130.8 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0.1 68243.1 0.1 0 P: 8.25 13.5 0 0 -879.562 -1435.38 -0.0762125 -0.124711 0 0 8.25 13.5 0 0 13.5 26 w: 0 m: 0 0 0 0 P: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 GPU: w: 0 m: 4.48707 0 2.19607 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 4.89815 0 2.33002 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 8.22379 0 2.51016 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 7.36467 0 3.0552 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 5.92356 0 2.4224 0 P: 8.17379 13.3753 0 0 13.5 26 0 0 0 0 8.17379 13.3753 0 0 13.5 26 w: 0 m: 0 0 0 0 P: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank you for your answer!
The results are quite different. A sample is shown below:
FPGA:
w: 0
m: 0.1 51410 0.1 0
P: 8.25 13.5 0 0
-879.562 -1435.38 -0.0762125 -0.124711
0 0 8.25 13.5
0 0 13.5 26
w: 0
m: 0.1 56227.3 0.1 0
P: 8.25 13.5 0 0
-879.562 -1435.38 -0.0762125 -0.124711
0 0 8.25 13.5
0 0 13.5 26
w: 0
m: 0.1 95197.5 0.1 0
P: 8.25 13.5 0 0
-879.562 -1435.38 -0.0762125 -0.124711
0 0 8.25 13.5
0 0 13.5 26
w: 0
m: 0.1 85130.8 0.1 0
P: 8.25 13.5 0 0
-879.562 -1435.38 -0.0762125 -0.124711
0 0 8.25 13.5
0 0 13.5 26
w: 0
m: 0.1 68243.1 0.1 0
P: 8.25 13.5 0 0
-879.562 -1435.38 -0.0762125 -0.124711
0 0 8.25 13.5
0 0 13.5 26
w: 0
m: 0 0 0 0
P: 0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
GPU:
w: 0
m: 4.48707 0 2.19607 0
P: 8.17379 13.3753 0 0
13.5 26 0 0
0 0 8.17379 13.3753
0 0 13.5 26
w: 0
m: 4.89815 0 2.33002 0
P: 8.17379 13.3753 0 0
13.5 26 0 0
0 0 8.17379 13.3753
0 0 13.5 26
w: 0
m: 8.22379 0 2.51016 0
P: 8.17379 13.3753 0 0
13.5 26 0 0
0 0 8.17379 13.3753
0 0 13.5 26
w: 0
m: 7.36467 0 3.0552 0
P: 8.17379 13.3753 0 0
13.5 26 0 0
0 0 8.17379 13.3753
0 0 13.5 26
w: 0
m: 5.92356 0 2.4224 0
P: 8.17379 13.3753 0 0
13.5 26 0 0
0 0 8.17379 13.3753
0 0 13.5 26
w: 0
m: 0 0 0 0
P: 0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thank you for your answer!
The outputs are in fact very different.- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
One example:
FPGA: m: 0.1 51410 0.1 0 GPU: m: 4.48707 0 2.19607 0- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
That is certainly not because of rounding difference. Are you using the exact same kernel and host (minus the clCreateProgram difference) code for both? Have you tried Altera's emulator?
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page