CUDA - Low performance kernel
I have a CUDA kernel with many operations and few branches. It looks like this:

    __global__ void kernel(real *randomvalues, real mu, real sigma)
    {
        int row = blockDim.y * blockIdx.y + threadIdx.y;
        int col = blockDim.x * blockIdx.x + threadIdx.x;

        if ( row >= cntimesteps || col >= cnpaths ) return;

        real alphalevel = randomvalues[row*cnpaths+col];
        real q = 0.0;
        real x = 0.0;

        if ( alphalevel < p_low)
        {
            q = sqrt( -2*log( alphalevel ) );
            x = (((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q+c6) / ((((d1*q+d2)*q+d3)*q+d4)*q+1);
        }
        else if ( alphalevel < p_high )
        {
            q = alphalevel-0.5;
            real r = q*q;
            x= (((((a1*r+a2)*r+a3)*r+a4)*r+a5)*r+a6)*q / (((((b1*r+b2)*r+b3)*r+b4)*r+b5)*r+1);
        }
        else
        {
            q = sqrt( -2*log( 1.0-alphalevel ) );
            x = -(((((c1*q+c2)*q+c3)*q+c4)*q+c5)*q+c6) / ((((d1*q+d2)*q+d3)*q+d4)*q+1);
        }

        randomvalues[row*cnpaths+col] = sigma * x + mu;
    }

where a's, b's, c ...