__kernel void scanExclusiveNaive(
__global uint *d_Dst,
__global uint *d_Src,
__local uint *l_Data,
uint n
){
int thid = get_local_id(0);
int offset = 1;
int ai = thid;
int bi = thid + (n/2);
l_Data[ai] = d_Src[ai];
l_Data[bi] = d_Src[bi]; // [1]
// Up sweep reduce phase
// build sum in place up the tree
for (int d = n>>1; d > 0; d >>= 1) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thid <>
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
l_Data[bi] += l_Data[ai];
}
offset *= 2;
}
// Down sweep
// clear the last element
if (thid == 0) {
l_Data[n - 1] = 0;
}
// traverse down tree & build scan
for (int d = 1; d <>2)
{
offset >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (thid <>
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
float t = l_Data[ai];
l_Data[ai] = l_Data[bi];
l_Data[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
d_Dst[ai] = l_Data[ai]; //[2]
d_Dst[bi] = l_Data[bi];
}
But here's a couple of weird things. First, look at the code marked by [1]. [bi] is calculated as:
int bi = thid + (n/2);
That means if we are processing say 32 numbers and our thread Id thid is 31, [bi] will index from global:
31 + (32 / 2) = 47
[47] does not exist, so hopefully at worst it will be zero.
Next, at the end of the kernel, each work item writes two values to global memory at different places. To me, this seems flawed for a couple of reasons. Again, to reiterate, it is doubtless that it is my understanding, not the code that is flawed.
First, why not have one thread write one value? This would mean coalesced writes, which is more efficient.
I can only guess that this example is intended to process a sample larger than itself. For example 16 threads processing a scan on 32 numbers. That's why each thread must write two values.
Curiously, when I ran the kernel multiple times, it generated spurious values. Running once was fine. Running more than once gave incorrect results. Changing the kernel to write one value at index of [thid]resolved the issue:
// d_Dst[ai] = l_Data[ai];
// d_Dst[bi] = l_Data[bi];
d_Dst[thid] = l_Data[thid]; // write results to device memory
No comments:
Post a Comment