// Half-width, in texels, of the horizontal box filter applied in main();
// each output averages 2 * g_max_blur_radius + 1 samples.
static const int g_max_blur_radius = 6;

// Read/write texture bound to UAV slot u0. main() both reads and writes its
// .r channel. NOTE(review): the name suggests a near-field circle-of-confusion
// buffer -- confirm against the pass that fills it.
RWTexture2D<float2> g_coc_near : register(u0);

// Threads per group along x; also the number of texels each group filters.
#define N 256
// Group-shared staging row: N texels for the group plus g_max_blur_radius
// extra texels on each side, so every thread can read its full filter window.
groupshared float g_cache[N + 2*g_max_blur_radius];

// Horizontal box blur of the .r channel of g_coc_near, performed in place.
//
// Each thread group is N x 1 threads, one thread per texel. The group first
// stages its N texels plus g_max_blur_radius neighbours on each side into
// g_cache, then every thread averages the 2 * g_max_blur_radius + 1 cached
// values centred on its own texel and stores the result back to .r.
//
// group_thread_id    - thread index inside the group; only .x is used.
// dispatch_thread_id - global thread index; .xy is used as the texel coordinate.
//
// NOTE(review): input and output are the same UAV. The barrier below only
// synchronises threads of one group, while the side loads read texels that
// belong to neighbouring groups, which also write them. Those reads may
// therefore see already-blurred values -- confirm whether a separate output
// texture is required.
[numthreads(N, 1, 1)]
void main(int3 group_thread_id : SV_GroupThreadID, int3 dispatch_thread_id : SV_DispatchThreadID)
{
  // Texture size, queried as floats; it is mixed with int coordinates below.
  float2 dim;
  g_coc_near.GetDimensions(dim.x, dim.y); 
  
  // The first g_max_blur_radius threads also fetch the texels to the left of
  // the group's span, clamping the column to 0 at the texture's left edge.
  if (group_thread_id.x < g_max_blur_radius)
  {
    int x = max(dispatch_thread_id.x - g_max_blur_radius, 0);
    g_cache[group_thread_id.x] = g_coc_near[int2(x, dispatch_thread_id.y)].r;
  }
  // The last g_max_blur_radius threads also fetch the texels to the right of
  // the group's span, clamping the column to the last one (dim.x - 1).
  if (group_thread_id.x >= N-g_max_blur_radius)
  {
    // dim.x is a float, so this min() mixes int and float operands before the
    // result is stored back into an int.
    int x = min(dispatch_thread_id.x + g_max_blur_radius, dim.x - 1);
    g_cache[group_thread_id.x+2*g_max_blur_radius] = g_coc_near[int2(x, dispatch_thread_id.y)].r;
  }
  
  // Every thread stores its own texel, offset by the left padding. The
  // coordinate is clamped to the last row/column, so threads that fall past
  // the texture edge repeat the edge texel.
  g_cache[group_thread_id.x+g_max_blur_radius] = g_coc_near[min(dispatch_thread_id.xy, dim.xy-1)].r;
  
  // Wait until the whole group has filled g_cache before any thread reads it.
  GroupMemoryBarrierWithGroupSync();
  
  // Sum the window g_cache[group_thread_id.x .. group_thread_id.x + 2 * g_max_blur_radius],
  // which is centred on this thread's own cache slot.
  float c = 0.0f;
  for (int i = -g_max_blur_radius; i <= g_max_blur_radius; ++i)
  {
    int k = group_thread_id.x + g_max_blur_radius + i;
    c += g_cache[k].r;
  }

  // Divide by the sample count to get the unweighted mean.
  c /= g_max_blur_radius * 2.0f + 1.0f;
  // Only .r is assigned. The coordinate is not clamped here.
  // NOTE(review): threads outside the texture presumably rely on
  // out-of-bounds UAV writes being discarded -- confirm for the target API.
  g_coc_near[dispatch_thread_id.xy].r = c;
}