#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>

// Problem-size and tuning constants shared by the kernel and the host code.
int   const N       = 32768;   // number of source/target points allocated and initialized in main
int   const THREADS = 256;     // threads per block used for the kernel launch and for indexing
int   const MAX     = 768;     // number of float4 entries in the kernel's shared-memory tile
float const EPS2    = 0.0001;  // constant added to the squared distance before rsqrtf/sqrtf

// Return the current wall-clock time in seconds (microsecond resolution from
// gettimeofday).
//
// The device is synchronized before the clock is read so that any previously
// launched (asynchronous) kernel has finished; the difference between two
// calls therefore brackets the GPU work issued in between.
//
// A failed synchronization is reported on stderr but does not abort: the
// function still returns the host time so callers keep their original
// control flow.
double get_time()
{
  struct timeval tim;
  // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
  // direct replacement.
  cudaError_t status = cudaDeviceSynchronize();
  if (status != cudaSuccess) {
    fprintf(stderr, "get_time: device synchronization failed: %s\n",
            cudaGetErrorString(status));
  }
  gettimeofday(&tim, NULL);
  return (double) tim.tv_sec + (tim.tv_usec / 1000000.0);
}


// Direct summation of the softened 1/r potential.
//
// Each thread owns one target point (read from sourceGlob at its global index)
// and accumulates  sum_j  w_j / sqrt(|x_i - x_j|^2 + EPS2)  over all N sources.
// The accumulator starts at -w_i / sqrt(EPS2), which cancels the contribution
// the target receives from itself when it appears in the source list.
//
// Launch layout: 1-D grid, exactly THREADS threads per block, enough blocks to
// cover N targets. Threads whose global index is >= N do no global reads or
// writes of their own but still take part in the tile loads and barriers.
// Shared memory: MAX float4 entries (statically allocated).
//
// Parameters:
//   sourceGlob  device array of N float4: (x, y, z) position, w = weight
//   targetGlob  device array of N float receiving the potential per target
__global__ void direct(float4 *sourceGlob, float *targetGlob) {
  float3 d;
  __shared__ float4 sourceShrd[MAX];

  const int  gid    = blockIdx.x * THREADS + threadIdx.x;
  const bool active = gid < N;

  // Inactive threads carry a zero-weight dummy target so they can run the same
  // code path (and reach every __syncthreads) without touching memory out of range.
  float4 target = active ? sourceGlob[gid] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
  target.w *= -rsqrtf(EPS2);

  // Ceil-div: N is not required to be a multiple of MAX. Truncating here would
  // silently drop the last N % MAX sources from every sum.
  const int numTiles = (N + MAX - 1) / MAX;

  for (int iblok = 0; iblok < numTiles; iblok++) {
    // Make sure nobody is still reading the previous tile before overwriting it.
    __syncthreads();

    // Cooperative load of one tile. Slots past the end of the source array are
    // filled with zero-weight points: they contribute exactly 0 to the sum
    // (EPS2 > 0 keeps rsqrtf finite), which lets the inner loop keep its fixed
    // trip count.
    for (int j = threadIdx.x; j < MAX; j += THREADS) {
      const int src = iblok * MAX + j;
      sourceShrd[j] = (src < N) ? sourceGlob[src]
                                : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    // Tile must be fully written before any thread reads it.
    __syncthreads();

#pragma unroll 32
    for (int i = 0; i < MAX; i++) {
      d.x = target.x - sourceShrd[i].x;
      d.y = target.y - sourceShrd[i].y;
      d.z = target.z - sourceShrd[i].z;
      target.w += sourceShrd[i].w * rsqrtf(d.x * d.x + d.y * d.y + d.z * d.z + EPS2);
    }
  }

  if (active) {
    targetGlob[gid] = target.w;
  }
}

// Abort with a readable message when a CUDA runtime call fails.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Benchmark driver: fills N random points in the unit cube with weight 1/N,
// runs the `direct` kernel once on the GPU, prints the elapsed wall-clock time
// and copies the result back to the host. Returns 0 on success; exits with
// EXIT_FAILURE on any allocation or CUDA error.
int main() {

  // The benchmark was written for device 3. Keep that preference, but fall
  // back to device 0 when the machine has fewer GPUs instead of failing
  // silently.
  int deviceCount = 0;
  checkCuda(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount");
  int device = 3;
  if (device >= deviceCount) {
    fprintf(stderr, "Device %d not available (%d device(s) found); using device 0\n",
            device, deviceCount);
    device = 0;
  }
  checkCuda(cudaSetDevice(device), "cudaSetDevice");

  float4 *sourceHost,*sourceDevc;
  float  *targetHost,*targetDevc;
// Allocate memory on host and device
  sourceHost = (float4*)     malloc( N*sizeof(float4) );
  targetHost = (float *)     malloc( N*sizeof(float ) );
  if (sourceHost == NULL || targetHost == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(sourceHost);
    free(targetHost);
    return EXIT_FAILURE;
  }
  checkCuda(cudaMalloc(  (void**) &sourceDevc, N*sizeof(float4) ), "cudaMalloc(sourceDevc)");
  checkCuda(cudaMalloc(  (void**) &targetDevc, N*sizeof(float ) ), "cudaMalloc(targetDevc)");
// Initialize: uniform random positions in [0,1)^3, equal weights summing to 1
  for( int i=0; i<N; i++ ) {
    sourceHost[i].x = rand()/(1.+RAND_MAX);
    sourceHost[i].y = rand()/(1.+RAND_MAX);
    sourceHost[i].z = rand()/(1.+RAND_MAX);
    sourceHost[i].w = 1.0/N;
  }
// Direct summation on device
  checkCuda(cudaMemcpy(sourceDevc,sourceHost,N*sizeof(float4),cudaMemcpyHostToDevice),
            "cudaMemcpy host->device");

  // get_time() synchronizes the device, so start/stop bracket the whole kernel.
  double start = get_time();
  direct<<< N/THREADS, THREADS >>>(sourceDevc,targetDevc);
  double stop = get_time();
  // Kernel launches return no status; pick up launch/execution errors here
  // (after `stop` so the check does not perturb the measurement).
  checkCuda(cudaGetLastError(), "direct kernel");

  checkCuda(cudaMemcpy(targetHost,targetDevc,N*sizeof(float ),cudaMemcpyDeviceToHost),
            "cudaMemcpy device->host");

  double time = stop - start;

  printf("Kernel execution time: %f\n",time);

  checkCuda(cudaFree(sourceDevc), "cudaFree(sourceDevc)");
  checkCuda(cudaFree(targetDevc), "cudaFree(targetDevc)");


// Direct summation on host (reference check, disabled by default)
/*  float dx,dy,dz,r;
  for( int i=0; i<N; i++ ) {
    float p = - sourceHost[i].w / sqrtf(EPS2);
    for( int j=0; j<N; j++ ) {
      dx = sourceHost[i].x - sourceHost[j].x;
      dy = sourceHost[i].y - sourceHost[j].y;
      dz = sourceHost[i].z - sourceHost[j].z;
      r = sqrtf(dx * dx + dy * dy + dz * dz + EPS2);
      p += sourceHost[j].w / r;
    }
    printf("%d %f %f\n",i,p,targetHost[i]);
  }*/

  // Host buffers are released last so the reference check above can still
  // use them when it is enabled.
  free(sourceHost);
  free(targetHost);
  return 0;
}
