I want to see the computing performance of my GTX 460 v2 vs my CPU. The only way I know to do this easily is sqrts (did I mention I love sqrts?).
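For the CPU side of the comparison I was just going to time the same repeated square roots on the host with std::chrono, something like this (rough sketch only, the timing code is not part of the CUDA program below):

Code:

#include <chrono>
#include <cmath>
#include <iostream>
#include <vector>

using namespace std;

int main( void )
{
   const int N = 200000; // Same number of elements as the CUDA version

   vector<float> a( N );
   for ( int i = 0; i < N; i++ )
      a[i] = (float)i;

   // Time the same repeated square roots on the CPU
   auto start = chrono::high_resolution_clock::now();
   for ( int i = 0; i < N; i++ )
   {
      float x = a[i];
      for ( int k = 0; k < 21; k++ ) // 21 square roots per element, like the kernel
         x = sqrt( x );
      a[i] = x;
   }
   auto stop = chrono::high_resolution_clock::now();

   chrono::duration<double, milli> ms = stop - start;
   cout << "CPU time: " << ms.count() << " ms, a[6605] = " << fixed << a[6605] << endl;
}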
Anyway, there is a const int that sets the size of the array as well as the for loop. I am a complete CUDA newbie, but I do understand C++ to an extent (not so much vanilla C with its pointers).
I just can't wrap my head around why changing the const int from 200,000 to anything larger (e.g. 300,000) changes the result at, say, index 6605 from 6605 -> 1.000004 to 6605 -> 6605.
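In case it helps, this is roughly what I was planning to add right after the kernel call to see whether the launch itself reports an error (it is not in the code below yet, and I'm not sure it's the right way to check):

Code:

   // Right after the double_array <<< n_blocks, block_size >>> launch:
   cudaError_t err = cudaGetLastError(); // Did the launch itself fail?
   if ( err != cudaSuccess )
      cout << "Launch error: " << cudaGetErrorString( err ) << endl;

   err = cudaDeviceSynchronize(); // Did the kernel fail while running?
   if ( err != cudaSuccess )
      cout << "Kernel error: " << cudaGetErrorString( err ) << endl;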
Thanks for your help

Code:
	#include <stdio.h>
#include <iostream>
#include <math.h>
 
using namespace std;
 
// Kernel that executes on the CUDA device
__global__ void double_array( float *a, int N )
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if ( idx < N )
   {
      // Take the square root 21 times per element to give the GPU some work
      float x = a[idx];
      for ( int k = 0; k < 21; k++ )
         x = sqrtf( x );
      a[idx] = x;
   }
}
 
// Main routine that executes on the host
int main( void )
{
 
   cudaDeviceProp prop; // Struct that contains device properties
   int dev; // Integer to save the device number
 
   cudaGetDevice(&dev); // Get the number of the device in use
   cudaGetDeviceProperties(&prop, dev); // Get the properties
 
   // Print the number and name of the CUDA device in use
   cout << "Cuda device: " << dev << " with name: " 
      << prop.name << endl << endl;
 
   float *a_h, *a_d; // Pointer to host & device arrays
   const int N = 200000; // Number of elements in arrays
   size_t size = N * sizeof( float );
   a_h = (float *)malloc( size );    // Allocate array on host
   cudaMalloc( (void **)&a_d, size ); // Allocate array on device
 
   // Initialize host array and copy it to CUDA device
   for ( int i = 0; i < N; i++ )
      a_h[i] = (float)i;
   cudaMemcpy( a_d, a_h, size, cudaMemcpyHostToDevice );
 
   // Do calculation on device:
   int block_size = 4;
   int n_blocks   = N / block_size + ( N % block_size == 0 ? 0 : 1 );
   double_array <<< n_blocks, block_size >>> ( a_d, N );
 
   // Retrieve result from device and store it in host array
   cudaMemcpy( a_h, a_d, sizeof( float ) * N, cudaMemcpyDeviceToHost );
 
   // Print results
   for ( int i = 0; i < N; i++ )
   {
      cout << i << " " << fixed << a_h[i] << endl;
   }
 
   // Free the memory on the host and the CUDA device
   free( a_h );
   cudaFree( a_d );
}
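Also, since the program already fetches the cudaDeviceProp struct, I was thinking of printing a couple more of its fields right after n_blocks is computed, to compare the launch configuration I'm asking for against the limits the card reports (again just a sketch, not in the program above):

Code:

   // Right after n_blocks is computed: show the device limits next to
   // the launch configuration being requested
   cout << "Max threads per block: " << prop.maxThreadsPerBlock << endl;
   cout << "Max grid size (x):     " << prop.maxGridSize[0] << endl;
   cout << "Requesting " << n_blocks << " blocks of "
        << block_size << " threads" << endl;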