I want to compare the computing performance of my GTX 460 v2 against my CPU. The only way I could think of to do this easily is with sqrts (did I mention I love sqrts?).
Anyway, there is a const int that sets both the size of the array and the bound of the for loop. I am a complete CUDA newbie; however, I do understand C++ to an extent (not so much vanilla C with its pointers).
I just can't wrap my head around why changing the const int from 200,000 to anything larger (e.g. 300,000) would change the result from, let's say, 6605 -> 1.000004 to 6605 -> 6605.
Thanks for your help
Anyway, there is a const int that sets both the size of the array and the bound of the for loop. I am a complete CUDA newbie; however, I do understand C++ to an extent (not so much vanilla C with its pointers).
I just can't wrap my head around why changing the const int from 200,000 to anything larger (e.g. 300,000) would change the result from, let's say, 6605 -> 1.000004 to 6605 -> 6605.
Thanks for your help
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
using namespace std;
// Kernel that executes on the CUDA device.
//
// Replaces every element a[i] (0 <= i < N) with the result of applying the
// single-precision square root to it 21 times in a row, i.e. a[i]^(1 / 2^21).
//
// Parameters:
//   a - pointer to N floats in device memory; updated in place.
//   N - number of elements in `a`. N <= 0 leaves the array untouched.
//
// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its global
// index and then advances by the total number of launched threads, so the
// result does not depend on the grid being large enough to give every element
// its own thread. No shared memory is used.
__global__ void double_array( float *a, int N )
{
    // Number of successive square roots applied to each element.
    const int sqrt_depth = 21;

    // 64-bit index arithmetic so that neither blockIdx.x * blockDim.x nor the
    // stride increment can overflow a 32-bit int for large N or large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for ( long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
          idx < N;
          idx += stride )
    {
        float v = a[idx];
        for ( int k = 0; k < sqrt_depth; ++k )
            v = sqrtf( v );   // explicit float variant; no double promotion
        a[idx] = v;
    }
}
// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void check_cuda( cudaError_t status, const char *what )
{
    if ( status != cudaSuccess )
    {
        fprintf( stderr, "CUDA error during %s: %s\n",
                 what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

// Main routine that executes on the host.
//
// Fills an array with 0, 1, ..., N-1, runs double_array on it on the current
// CUDA device, copies the result back and prints "<index> <value>" per line.
// Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE if a host
// allocation or any CUDA call fails.
int main( void )
{
    cudaDeviceProp prop; // Struct that contains device properties
    int dev;             // Integer to save the device number

    check_cuda( cudaGetDevice( &dev ), "cudaGetDevice" );
    check_cuda( cudaGetDeviceProperties( &prop, dev ), "cudaGetDeviceProperties" );

    // Print the number and name of the CUDA device in use
    std::cout << "Cuda device: " << dev << " with name: "
              << prop.name << std::endl << std::endl;

    float *a_h = NULL;    // Host array
    float *a_d = NULL;    // Device array
    const int N = 200000; // Number of elements in arrays
    const size_t size = N * sizeof( float );

    a_h = (float *)malloc( size ); // Allocate array on host
    if ( a_h == NULL )
    {
        fprintf( stderr, "Host allocation of %lu bytes failed\n",
                 (unsigned long)size );
        return EXIT_FAILURE;
    }
    check_cuda( cudaMalloc( (void **)&a_d, size ), "cudaMalloc" );

    // Initialize host array and copy it to CUDA device
    for ( int i = 0; i < N; i++ )
        a_h[i] = (float)i;
    check_cuda( cudaMemcpy( a_d, a_h, size, cudaMemcpyHostToDevice ),
                "cudaMemcpy (host to device)" );

    // Do calculation on device.
    // 256 threads per block (a multiple of the 32-thread warp size) keeps the
    // block count small. With the previous 4 threads per block, N = 300000
    // needed 75000 blocks, which exceeds the 65535-block x-dimension grid
    // limit of compute capability 2.x devices; the launch then failed
    // silently and the unmodified input was copied back.
    const int block_size = 256;
    const int n_blocks = ( N + block_size - 1 ) / block_size; // ceil( N / block_size )
    double_array <<< n_blocks, block_size >>> ( a_d, N );

    // A kernel launch returns no status itself: an invalid configuration is
    // reported by cudaGetLastError, a fault during execution by the next
    // synchronizing call.
    check_cuda( cudaGetLastError(), "double_array launch" );
    check_cuda( cudaDeviceSynchronize(), "double_array execution" );

    // Retrieve result from device and store it in host array
    check_cuda( cudaMemcpy( a_h, a_d, size, cudaMemcpyDeviceToHost ),
                "cudaMemcpy (device to host)" );

    // Print results
    for ( int i = 0; i < N; i++ )
        std::cout << i << " " << std::fixed << a_h[i] << std::endl;

    // Free the memory on the host and the CUDA device
    free( a_h );
    check_cuda( cudaFree( a_d ), "cudaFree" );

    return EXIT_SUCCESS;
}