#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

#include <cstdlib>
int main(void) {
// define a vector of 16M int on the host
thrust::host_vector h_vec(1 << 24);
// generate 16M random numbers on the host
thrust::generate(h_vec.begin(), h_vec.end(), rand);
// transfer data to the device
thrust::device_vector d_vec = h_vec;
// sort data on the device
thrust::sort(d_vec.begin(), d_vec.end());
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
return 0;
// allocate device vector of 4 ints
thrust::device_vector<int> d_vec(4);
// obtain raw pointer to device vector's memory
int * ptr = thrust::raw_pointer_cast(&d_vec[0]);
// Filename: csort.cu
// nvcc -c -arch sm_13 csort.cu
#include <thrust/device_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
extern "C" {
//Sort for integer arrays
void sort_int_wrapper( int *data, int N)
{
// Wrap raw pointer with a device_ptr
thrust::device_ptr <int> dev_ptr(data);
// Use device_ptr in Thrust sort algorithm
thrust::sort(dev_ptr, dev_ptr+N);
}
//Sort for float arrays
void sort_float_wrapper( float *data, int N)
{
thrust::device_ptr <float> dev_ptr(data);
thrust::sort(dev_ptr, dev_ptr+N);
}
//Sort for double arrays
void sort_double_wrapper( double *data, int N)
{
thrust::device_ptr <double> dev_ptr(data);
thrust::sort(dev_ptr, dev_ptr+N);
}
}
nvcc -c -arch sm_13 csort.cu
! Fortran interface to the C-linkage sort wrappers (sort_*_wrapper).
! The generic name thrustsort resolves to the specific routine matching the
! type of the device array passed in.
module thrust

  interface thrustsort

    ! Sort for integer device arrays; bound to C symbol sort_int_wrapper.
    subroutine sort_int( input,N) bind(C,name="sort_int_wrapper")
      use iso_c_binding
      integer(c_int),device:: input(*)
      integer(c_int),value:: N   ! element count, passed by value
    end subroutine

    ! Sort for single precision device arrays; bound to sort_float_wrapper.
    subroutine sort_float( input,N) bind(C,name="sort_float_wrapper")
      use iso_c_binding
      real(c_float),device:: input(*)
      integer(c_int),value:: N   ! element count, passed by value
    end subroutine

    ! Sort for double precision device arrays; bound to sort_double_wrapper.
    subroutine sort_double( input,N) bind(C,name="sort_double_wrapper")
      use iso_c_binding
      real(c_double),device:: input(*)
      integer(c_int),value:: N   ! element count, passed by value
    end subroutine

  end interface

end module thrust
! Fills a 10-element host array, copies it to the device, sorts it through
! the thrustsort generic interface, and prints the array before and after.
program testsort
  use thrust
  real, allocatable :: cpuData(:)
  real, allocatable, device :: gpuData(:)
  integer:: N=10

  allocate(cpuData(N))
  allocate(gpuData(N))

  do i=1,N
    cpuData(i)=random(i)
  end do
  ! Overwrite one element with a fixed value
  cpuData(5)=100.

  print *,"Before sorting", cpuData

  ! Host-to-device copy by assignment
  gpuData=cpuData

  call thrustsort(gpuData,size(gpuData))

  ! Device-to-host copy by assignment
  cpuData=gpuData

  print *,"After sorting", cpuData
end program
$ pgf90 -rc=rc4.0 -Mcuda=cc20 -O3 thrust_module.cuf sample_sort.cuf csort.o
thrust_module.cuf:
sample_sort.cuf:
$ ./a.out
Before sorting 4.1630346E-02 0.9124327 0.7832350 0.6540373 100.0000 0.3956419 0.2664442 0.1372465 8.0488138E-03 0.8788511
After sorting 8.0488138E-03 4.1630346E-02 0.1372465 0.2664442 0.3956419 0.6540373 0.7832350 0.8788511 0.9124327 100.0000
- declare two arrays, cpuData and gpuData.
- allocate them using the standard allocate
- copy cpuData from the host to gpuData on the GPU with a simple assignment
- call the Thrust sort routine
- copy sorted array back to the host
- print the sorted array
! Times the thrustsort call on a 100,000,000-element single precision array
! using CUDA events, then prints the elapsed time and the first and last
! five elements of the sorted result.
program timesort
  use cudafor
  use thrust
  implicit none
  real, allocatable :: cpuData(:)
  real, allocatable, device :: gpuData(:)
  integer:: i,N=100000000

  ! cuda events for elapsing
  type ( cudaEvent ) :: startEvent , stopEvent
  real :: time, random
  integer :: istat

  ! Create events
  istat = cudaEventCreate ( startEvent )
  istat = cudaEventCreate ( stopEvent )

  ! Allocate arrays
  allocate(cpuData(N))
  allocate(gpuData(N))

  do i=1,N
    cpuData(i)=random(i)
  end do

  print *,"Sorting array of ",N, " single precision"

  ! Host-to-device copy by assignment (outside the timed region)
  gpuData=cpuData

  ! Only the sort call sits between the start and stop events
  istat = cudaEventRecord ( startEvent , 0)
  call thrustsort(gpuData,size(gpuData))

  istat = cudaEventRecord ( stopEvent , 0)
  istat = cudaEventSynchronize ( stopEvent )
  istat = cudaEventElapsedTime ( time , startEvent , stopEvent )

  ! Device-to-host copy by assignment
  cpuData=gpuData

  print *," Sorted array in:",time," (ms)"

  !Print the first five elements and the last five.
  print *,"After sorting", cpuData(1:5),cpuData(N-4:N)
end program
$ pgf90 -O3 -rc=rc4.0 -Mcuda=cc20 thrust_module.cuf time_sort.cuf csort.o -o time_sort
thrust_module.cuf:
time_sort.cuf:
$ ./time_sort
Sorting array of 100000000 single precision
Sorted array in: 222.1711 (ms)
After sorting 7.0585919E-09 1.0318221E-08 1.9398616E-08 3.1738640E-08 4.4078664E-08 0.9999999 0.9999999 1.000000 1.000000 1.000000
$ ./a.out
Sorting array of 100000000 single precision
Sorted array in: 225.0452 (ms)
After sorting 7.0585919E-09 1.0318221E-08 1.9398616E-08 3.1738640E-08 4.4078664E-08 0.9999999 0.9999999 0.9999999 1.000000 1.000000
Thanks ! This helped a lot
ReplyDeleteJust wanted to say thanks! please post any thing else about Cuda FORTRAN please!
ReplyDelete