#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
 int main(void) {
 // define a vector of 16M int on the host
 thrust::host_vector h_vec(1 << 24);  
// generate 16M random numbers on the host
thrust::generate(h_vec.begin(), h_vec.end(), rand);
 // transfer data to the device
 thrust::device_vector d_vec = h_vec;  
// sort data on the device
 thrust::sort(d_vec.begin(), d_vec.end()); 
// transfer data back to host
 thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); 
return 0;
// allocate device vector (the element type must be spelled out)
thrust::device_vector<int> d_vec(4);
// obtain raw pointer to device vector's memory
int *ptr = thrust::raw_pointer_cast(&d_vec[0]);
// Filename: csort.cu
// nvcc  -c -arch sm_13 csort.cu
#include <thrust/device_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
extern "C" {
//Sort for  integer arrays
void sort_int_wrapper( int *data, int N)
 {
  // Wrap raw pointer with a device_ptr
    thrust::device_ptr <int>  dev_ptr(data); 
  // Use device_ptr in Thrust sort algorithm
    thrust::sort(dev_ptr, dev_ptr+N);
}
//Sort for  float arrays
 void sort_float_wrapper( float *data, int N)
 {
   thrust::device_ptr <float>  dev_ptr(data); 
   thrust::sort(dev_ptr, dev_ptr+N);
 }
//Sort for  double arrays
 void sort_double_wrapper( double *data, int N)
 {
  thrust::device_ptr <double> dev_ptr(data); 
  thrust::sort(dev_ptr, dev_ptr+N);
 }
}
   nvcc  -c -arch sm_13 csort.cu

! Generic interface thrustsort over the C wrappers sort_int_wrapper,
! sort_float_wrapper and sort_double_wrapper. The specific routine is
! selected from the type of the device array that is passed in.
module thrust

interface thrustsort

  ! integer(c_int) device arrays
  subroutine sort_int( input,N) bind(C,name="sort_int_wrapper")
    use iso_c_binding, only: c_int
    integer(c_int),device:: input(*)
    integer(c_int),value:: N
  end subroutine

  ! real(c_float) device arrays
  subroutine sort_float( input,N) bind(C,name="sort_float_wrapper")
    use iso_c_binding, only: c_int, c_float
    real(c_float),device:: input(*)
    integer(c_int),value:: N
  end subroutine

  ! real(c_double) device arrays
  subroutine sort_double( input,N) bind(C,name="sort_double_wrapper")
    use iso_c_binding, only: c_int, c_double
    real(c_double),device:: input(*)
    integer(c_int),value:: N
  end subroutine

end interface

end module thrust

! Fill a small host array, sort it on the GPU through thrustsort and print
! the array before and after sorting.
program testsort
  use thrust
  implicit none
  real, allocatable         :: cpuData(:)
  real, allocatable, device :: gpuData(:)
  ! random is not a standard intrinsic; presumably supplied by the compiler
  ! runtime -- TODO confirm for the toolchain in use.
  real    :: random
  integer :: i
  integer :: N=10

  allocate(cpuData(N))
  allocate(gpuData(N))

  do i=1,N
    cpuData(i)=random(i)
  end do
  ! Plant one value that is easy to spot in the output
  cpuData(5)=100.

  print *,"Before sorting", cpuData

  ! Assignment between a host and a device array copies the data across
  gpuData=cpuData
  call thrustsort(gpuData,size(gpuData))
  cpuData=gpuData

  print *,"After sorting", cpuData
end program

$ pgf90 -rc=rc4.0 -Mcuda=cc20 -O3 thrust_module.cuf sample_sort.cuf csort.o
thrust_module.cuf:
sample_sort.cuf:
$ ./a.out
 Before sorting   4.1630346E-02   0.9124327       0.7832350       0.6540373         100.0000       0.3956419       0.2664442       0.1372465        8.0488138E-03   0.8788511
 After sorting   8.0488138E-03   4.1630346E-02   0.1372465       0.2664442        0.3956419       0.6540373       0.7832350       0.8788511        0.9124327        100.0000
- declare two arrays, cpuData and gpuData.
- allocate them using the standard allocate statement
- copy cpuData from the host to gpuData on the GPU with a simple assignment
- call the Thrust sort routine
- copy sorted array back to the host
- print the sorted array
! Time thrustsort on a large single-precision array using CUDA events, then
! print the elapsed time and the first and last five sorted elements.
program timesort
  use cudafor
  use thrust
  implicit none
  real, allocatable         :: cpuData(:)
  real, allocatable, device :: gpuData(:)
  integer:: i,N=100000000

  ! cuda events for elapsing
  type ( cudaEvent ) :: startEvent , stopEvent
  real :: time, random
  integer :: istat

  ! Create events
  istat = cudaEventCreate ( startEvent )
  istat = cudaEventCreate ( stopEvent )

  ! Allocate arrays
  allocate(cpuData(N))
  allocate(gpuData(N))

  do i=1,N
    cpuData(i)=random(i)
  end do

  print *,"Sorting array of ",N, " single precision"

  gpuData=cpuData

  ! Only the sort call sits between the two event records, so the
  ! host/device copies are not part of the measured time.
  istat = cudaEventRecord ( startEvent , 0)
  call thrustsort(gpuData,size(gpuData))
  istat = cudaEventRecord ( stopEvent , 0)
  istat = cudaEventSynchronize ( stopEvent )
  istat = cudaEventElapsedTime ( time , startEvent , stopEvent )

  cpuData=gpuData

  print *," Sorted array in:",time," (ms)"

  !Print the first five elements and the last five.
  print *,"After sorting", cpuData(1:5),cpuData(N-4:N)

  ! Release the events created above
  istat = cudaEventDestroy ( startEvent )
  istat = cudaEventDestroy ( stopEvent )
end program

$ pgf90 -O3 -rc=rc4.0 -Mcuda=cc20 thrust_module.cuf time_sort.cuf csort.o -o time_sort
thrust_module.cuf:
time_sort.cuf:
$ ./time_sort
 Sorting array of     100000000  single precision
 Sorted array in:    222.1711      (ms)
 After sorting   7.0585919E-09   1.0318221E-08   1.9398616E-08   3.1738640E-08    4.4078664E-08   0.9999999       0.9999999        1.000000         1.000000        1.000000
$ ./a.out
 Sorting array of     100000000  single precision
 Sorted array in:    225.0452      (ms)
 After sorting   7.0585919E-09   1.0318221E-08   1.9398616E-08   3.1738640E-08    4.4078664E-08   0.9999999       0.9999999       0.9999999         1.000000        1.000000
