
thread block add


#include <stdio.h>
#include <stdlib.h>   // malloc, free, rand
#include <cuda.h>

// Each thread adds one element; the global index combines the thread's
// position within its block and the block's position within the grid.
__global__ void add(int *a, int *b, int *c) {
        int index = threadIdx.x + blockIdx.x * blockDim.x;
        c[index] = a[index] + b[index];
}

// Fill an array with random integers
void random_ints(int *a, int N)
{
        int i;
        for (i = 0; i < N; ++i)
                a[i] = rand();
}

// Note: N must be a multiple of THREADS_PER_BLOCK so the grid covers the arrays exactly.
#define N (2048*2048)
#define THREADS_PER_BLOCK 512

int main(void) {
        int *a, *b, *c;                         // host copies of a, b, c
        int *d_a, *d_b, *d_c;                   // device copies of a, b, c
        int size = N * sizeof(int);

        // Alloc space for device copies of a, b, c
        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);

        // Alloc space for host copies of a, b, c and set up input values
        a = (int *)malloc(size); random_ints(a, N);
        b = (int *)malloc(size); random_ints(b, N);
        c = (int *)malloc(size);

        // Copy inputs to device
        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

        // Launch add() kernel on the GPU with N/THREADS_PER_BLOCK blocks
        // of THREADS_PER_BLOCK threads each
        add<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);

        // Copy result back to host (cudaMemcpy implicitly waits for the kernel to finish)
        cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

        // Cleanup
        free(a); free(b); free(c);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

        return 0;
}
