C   78

thread block uneven add

Guest on 27th May 2022 09:21:36 AM

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda.h>
  #include <cuda_runtime.h>
  3.  
  /*
   * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
   *
   * Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid; each
   * thread starts at its global index and advances by the total number of
   * launched threads, so the result is complete whether the grid covers n
   * exactly, overshoots it, or is smaller than n. Uses no shared memory.
   *
   * a, b : input arrays of at least n ints (only read)
   * c    : output array of at least n ints
   * n    : number of elements to process; n <= 0 does nothing
   */
  __global__ void add(const int *a, const int *b, int *c, int n) {
          // 64-bit index arithmetic so blockIdx.x * blockDim.x and the stride
          // increment cannot wrap when the launch is large.
          const long long stride = (long long)gridDim.x * blockDim.x;
          for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
               i < n;
               i += stride)
                  c[i] = a[i] + b[i];
  }
  9.  
  /*
   * Fills the first N entries of `a` with successive values returned by rand().
   * Entries are written in ascending order, one rand() call per entry.
   * A non-positive N leaves the array untouched and draws nothing from rand().
   */
  void random_ints(int* a, int N)
  {
          if (N <= 0)
                  return;

          int *const stop = a + N;
          for (int *slot = a; slot != stop; ++slot)
                  *slot = rand();
  }
  16.  
  17.  
  // Problem size and launch configuration. N happens to be a multiple of
  // THREADS_PER_BLOCK here, but it does not have to be: the block count is
  // rounded up at launch and add() receives N so it stays within bounds.
  #define N (2048*2048)
  #define THREADS_PER_BLOCK 512

  // Evaluates a CUDA runtime call; on any status other than cudaSuccess prints
  // the failing location and the runtime's error string, then exits non-zero.
  #define CUDA_CHECK(call)                                                    \
          do {                                                                \
                  cudaError_t cuda_check_err_ = (call);                       \
                  if (cuda_check_err_ != cudaSuccess) {                       \
                          fprintf(stderr, "CUDA error %s:%d: %s\n",           \
                                  __FILE__, __LINE__,                         \
                                  cudaGetErrorString(cuda_check_err_));       \
                          exit(EXIT_FAILURE);                                 \
                  }                                                           \
          } while (0)

  /*
   * Adds two N-element vectors of random ints on the GPU.
   *
   * Steps: allocate device and host buffers, fill the host inputs with
   * random_ints(), copy them to the device, launch add(), copy the result back,
   * and release everything.
   *
   * Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or any
   * CUDA runtime call fails.
   */
  int main(void) {
          int *a = NULL, *b = NULL, *c = NULL;            // host copies of a, b, c
          int *d_a = NULL, *d_b = NULL, *d_c = NULL;      // device copies of a, b, c
          const size_t size = (size_t)N * sizeof(int);    // bytes per vector

          // Alloc space for device copies of a, b, c
          CUDA_CHECK(cudaMalloc((void **)&d_a, size));
          CUDA_CHECK(cudaMalloc((void **)&d_b, size));
          CUDA_CHECK(cudaMalloc((void **)&d_c, size));

          // Alloc space for host copies of a, b, c
          a = (int *)malloc(size);
          b = (int *)malloc(size);
          c = (int *)malloc(size);
          if (a == NULL || b == NULL || c == NULL) {
                  fprintf(stderr, "host allocation of %zu bytes failed\n", size);
                  free(a); free(b); free(c);
                  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
                  return EXIT_FAILURE;
          }

          // Setup input values (a first, then b, so the rand() sequence is consumed in order)
          random_ints(a, N);
          random_ints(b, N);

          // Copy inputs to device
          CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
          CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

          // Launch add() with ceil(N / THREADS_PER_BLOCK) blocks of
          // THREADS_PER_BLOCK threads each, so every element has a thread.
          const int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
          add<<<blocks, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
          // A launch returns no status itself; configuration errors are
          // reported through cudaGetLastError().
          CUDA_CHECK(cudaGetLastError());

          // Copy result back to host. This blocking copy also reports any
          // error raised while the kernel was executing.
          CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

          // Cleanup
          free(a); free(b); free(c);
          CUDA_CHECK(cudaFree(d_a));
          CUDA_CHECK(cudaFree(d_b));
          CUDA_CHECK(cudaFree(d_c));

          return 0;
  }

Raw Paste


Login or Register to edit or fork this paste. It's free.