0% found this document useful (0 votes)
4 views4 pages

Cuda

The document contains CUDA programming assignments by Abhishek Kumar Yadav, which include summing an array of 10 numbers using a kernel function, adding three vectors of 10 elements each, multiplying three scalar variables, and swapping two elements without using a third variable. Each assignment includes the necessary CUDA code, memory allocation, and kernel execution. The document demonstrates fundamental CUDA operations and memory management for parallel computing.

Uploaded by

yadavabhi4268
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views4 pages

Cuda

The document contains CUDA programming assignments by Abhishek Kumar Yadav, which include summing an array of 10 numbers using a kernel function, adding three vectors of 10 elements each, multiplying three scalar variables, and swapping two elements without using a third variable. Each assignment includes the necessary CUDA code, memory allocation, and kernel execution. The document demonstrates fundamental CUDA operations and memory management for parallel computing.

Uploaded by

yadavabhi4268
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

CUDA ASSIGNMENT :

Abhishek Kumar Yadav (CSE-AIML/22/62)


1. Take an array of 10 numbers and perform the summation of these 10 numbers. Use a kernel function.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Sums the first n elements of `array` into *result using a pairwise
// tree reduction in shared memory.
// Expected launch: a single block with blockDim.x == n and n <= 10
// (partialSum is statically sized for this assignment's 10 elements).
__global__ void sumKernel(int *array, int *result, int n) {

    __shared__ int partialSum[10]; // one slot per participating thread

    int tid = threadIdx.x;

    // Load one element per thread. The store is guarded so threads with
    // tid >= n neither read past `array` nor write past `partialSum`
    // (the original wrote partialSum[tid] unconditionally, which is
    // out of bounds whenever blockDim.x > 10).
    if (tid < n) {
        partialSum[tid] = array[tid];
    }
    __syncthreads();

    // Pairwise tree reduction. The added `tid + stride < n` guard fixes
    // an out-of-bounds read present in the original when n is not a
    // power of two: for n = 10, tid = 8, stride = 2 it read
    // partialSum[10], one past the end of the shared array.
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0 && tid + stride < n) {
            partialSum[tid] += partialSum[tid + stride];
        }
        __syncthreads(); // barrier kept outside the divergent branch
    }

    // Thread 0 publishes the final sum.
    if (tid == 0) {
        *result = partialSum[0];
    }
}

#include <stdlib.h>

// Abort-on-error helper: CUDA runtime calls fail silently unless their
// return codes are inspected, so print the error string and exit on the
// first failure instead of propagating garbage results.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: sums 10 integers on the device via sumKernel and prints
// the result.
int main() {
    const int n = 10;
    int h_array[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; // input data
    int h_result = 0;                                 // receives the sum

    // Device buffers: the input array and the scalar result.
    int *d_array = NULL, *d_result = NULL;
    checkCuda(cudaMalloc((void **)&d_array, n * sizeof(int)), "cudaMalloc d_array");
    checkCuda(cudaMalloc((void **)&d_result, sizeof(int)), "cudaMalloc d_result");

    // Copy the input array to the device.
    checkCuda(cudaMemcpy(d_array, h_array, n * sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy host->device");

    // One block of n threads: one thread per element.
    sumKernel<<<1, n>>>(d_array, d_result, n);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the scalar result back to the host.
    checkCuda(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy device->host");

    printf("Sum of array elements: %d\n", h_result);

    // Release device memory.
    cudaFree(d_array);
    cudaFree(d_result);

    return 0;
}
2. Take three vectors consisting of 10 elements each, add them, and store
the result in a 4th vector.
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

#define N 10 // Number of elements in each vector

// Kernel function to add vectors


// Element-wise sum of three input vectors: D[i] = A[i] + B[i] + C[i].
// Intended for a single-block launch with at least n threads.
__global__ void vectorAdd(int *A, int *B, int *C, int *D, int n) {
    const int idx = threadIdx.x;

    // Threads past the end of the vectors do nothing.
    if (idx >= n) {
        return;
    }

    D[idx] = A[idx] + B[idx] + C[idx];
}

#include <stdlib.h>

// Abort-on-error helper: inspect every CUDA return code and exit with a
// message on the first failure instead of printing bogus output.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: builds three N-element vectors, adds them on the device
// with vectorAdd, and prints the resulting vector D.
int main() {
    int h_A[N], h_B[N], h_C[N], h_D[N]; // Host vectors
    int *d_A, *d_B, *d_C, *d_D;         // Device vectors

    // Initialize vectors A, B, and C.
    for (int i = 0; i < N; i++) {
        h_A[i] = i + 1;       // Vector A: 1, 2, 3, ..., 10
        h_B[i] = (i + 1) * 2; // Vector B: 2, 4, 6, ..., 20
        h_C[i] = (i + 1) * 3; // Vector C: 3, 6, 9, ..., 30
    }

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_A, N * sizeof(int)), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B, N * sizeof(int)), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C, N * sizeof(int)), "cudaMalloc d_C");
    checkCuda(cudaMalloc((void **)&d_D, N * sizeof(int)), "cudaMalloc d_D");

    // Copy the three input vectors from host to device.
    checkCuda(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy(d_B, h_B, N * sizeof(int), cudaMemcpyHostToDevice), "copy B");
    checkCuda(cudaMemcpy(d_C, h_C, N * sizeof(int), cudaMemcpyHostToDevice), "copy C");

    // One block of N threads: one thread per vector element.
    vectorAdd<<<1, N>>>(d_A, d_B, d_C, d_D, N);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the result vector back to the host.
    checkCuda(cudaMemcpy(h_D, d_D, N * sizeof(int), cudaMemcpyDeviceToHost), "copy D");

    // Print the result.
    printf("Resulting vector D after adding A, B, and C:\n");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_D[i]);
    }
    printf("\n");

    // Release device memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_D);

    return 0;
}
3. Take 3 scalar variables and assign floating-point values to them, then
perform the multiplication and store the result in a 4th variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to multiply three scalars


// Multiplies the three scalars pointed to by a, b, and c, writing the
// product through `result`. Meant for a <<<1, 1>>> launch.
__global__ void scalarMultiply(float *a, float *b, float *c, float *result) {
    // Read each operand into a register, then store the product.
    const float x = *a;
    const float y = *b;
    const float z = *c;
    *result = x * y * z;
}

#include <stdlib.h>

// Abort-on-error helper: check every CUDA return code and exit with a
// message on the first failure.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: multiplies three float scalars on the device with
// scalarMultiply and prints the product.
int main() {
    // Declare and initialize the scalar variables.
    float h_a = 2.5f, h_b = 3.5f, h_c = 4.0f; // Host operands
    float h_result = 0.0f;                    // Host variable for the product

    // Device copies of the operands and the result.
    float *d_a, *d_b, *d_c, *d_result;

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_a, sizeof(float)), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, sizeof(float)), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void **)&d_c, sizeof(float)), "cudaMalloc d_c");
    checkCuda(cudaMalloc((void **)&d_result, sizeof(float)), "cudaMalloc d_result");

    // Copy the operands from host to device.
    checkCuda(cudaMemcpy(d_a, &h_a, sizeof(float), cudaMemcpyHostToDevice), "copy a");
    checkCuda(cudaMemcpy(d_b, &h_b, sizeof(float), cudaMemcpyHostToDevice), "copy b");
    checkCuda(cudaMemcpy(d_c, &h_c, sizeof(float), cudaMemcpyHostToDevice), "copy c");

    // Launch the kernel (1 block, 1 thread) — a single scalar product.
    scalarMultiply<<<1, 1>>>(d_a, d_b, d_c, d_result);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the result from device to host.
    checkCuda(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost),
              "copy result");

    printf("The result of multiplying %.2f, %.2f, and %.2f is: %.2f\n",
           h_a, h_b, h_c, h_result);

    // Release device memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_result);

    return 0;
}
4. Write a kernel function to swap two elements without the use of a third
variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to swap two elements without using a third variable


// Swaps the two integers pointed to by a and b without a temporary.
// Uses the XOR trick instead of the original add/subtract trick: the
// intermediate *a + *b can overflow a signed int, which is undefined
// behavior in C/C++, whereas XOR is well-defined for every input pair.
// For non-overflowing inputs the observable result is identical.
__global__ void swapKernel(int *a, int *b) {
    // Guard the aliasing case: XOR-swapping a value with itself through
    // two pointers to the same location would zero it out.
    if (a == b) {
        return;
    }
    *a ^= *b;
    *b ^= *a; // *b now holds the original *a
    *a ^= *b; // *a now holds the original *b
}

#include <stdlib.h>

// Abort-on-error helper: check every CUDA return code and exit with a
// message on the first failure.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: swaps two integers on the device with swapKernel and
// prints the swapped values.
int main() {
    int h_a = 5, h_b = 10; // Host values to swap
    int *d_a, *d_b;        // Device copies

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_a, sizeof(int)), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, sizeof(int)), "cudaMalloc d_b");

    // Copy both values from host to device.
    checkCuda(cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice), "copy a");
    checkCuda(cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice), "copy b");

    // Launch the kernel (1 block, 1 thread) — a single swap.
    swapKernel<<<1, 1>>>(d_a, d_b);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the swapped values back to the host.
    checkCuda(cudaMemcpy(&h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost), "copy a back");
    checkCuda(cudaMemcpy(&h_b, d_b, sizeof(int), cudaMemcpyDeviceToHost), "copy b back");

    printf("After swapping, a = %d and b = %d\n", h_a, h_b);

    // Release device memory.
    cudaFree(d_a);
    cudaFree(d_b);

    return 0;
}

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy