0% found this document useful (0 votes)
4 views4 pages

Cuda

The document contains CUDA programming assignments by Abhishek Kumar Yadav, which include summing an array of 10 numbers using a kernel function, adding three vectors of 10 elements each, multiplying three scalar variables, and swapping two elements without using a third variable. Each assignment includes the necessary CUDA code, memory allocation, and kernel execution. The document demonstrates fundamental CUDA operations and memory management for parallel computing.

Uploaded by

yadavabhi4268
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
4 views4 pages

Cuda

The document contains CUDA programming assignments by Abhishek Kumar Yadav, which include summing an array of 10 numbers using a kernel function, adding three vectors of 10 elements each, multiplying three scalar variables, and swapping two elements without using a third variable. Each assignment includes the necessary CUDA code, memory allocation, and kernel execution. The document demonstrates fundamental CUDA operations and memory management for parallel computing.

Uploaded by

yadavabhi4268
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 4

CUDA ASSIGNMENT :

Abhishek Kumar Yadav (CSE-AIML/22/62)


1. Take an array of 10 numbers and perform the summation of these 10 numbers. Use a kernel function.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Sums the first n elements of `array` into *result using a pairwise
// tree reduction in shared memory.
// Expected launch: a single block with blockDim.x == n and n <= 10
// (partialSum is statically sized for this assignment's 10 elements).
__global__ void sumKernel(int *array, int *result, int n) {

    __shared__ int partialSum[10]; // one slot per participating thread

    int tid = threadIdx.x;

    // Load one element per thread. The store is guarded so threads with
    // tid >= n neither read past `array` nor write past `partialSum`
    // (the original wrote partialSum[tid] unconditionally, which is
    // out of bounds whenever blockDim.x > 10).
    if (tid < n) {
        partialSum[tid] = array[tid];
    }
    __syncthreads();

    // Pairwise tree reduction. The added `tid + stride < n` guard fixes
    // an out-of-bounds read present in the original when n is not a
    // power of two: for n = 10, tid = 8, stride = 2 it read
    // partialSum[10], one past the end of the shared array.
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0 && tid + stride < n) {
            partialSum[tid] += partialSum[tid + stride];
        }
        __syncthreads(); // barrier kept outside the divergent branch
    }

    // Thread 0 publishes the final sum.
    if (tid == 0) {
        *result = partialSum[0];
    }
}

#include <stdlib.h>

// Abort-on-error helper: CUDA runtime calls fail silently unless their
// return codes are inspected, so print the error string and exit on the
// first failure instead of propagating garbage results.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: sums 10 integers on the device via sumKernel and prints
// the result.
int main() {
    const int n = 10;
    int h_array[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; // input data
    int h_result = 0;                                 // receives the sum

    // Device buffers: the input array and the scalar result.
    int *d_array = NULL, *d_result = NULL;
    checkCuda(cudaMalloc((void **)&d_array, n * sizeof(int)), "cudaMalloc d_array");
    checkCuda(cudaMalloc((void **)&d_result, sizeof(int)), "cudaMalloc d_result");

    // Copy the input array to the device.
    checkCuda(cudaMemcpy(d_array, h_array, n * sizeof(int), cudaMemcpyHostToDevice),
              "cudaMemcpy host->device");

    // One block of n threads: one thread per element.
    sumKernel<<<1, n>>>(d_array, d_result, n);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the scalar result back to the host.
    checkCuda(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy device->host");

    printf("Sum of array elements: %d\n", h_result);

    // Release device memory.
    cudaFree(d_array);
    cudaFree(d_result);

    return 0;
}
2. Take three vectors consisting of 10 elements each, add them, and store
the result in a 4th vector.
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

#define N 10 // Number of elements in each vector

// Kernel function to add vectors


// Element-wise sum of three input vectors: D[i] = A[i] + B[i] + C[i].
// Intended for a single-block launch with at least n threads.
__global__ void vectorAdd(int *A, int *B, int *C, int *D, int n) {
    const int idx = threadIdx.x;

    // Threads past the end of the vectors do nothing.
    if (idx >= n) {
        return;
    }

    D[idx] = A[idx] + B[idx] + C[idx];
}

#include <stdlib.h>

// Abort-on-error helper: inspect every CUDA return code and exit with a
// message on the first failure instead of printing bogus output.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: builds three N-element vectors, adds them on the device
// with vectorAdd, and prints the resulting vector D.
int main() {
    int h_A[N], h_B[N], h_C[N], h_D[N]; // Host vectors
    int *d_A, *d_B, *d_C, *d_D;         // Device vectors

    // Initialize vectors A, B, and C.
    for (int i = 0; i < N; i++) {
        h_A[i] = i + 1;       // Vector A: 1, 2, 3, ..., 10
        h_B[i] = (i + 1) * 2; // Vector B: 2, 4, 6, ..., 20
        h_C[i] = (i + 1) * 3; // Vector C: 3, 6, 9, ..., 30
    }

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_A, N * sizeof(int)), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B, N * sizeof(int)), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C, N * sizeof(int)), "cudaMalloc d_C");
    checkCuda(cudaMalloc((void **)&d_D, N * sizeof(int)), "cudaMalloc d_D");

    // Copy the three input vectors from host to device.
    checkCuda(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy(d_B, h_B, N * sizeof(int), cudaMemcpyHostToDevice), "copy B");
    checkCuda(cudaMemcpy(d_C, h_C, N * sizeof(int), cudaMemcpyHostToDevice), "copy C");

    // One block of N threads: one thread per vector element.
    vectorAdd<<<1, N>>>(d_A, d_B, d_C, d_D, N);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the result vector back to the host.
    checkCuda(cudaMemcpy(h_D, d_D, N * sizeof(int), cudaMemcpyDeviceToHost), "copy D");

    // Print the result.
    printf("Resulting vector D after adding A, B, and C:\n");
    for (int i = 0; i < N; i++) {
        printf("%d ", h_D[i]);
    }
    printf("\n");

    // Release device memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_D);

    return 0;
}
3. Take 3 scalar variables and assign floating-point values to them, then
perform the multiplication and store the result in a 4th variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to multiply three scalars


// Multiplies the three scalars pointed to by a, b, and c, writing the
// product through `result`. Meant for a <<<1, 1>>> launch.
__global__ void scalarMultiply(float *a, float *b, float *c, float *result) {
    // Read each operand into a register, then store the product.
    const float x = *a;
    const float y = *b;
    const float z = *c;
    *result = x * y * z;
}

#include <stdlib.h>

// Abort-on-error helper: check every CUDA return code and exit with a
// message on the first failure.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: multiplies three float scalars on the device with
// scalarMultiply and prints the product.
int main() {
    // Declare and initialize the scalar variables.
    float h_a = 2.5f, h_b = 3.5f, h_c = 4.0f; // Host operands
    float h_result = 0.0f;                    // Host variable for the product

    // Device copies of the operands and the result.
    float *d_a, *d_b, *d_c, *d_result;

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_a, sizeof(float)), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, sizeof(float)), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void **)&d_c, sizeof(float)), "cudaMalloc d_c");
    checkCuda(cudaMalloc((void **)&d_result, sizeof(float)), "cudaMalloc d_result");

    // Copy the operands from host to device.
    checkCuda(cudaMemcpy(d_a, &h_a, sizeof(float), cudaMemcpyHostToDevice), "copy a");
    checkCuda(cudaMemcpy(d_b, &h_b, sizeof(float), cudaMemcpyHostToDevice), "copy b");
    checkCuda(cudaMemcpy(d_c, &h_c, sizeof(float), cudaMemcpyHostToDevice), "copy c");

    // Launch the kernel (1 block, 1 thread) — a single scalar product.
    scalarMultiply<<<1, 1>>>(d_a, d_b, d_c, d_result);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the result from device to host.
    checkCuda(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost),
              "copy result");

    printf("The result of multiplying %.2f, %.2f, and %.2f is: %.2f\n",
           h_a, h_b, h_c, h_result);

    // Release device memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_result);

    return 0;
}
4. Write a kernel function to swap two elements without the use of a third
variable.

%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel function to swap two elements without using a third variable


// Swaps the two integers pointed to by a and b without a temporary.
// Uses the XOR trick instead of the original add/subtract trick: the
// intermediate *a + *b can overflow a signed int, which is undefined
// behavior in C/C++, whereas XOR is well-defined for every input pair.
// For non-overflowing inputs the observable result is identical.
__global__ void swapKernel(int *a, int *b) {
    // Guard the aliasing case: XOR-swapping a value with itself through
    // two pointers to the same location would zero it out.
    if (a == b) {
        return;
    }
    *a ^= *b;
    *b ^= *a; // *b now holds the original *a
    *a ^= *b; // *a now holds the original *b
}

#include <stdlib.h>

// Abort-on-error helper: check every CUDA return code and exit with a
// message on the first failure.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: swaps two integers on the device with swapKernel and
// prints the swapped values.
int main() {
    int h_a = 5, h_b = 10; // Host values to swap
    int *d_a, *d_b;        // Device copies

    // Allocate memory on the device.
    checkCuda(cudaMalloc((void **)&d_a, sizeof(int)), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, sizeof(int)), "cudaMalloc d_b");

    // Copy both values from host to device.
    checkCuda(cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice), "copy a");
    checkCuda(cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice), "copy b");

    // Launch the kernel (1 block, 1 thread) — a single swap.
    swapKernel<<<1, 1>>>(d_a, d_b);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // in-kernel faults

    // Copy the swapped values back to the host.
    checkCuda(cudaMemcpy(&h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost), "copy a back");
    checkCuda(cudaMemcpy(&h_b, d_b, sizeof(int), cudaMemcpyDeviceToHost), "copy b back");

    printf("After swapping, a = %d and b = %d\n", h_a, h_b);

    // Release device memory.
    cudaFree(d_a);
    cudaFree(d_b);

    return 0;
}

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy