
PRACTICAL LAB FILE

Graphics Processing Unit Computing Lab


(CSL-761)

Submitted To:
Dr. Bhoopesh Singh Bhati
(Assistant Professor, CSE)

Submitted By:
Rishi Bachhuka
Roll No.: 12111065
Semester: 7th

(November, 2024)
Index

Serial No.  Topic                                                                 Date
1.          Installation of GPU CUDA Environment Setup and Hello World Program   06-08-2024
2.          Write and test CUDA program for Matrix-Matrix Multiplication         13-08-2024
3.          Write and test CUDA program for Vector Reduction                     20-08-2024
4.          Write and test CUDA program for Vector Reduction with Unlimited      03-09-2024
            Input Elements
5.          Write and test CUDA program to find the solution of simultaneous     10-09-2024
            linear equations
6.          Write and test CUDA program for Strassen Matrix multiplication       24-09-2024
7.          Write and test CUDA program to implement Monte Carlo algorithm       1-09-2024
8.          Write and test CUDA program for DES encryption and decryption        08-10-2024
9.          Write and test CUDA program for AES encryption and decryption        22-10-2024
10.         Write and test CUDA program for random number generation             12-11-2024
Experiment 1

Aim: Installation of GPU CUDA Environment Setup and Hello World Program

In this experiment, we will:

● Verify the installation of the CUDA environment.
● Write a simple CUDA program to print "Hello, CUDA World!".
● Compile and execute the program using NVCC.

Open a Command Prompt or PowerShell and run:

● nvcc --version
● nvidia-smi

Code:
#include <cstdio>
#include <iostream>

// Kernel that runs on the GPU and prints a message from the device.
__global__ void helloWorld() {
    printf("Hello, CUDA World!\n");
}

int main() {
    // Launch the kernel with one block of one thread.
    helloWorld<<<1, 1>>>();
    // Wait for the kernel to finish so the device-side printf output is flushed.
    cudaDeviceSynchronize();
    return 0;
}
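The program is built and run with NVCC. Assuming the source file is saved as hello.cu (the file name is illustrative):

nvcc hello.cu -o hello
./hello        (hello.exe on Windows)

The program prints "Hello, CUDA World!" once.
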
Experiment 2

Aim: Write and test CUDA program for Matrix-Matrix Multiplication

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define N 16

// Each thread computes one element of the result matrix C = A * B.
__global__ void matrixMul(int *a, int *b, int *c, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (row < n && col < n) {
        for (int k = 0; k < n; k++) {
            sum += a[row * n + k] * b[k * n + col];
        }
        c[row * n + col] = sum;
    }
}

void initializeMatrix(int *matrix, int n) {
    for (int i = 0; i < n * n; i++) {
        matrix[i] = rand() % 10;
    }
}

void printMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

int main() {
    int n = N;
    int size = n * n * sizeof(int);

    int *h_a = (int *)malloc(size);
    int *h_b = (int *)malloc(size);
    int *h_c = (int *)malloc(size);

    initializeMatrix(h_a, n);
    initializeMatrix(h_b, n);

    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // One 16x16 block of threads per 16x16 tile of the output matrix.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (n + threadsPerBlock.y - 1) / threadsPerBlock.y);
    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    std::cout << "Matrix A:\n";
    printMatrix(h_a, n);
    std::cout << "\nMatrix B:\n";
    printMatrix(h_b, n);
    std::cout << "\nMatrix C (Result):\n";
    printMatrix(h_c, n);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
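To sanity-check the GPU result, a host-side reference multiply can be compared against h_c after the device-to-host copy. A minimal sketch (the helper name verifyResult is illustrative, not part of the listing above):

// Reference O(n^3) multiply on the CPU; returns true if it matches the GPU result c.
bool verifyResult(const int *a, const int *b, const int *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int sum = 0;
            for (int k = 0; k < n; k++) sum += a[i * n + k] * b[k * n + j];
            if (sum != c[i * n + j]) return false;
        }
    }
    return true;
}

It could be called from main as, for example:
std::cout << (verifyResult(h_a, h_b, h_c, n) ? "Match\n" : "Mismatch\n");
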
Experiment 3

Aim: Write and test CUDA program for Vector Reduction

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define N 1024

// Tree reduction in shared memory: each block reduces its chunk to one partial sum.
__global__ void vectorReduction(int *input, int *output, int n) {
    __shared__ int sharedData[1024];
    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index < n) {
        sharedData[tid] = input[index];
    } else {
        sharedData[tid] = 0;
    }
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

void initializeArray(int *array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = rand() % 100;
    }
}

int main() {
    int n = N;
    int size = n * sizeof(int);
    int *h_input = (int *)malloc(size);
    int *h_output = (int *)malloc(sizeof(int));

    initializeArray(h_input, n);

    int *d_input, *d_intermediate, *d_output;
    cudaMalloc((void **)&d_input, size);
    cudaMalloc((void **)&d_intermediate, sizeof(int) * (n / 1024));
    cudaMalloc((void **)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    // First pass: one partial sum per block; second pass: reduce the partial sums.
    // (n is assumed to be a multiple of 1024 here; with N = 1024 the first pass uses one block.)
    vectorReduction<<<n / 1024, 1024>>>(d_input, d_intermediate, n);
    vectorReduction<<<1, 1024>>>(d_intermediate, d_output, n / 1024);
    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Sum of array elements: " << h_output[0] << "\n";

    cudaFree(d_input);
    cudaFree(d_intermediate);
    cudaFree(d_output);
    free(h_input);
    free(h_output);

    return 0;
}
Experiment 4

Aim: Write and test CUDA program for Vector Reduction with
Unlimited Input Elements

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define BLOCK_SIZE 1024

// Each block reduces BLOCK_SIZE elements of the input to a single partial sum.
__global__ void vectorReduction(int *input, int *output, int n) {
    __shared__ int sharedData[BLOCK_SIZE];
    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    sharedData[tid] = (index < n) ? input[index] : 0;
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

void initializeArray(int *array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = rand() % 100;
    }
}

int main() {
    int n;
    std::cout << "Enter the size of the array: ";
    std::cin >> n;

    int size = n * sizeof(int);
    int *h_input = (int *)malloc(size);
    int *h_output = (int *)malloc(sizeof(int));

    initializeArray(h_input, n);

    int *d_input, *d_intermediate, *d_output;
    cudaMalloc((void **)&d_input, size);
    cudaMalloc((void **)&d_intermediate,
               sizeof(int) * ((n + BLOCK_SIZE - 1) / BLOCK_SIZE));
    cudaMalloc((void **)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // First pass produces numBlocks partial sums; the second pass reduces them.
    // This two-pass scheme handles up to BLOCK_SIZE * BLOCK_SIZE input elements.
    vectorReduction<<<numBlocks, BLOCK_SIZE>>>(d_input, d_intermediate, n);
    vectorReduction<<<1, BLOCK_SIZE>>>(d_intermediate, d_output, numBlocks);
    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Sum of array elements: " << h_output[0] << "\n";

    cudaFree(d_input);
    cudaFree(d_intermediate);
    cudaFree(d_output);
    free(h_input);
    free(h_output);

    return 0;
}
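For inputs larger than BLOCK_SIZE * BLOCK_SIZE elements, two fixed launches are not enough; the same kernel can instead be applied repeatedly until one value remains. A minimal sketch of such a loop, reusing d_input and d_intermediate from the program above (illustrative only; it assumes d_intermediate holds at least numBlocks elements and ping-pongs between the two buffers):

int remaining = n;
int *src = d_input, *dst = d_intermediate;
while (remaining > 1) {
    int blocks = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
    vectorReduction<<<blocks, BLOCK_SIZE>>>(src, dst, remaining);
    remaining = blocks;
    // Next pass reduces the partial sums just written.
    int *tmp = src; src = dst; dst = tmp;
}
// The final sum is now in src[0] and can be copied back with cudaMemcpy.
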

Experiment 5
Aim: Write and test CUDA program to find the solution of simultaneous linear equations

Code:
#include <iostream>
#include <cuda.h>
#define N 3

// One forward-elimination step: threads eliminate column 'pivot' from every row
// below the pivot row. The pivot index is passed from the host so that each
// launch performs exactly one step and the rows below it are updated in parallel.
__global__ void gaussianElimination(float *a, float *b, int n, int pivot) {
    int j = threadIdx.x;
    if (j > pivot && j < n) {
        float factor = a[j * n + pivot] / a[pivot * n + pivot];
        for (int k = pivot; k < n; k++) {
            a[j * n + k] -= factor * a[pivot * n + k];
        }
        b[j] -= factor * b[pivot];
    }
}

void initializeMatrix(float *matrix, int n) {
    float sample[N][N] = {{3, 2, -4}, {2, 3, 3}, {5, -3, 1}};
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            matrix[i * n + j] = sample[i][j];
        }
    }
}

void initializeVector(float *vector, int n) {
    float sample[N] = {3, 15, 14};
    for (int i = 0; i < n; i++) {
        vector[i] = sample[i];
    }
}

void printMatrix(float *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

void printVector(float *vector, int n) {
    for (int i = 0; i < n; i++) {
        std::cout << vector[i] << " ";
    }
    std::cout << "\n";
}

// Solve the upper-triangular system produced by the elimination steps.
void backSubstitution(float *a, float *b, float *x, int n) {
    for (int i = n - 1; i >= 0; i--) {
        x[i] = b[i];
        for (int j = i + 1; j < n; j++) {
            x[i] -= a[i * n + j] * x[j];
        }
        x[i] /= a[i * n + i];
    }
}

int main() {
    int n = N;
    int sizeA = n * n * sizeof(float);
    int sizeB = n * sizeof(float);

    float *h_a = (float *)malloc(sizeA);
    float *h_b = (float *)malloc(sizeB);
    float *h_x = (float *)malloc(sizeB);

    initializeMatrix(h_a, n);
    initializeVector(h_b, n);

    std::cout << "Matrix A:\n";
    printMatrix(h_a, n);
    std::cout << "\nVector B:\n";
    printVector(h_b, n);

    float *d_a, *d_b;
    cudaMalloc((void **)&d_a, sizeA);
    cudaMalloc((void **)&d_b, sizeB);

    cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);

    // Forward elimination: one kernel launch per pivot row.
    for (int i = 0; i < n - 1; i++) {
        gaussianElimination<<<1, n>>>(d_a, d_b, n, i);
        cudaDeviceSynchronize();
    }

    cudaMemcpy(h_a, d_a, sizeA, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);

    backSubstitution(h_a, h_b, h_x, n);

    std::cout << "\nSolution Vector X:\n";
    printVector(h_x, n);

    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    free(h_x);

    return 0;
}
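As a quick check, substituting x = 3, y = 1, z = 2 into the sample system satisfies all three equations (3*3 + 2*1 - 4*2 = 3, 2*3 + 3*1 + 3*2 = 15, 5*3 - 3*1 + 2 = 14), so the program's printed solution vector can be verified against (3, 1, 2).
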

Experiment 6
Aim: Write and test CUDA program for Strassen Matrix
multiplication

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#include <cmath>
#define N 4

// Element-wise addition of two n x n matrices (a sub-matrix building block for Strassen's method).
__global__ void matrixAdd(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && j < n) {
        C[i * n + j] = A[i * n + j] + B[i * n + j];
    }
}

// Element-wise subtraction of two n x n matrices (a sub-matrix building block for Strassen's method).
__global__ void matrixSub(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && j < n) {
        C[i * n + j] = A[i * n + j] - B[i * n + j];
    }
}

// Standard matrix multiplication kernel, used below to compute the full product.
__global__ void matrixMul(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (i < n && j < n) {
        for (int k = 0; k < n; k++) {
            sum += A[i * n + k] * B[k * n + j];
        }
        C[i * n + j] = sum;
    }
}

void initializeMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            matrix[i * n + j] = rand() % 10;
        }
    }
}

void printMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

int main() {
    int n = N;
    int size = n * n * sizeof(int);

    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(size);
    int *h_C = (int *)malloc(size);

    initializeMatrix(h_A, n);
    initializeMatrix(h_B, n);

    std::cout << "Matrix A:\n";
    printMatrix(h_A, n);
    std::cout << "\nMatrix B:\n";
    printMatrix(h_B, n);

    int *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x,
                 (n + dimBlock.y - 1) / dimBlock.y);

    matrixMul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    std::cout << "\nMatrix C (A * B):\n";
    printMatrix(h_C, n);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
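The listing above computes the product with the standard matrixMul kernel; matrixAdd and matrixSub provide the sub-matrix operations that Strassen's method is built from. For reference, a minimal host-side sketch of one level of Strassen's decomposition follows (illustrative only; the helper names quadrant, add, sub, mul and the sample matrices are assumptions, and the h x h products are done on the CPU where a GPU version would launch matrixMul on the device buffers):

#include <iostream>
#include <vector>

using Mat = std::vector<int>;

// Copy the h x h quadrant whose top-left corner is (r, c) out of an n x n matrix.
Mat quadrant(const Mat &M, int n, int r, int c) {
    int h = n / 2;
    Mat Q(h * h);
    for (int i = 0; i < h; i++)
        for (int j = 0; j < h; j++)
            Q[i * h + j] = M[(r + i) * n + (c + j)];
    return Q;
}

Mat add(const Mat &A, const Mat &B) {
    Mat C(A.size());
    for (size_t i = 0; i < A.size(); i++) C[i] = A[i] + B[i];
    return C;
}

Mat sub(const Mat &A, const Mat &B) {
    Mat C(A.size());
    for (size_t i = 0; i < A.size(); i++) C[i] = A[i] - B[i];
    return C;
}

// Plain h x h product; in a GPU version this is where matrixMul would be launched.
Mat mul(const Mat &A, const Mat &B, int h) {
    Mat C(h * h, 0);
    for (int i = 0; i < h; i++)
        for (int k = 0; k < h; k++)
            for (int j = 0; j < h; j++)
                C[i * h + j] += A[i * h + k] * B[k * h + j];
    return C;
}

int main() {
    const int n = 4, h = n / 2;
    Mat A = {1, 2, 3, 4,  5, 6, 7, 8,  9, 1, 2, 3,  4, 5, 6, 7};
    Mat B = {7, 6, 5, 4,  3, 2, 1, 9,  8, 7, 6, 5,  4, 3, 2, 1};

    Mat A11 = quadrant(A, n, 0, 0), A12 = quadrant(A, n, 0, h);
    Mat A21 = quadrant(A, n, h, 0), A22 = quadrant(A, n, h, h);
    Mat B11 = quadrant(B, n, 0, 0), B12 = quadrant(B, n, 0, h);
    Mat B21 = quadrant(B, n, h, 0), B22 = quadrant(B, n, h, h);

    // Strassen's seven products (7 multiplications instead of 8).
    Mat M1 = mul(add(A11, A22), add(B11, B22), h);
    Mat M2 = mul(add(A21, A22), B11, h);
    Mat M3 = mul(A11, sub(B12, B22), h);
    Mat M4 = mul(A22, sub(B21, B11), h);
    Mat M5 = mul(add(A11, A12), B22, h);
    Mat M6 = mul(sub(A21, A11), add(B11, B12), h);
    Mat M7 = mul(sub(A12, A22), add(B21, B22), h);

    // Recombine the quadrants of C = A * B.
    Mat C11 = add(sub(add(M1, M4), M5), M7);
    Mat C12 = add(M3, M5);
    Mat C21 = add(M2, M4);
    Mat C22 = add(add(sub(M1, M2), M3), M6);

    // Assemble and print the full result matrix.
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            const Mat &Q = (i < h) ? (j < h ? C11 : C12) : (j < h ? C21 : C22);
            std::cout << Q[(i % h) * h + (j % h)] << " ";
        }
        std::cout << "\n";
    }
    return 0;
}
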

Experiment 7
Aim: Write and test CUDA program to implement Monte Carlo algorithm

Code:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda.h>
#include <curand_kernel.h>

// Each thread generates one random point in the square [-1, 1] x [-1, 1] and counts
// it if it falls inside the unit circle. rand() is a host-only function, so the
// device-side cuRAND API is used for per-thread random numbers.
__global__ void monteCarloPi(int *insideCircle, int numPoints, unsigned long long seed) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < numPoints) {
        curandState state;
        curand_init(seed, tid, 0, &state);
        float x = curand_uniform(&state) * 2.0f - 1.0f;
        float y = curand_uniform(&state) * 2.0f - 1.0f;
        if (x * x + y * y <= 1.0f) {
            atomicAdd(insideCircle, 1);
        }
    }
}

int main() {
    int numPoints = 1000000;
    int *d_insideCircle, *h_insideCircle;
    h_insideCircle = (int *)malloc(sizeof(int));
    *h_insideCircle = 0;

    cudaMalloc((void **)&d_insideCircle, sizeof(int));
    cudaMemcpy(d_insideCircle, h_insideCircle, sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int numBlocks = (numPoints + threadsPerBlock - 1) / threadsPerBlock;
    monteCarloPi<<<numBlocks, threadsPerBlock>>>(d_insideCircle, numPoints, time(NULL));

    cudaMemcpy(h_insideCircle, d_insideCircle, sizeof(int), cudaMemcpyDeviceToHost);

    // The hit ratio approximates area(unit circle) / area(square) = pi / 4.
    float pi = 4.0f * (*h_insideCircle) / numPoints;
    std::cout << "Estimated value of Pi: " << pi << std::endl;

    cudaFree(d_insideCircle);
    free(h_insideCircle);

    return 0;
}

Experiment 8
Aim: Write and test CUDA program for DES encryption and
decryption

Code:
#include <iostream>
#include <cuda.h>
#include <openssl/des.h>

// NOTE: OpenSSL's DES_ecb_encrypt is a host library function and cannot actually be
// executed inside a __global__ kernel. The kernels below illustrate the intended
// structure of a GPU version; a working one would need a device-side DES implementation.
__global__ void desEncryptKernel(unsigned char *in, unsigned char *out,
                                 DES_key_schedule *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        DES_ecb_encrypt((DES_cblock *)&in[idx], (DES_cblock *)&out[idx],
                        keySchedule, DES_ENCRYPT);
    }
}

__global__ void desDecryptKernel(unsigned char *in, unsigned char *out,
                                 DES_key_schedule *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        DES_ecb_encrypt((DES_cblock *)&in[idx], (DES_cblock *)&out[idx],
                        keySchedule, DES_DECRYPT);
    }
}

int main() {
    unsigned char in[8] = {'1', '2', '3', '4', '5', '6', '7', '8'};
    unsigned char out[8];
    unsigned char key[8] = {'S', 'e', 'c', 'r', 'e', 't', 'K', 'e'};
    DES_cblock keyBlock;
    DES_key_schedule keySchedule;

    for (int i = 0; i < 8; i++) {
        keyBlock[i] = key[i];
    }

    DES_set_key(&keyBlock, &keySchedule);

    unsigned char *d_in, *d_out;
    DES_key_schedule *d_keySchedule;

    cudaMalloc((void **)&d_in, sizeof(in));
    cudaMalloc((void **)&d_out, sizeof(out));
    cudaMalloc((void **)&d_keySchedule, sizeof(DES_key_schedule));

    cudaMemcpy(d_in, in, sizeof(in), cudaMemcpyHostToDevice);
    cudaMemcpy(d_keySchedule, &keySchedule, sizeof(DES_key_schedule),
               cudaMemcpyHostToDevice);

    int threadsPerBlock = 1;
    int numBlocks = 1;

    // Encrypt the single 8-byte block, then copy the ciphertext back to the host.
    desEncryptKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out, d_keySchedule);
    cudaMemcpy(out, d_out, sizeof(out), cudaMemcpyDeviceToHost);
    std::cout << "Encrypted text: ";
    for (int i = 0; i < 8; i++) {
        std::cout << out[i];
    }
    std::cout << std::endl;

    // Decrypt the ciphertext and copy the recovered plaintext back to the host.
    desDecryptKernel<<<numBlocks, threadsPerBlock>>>(d_out, d_in, d_keySchedule);
    cudaMemcpy(in, d_in, sizeof(in), cudaMemcpyDeviceToHost);

    std::cout << "Decrypted text: ";
    for (int i = 0; i < 8; i++) {
        std::cout << in[i];
    }
    std::cout << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_keySchedule);

    return 0;
}

Experiment 9
Aim: Write and test CUDA program for AES encryption and
decryption

Code:
#include <iostream>
#include <openssl/aes.h>
#include <cuda.h>

// NOTE: OpenSSL's AES_encrypt/AES_decrypt are host library functions and cannot
// actually be executed inside a __global__ kernel. The kernels below illustrate the
// intended structure of a GPU version; a working one would need a device-side AES
// implementation.
__global__ void aesEncryptKernel(unsigned char *in, unsigned char *out,
                                 AES_KEY *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        AES_encrypt(&in[idx], &out[idx], keySchedule);
    }
}

__global__ void aesDecryptKernel(unsigned char *in, unsigned char *out,
                                 AES_KEY *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        AES_decrypt(&in[idx], &out[idx], keySchedule);
    }
}

int main() {
    unsigned char in[16] = {'T', 'h', 'i', 's', 'i', 's', 'a', 's',
                            'e', 'c', 'r', 'e', 't', 'k', 'e', 'y'};
    unsigned char out[16];
    unsigned char key[16] = {'S', 'e', 'c', 'r', 'e', 't', 'K', 'e',
                             'y', 'F', 'o', 'r', 'A', 'E', 'S', 'T'};
    AES_KEY encryptKey, decryptKey;

    AES_set_encrypt_key(key, 128, &encryptKey);
    AES_set_decrypt_key(key, 128, &decryptKey);

    unsigned char *d_in, *d_out;
    AES_KEY *d_encryptKey, *d_decryptKey;

    cudaMalloc((void **)&d_in, sizeof(in));
    cudaMalloc((void **)&d_out, sizeof(out));
    cudaMalloc((void **)&d_encryptKey, sizeof(AES_KEY));
    cudaMalloc((void **)&d_decryptKey, sizeof(AES_KEY));

    cudaMemcpy(d_in, in, sizeof(in), cudaMemcpyHostToDevice);
    cudaMemcpy(d_encryptKey, &encryptKey, sizeof(AES_KEY), cudaMemcpyHostToDevice);
    cudaMemcpy(d_decryptKey, &decryptKey, sizeof(AES_KEY), cudaMemcpyHostToDevice);

    int threadsPerBlock = 1;
    int numBlocks = 1;

    // Encrypt the single 16-byte block and copy the ciphertext back to the host.
    aesEncryptKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out, d_encryptKey);
    cudaMemcpy(out, d_out, sizeof(out), cudaMemcpyDeviceToHost);

    std::cout << "Encrypted text: ";
    for (int i = 0; i < 16; i++) {
        std::cout << out[i];
    }
    std::cout << std::endl;

    // Decrypt the ciphertext and copy the recovered plaintext back to the host.
    aesDecryptKernel<<<numBlocks, threadsPerBlock>>>(d_out, d_in, d_decryptKey);
    cudaMemcpy(in, d_in, sizeof(in), cudaMemcpyDeviceToHost);

    std::cout << "Decrypted text: ";
    for (int i = 0; i < 16; i++) {
        std::cout << in[i];
    }
    std::cout << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_encryptKey);
    cudaFree(d_decryptKey);

    return 0;
}

Experiment 10
Aim: Write and test CUDA program for random number generation

Code:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <curand_kernel.h>

// Each thread initializes its own cuRAND state and produces one random integer.
__global__ void generateRandomNumbers(int *randomNumbers, int numElements,
                                      unsigned long long seed) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < numElements) {
        curandState state;
        curand_init(seed, idx, 0, &state);
        randomNumbers[idx] = curand(&state);
    }
}

int main() {
    int numElements = 100;
    int *d_randomNumbers, *h_randomNumbers;
    h_randomNumbers = (int *)malloc(numElements * sizeof(int));

    cudaMalloc((void **)&d_randomNumbers, numElements * sizeof(int));

    int threadsPerBlock = 256;
    int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    generateRandomNumbers<<<numBlocks, threadsPerBlock>>>(d_randomNumbers,
                                                          numElements, time(NULL));

    cudaMemcpy(h_randomNumbers, d_randomNumbers, numElements * sizeof(int),
               cudaMemcpyDeviceToHost);

    std::cout << "Generated random numbers: \n";
    for (int i = 0; i < numElements; i++) {
        std::cout << h_randomNumbers[i] << " ";
    }
    std::cout << std::endl;

    cudaFree(d_randomNumbers);
    free(h_randomNumbers);

    return 0;
}
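If uniformly distributed floats are needed instead of raw integers, curand_uniform can be used inside the same kernel. A minimal variation of the kernel body (illustrative only; randomFloats is an assumed float* output array, not part of the listing above):

curandState state;
curand_init(seed, idx, 0, &state);
randomFloats[idx] = curand_uniform(&state);   // uniform float in (0, 1]
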
