
PRACTICAL LAB FILE

Graphics Processing Unit Computing Lab


(CSL-761)

Submitted To:
Dr. Bhoopesh Singh Bhati
(Assistant Professor, CSE)

Submitted By:
Rishi Bachhuka
Roll No.: 12111065
Semester: 7th

(November, 2024)
Index

Serial No.  Topic                                                                 Date
1.          Installation of GPU CUDA Environment Setup and Hello World Program   06-08-2024
2.          Write and test CUDA program for Matrix-Matrix Multiplication         13-08-2024
3.          Write and test CUDA program for Vector Reduction                     20-08-2024
4.          Write and test CUDA program for Vector Reduction with Unlimited      03-09-2024
            Input Elements
5.          Write and test CUDA program to find the solution of simultaneous     10-09-2024
            linear equations
6.          Write and test CUDA program for Strassen Matrix multiplication       24-09-2024
7.          Write and test CUDA program to implement Monte Carlo algorithm       1-09-2024
8.          Write and test CUDA program for DES encryption and decryption        08-10-2024
9.          Write and test CUDA program for AES encryption and decryption        22-10-2024
10.         Write and test CUDA program for random number generation             12-11-2024
Experiment 1

Aim: Installation of GPU CUDA Environment Setup and Hello World Program

In this experiment, we will:

● Verify the installation of the CUDA environment.
● Write a simple CUDA program to print "Hello, CUDA World!".
● Compile and execute the program using NVCC.

Open a Command Prompt or PowerShell and run:

● nvcc --version
● nvidia-smi

Code:
#include <cstdio>
#include <iostream>

// Kernel that runs on the GPU and prints a message from the device.
__global__ void helloWorld() {
    printf("Hello, CUDA World!\n");
}

int main() {
    // Launch the kernel with one block of one thread.
    helloWorld<<<1, 1>>>();
    // Wait for the kernel to finish so the device-side printf output is flushed.
    cudaDeviceSynchronize();
    return 0;
}
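The program is built and run with NVCC. Assuming the source file is saved as hello.cu (the file name is illustrative):

nvcc hello.cu -o hello
./hello        (hello.exe on Windows)

The program prints "Hello, CUDA World!" once.
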
Experiment 2

Aim: Write and test CUDA program for Matrix-Matrix Multiplication

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define N 16

// Each thread computes one element of the result matrix C = A * B.
__global__ void matrixMul(int *a, int *b, int *c, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (row < n && col < n) {
        for (int k = 0; k < n; k++) {
            sum += a[row * n + k] * b[k * n + col];
        }
        c[row * n + col] = sum;
    }
}

void initializeMatrix(int *matrix, int n) {
    for (int i = 0; i < n * n; i++) {
        matrix[i] = rand() % 10;
    }
}

void printMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

int main() {
    int n = N;
    int size = n * n * sizeof(int);

    int *h_a = (int *)malloc(size);
    int *h_b = (int *)malloc(size);
    int *h_c = (int *)malloc(size);

    initializeMatrix(h_a, n);
    initializeMatrix(h_b, n);

    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // One 16x16 block of threads per 16x16 tile of the output matrix.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (n + threadsPerBlock.y - 1) / threadsPerBlock.y);
    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    std::cout << "Matrix A:\n";
    printMatrix(h_a, n);
    std::cout << "\nMatrix B:\n";
    printMatrix(h_b, n);
    std::cout << "\nMatrix C (Result):\n";
    printMatrix(h_c, n);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
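To sanity-check the GPU result, a host-side reference multiply can be compared against h_c after the device-to-host copy. A minimal sketch (the helper name verifyResult is illustrative, not part of the listing above):

// Reference O(n^3) multiply on the CPU; returns true if it matches the GPU result c.
bool verifyResult(const int *a, const int *b, const int *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int sum = 0;
            for (int k = 0; k < n; k++) sum += a[i * n + k] * b[k * n + j];
            if (sum != c[i * n + j]) return false;
        }
    }
    return true;
}

It could be called from main as, for example:
std::cout << (verifyResult(h_a, h_b, h_c, n) ? "Match\n" : "Mismatch\n");
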
Experiment 3

Aim: Write and test CUDA program for Vector Reduction

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define N 1024

// Tree reduction in shared memory: each block reduces its chunk to one partial sum.
__global__ void vectorReduction(int *input, int *output, int n) {
    __shared__ int sharedData[1024];
    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index < n) {
        sharedData[tid] = input[index];
    } else {
        sharedData[tid] = 0;
    }
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

void initializeArray(int *array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = rand() % 100;
    }
}

int main() {
    int n = N;
    int size = n * sizeof(int);
    int *h_input = (int *)malloc(size);
    int *h_output = (int *)malloc(sizeof(int));

    initializeArray(h_input, n);

    int *d_input, *d_intermediate, *d_output;
    cudaMalloc((void **)&d_input, size);
    cudaMalloc((void **)&d_intermediate, sizeof(int) * (n / 1024));
    cudaMalloc((void **)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    // First pass: one partial sum per block; second pass: reduce the partial sums.
    // (n is assumed to be a multiple of 1024 here; with N = 1024 the first pass uses one block.)
    vectorReduction<<<n / 1024, 1024>>>(d_input, d_intermediate, n);
    vectorReduction<<<1, 1024>>>(d_intermediate, d_output, n / 1024);
    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Sum of array elements: " << h_output[0] << "\n";

    cudaFree(d_input);
    cudaFree(d_intermediate);
    cudaFree(d_output);
    free(h_input);
    free(h_output);

    return 0;
}
Experiment 4

Aim: Write and test CUDA program for Vector Reduction with
Unlimited Input Elements

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#define BLOCK_SIZE 1024

// Each block reduces BLOCK_SIZE elements of the input to a single partial sum.
__global__ void vectorReduction(int *input, int *output, int n) {
    __shared__ int sharedData[BLOCK_SIZE];
    int tid = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    sharedData[tid] = (index < n) ? input[index] : 0;
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

void initializeArray(int *array, int n) {
    for (int i = 0; i < n; i++) {
        array[i] = rand() % 100;
    }
}

int main() {
    int n;
    std::cout << "Enter the size of the array: ";
    std::cin >> n;

    int size = n * sizeof(int);
    int *h_input = (int *)malloc(size);
    int *h_output = (int *)malloc(sizeof(int));

    initializeArray(h_input, n);

    int *d_input, *d_intermediate, *d_output;
    cudaMalloc((void **)&d_input, size);
    cudaMalloc((void **)&d_intermediate,
               sizeof(int) * ((n + BLOCK_SIZE - 1) / BLOCK_SIZE));
    cudaMalloc((void **)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // First pass produces numBlocks partial sums; the second pass reduces them.
    // This two-pass scheme handles up to BLOCK_SIZE * BLOCK_SIZE input elements.
    vectorReduction<<<numBlocks, BLOCK_SIZE>>>(d_input, d_intermediate, n);
    vectorReduction<<<1, BLOCK_SIZE>>>(d_intermediate, d_output, numBlocks);
    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Sum of array elements: " << h_output[0] << "\n";

    cudaFree(d_input);
    cudaFree(d_intermediate);
    cudaFree(d_output);
    free(h_input);
    free(h_output);

    return 0;
}
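For inputs larger than BLOCK_SIZE * BLOCK_SIZE elements, two fixed launches are not enough; the same kernel can instead be applied repeatedly until one value remains. A minimal sketch of such a loop, reusing d_input and d_intermediate from the program above (illustrative only; it assumes d_intermediate holds at least numBlocks elements and ping-pongs between the two buffers):

int remaining = n;
int *src = d_input, *dst = d_intermediate;
while (remaining > 1) {
    int blocks = (remaining + BLOCK_SIZE - 1) / BLOCK_SIZE;
    vectorReduction<<<blocks, BLOCK_SIZE>>>(src, dst, remaining);
    remaining = blocks;
    // Next pass reduces the partial sums just written.
    int *tmp = src; src = dst; dst = tmp;
}
// The final sum is now in src[0] and can be copied back with cudaMemcpy.
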

Experiment 5
Aim: Write and test CUDA program to find the solution of simultaneous linear equations

Code:
#include <iostream>
#include <cuda.h>
#define N 3

// One forward-elimination step: threads eliminate column 'pivot' from every row
// below the pivot row. The pivot index is passed from the host so that each
// launch performs exactly one step and the rows below it are updated in parallel.
__global__ void gaussianElimination(float *a, float *b, int n, int pivot) {
    int j = threadIdx.x;
    if (j > pivot && j < n) {
        float factor = a[j * n + pivot] / a[pivot * n + pivot];
        for (int k = pivot; k < n; k++) {
            a[j * n + k] -= factor * a[pivot * n + k];
        }
        b[j] -= factor * b[pivot];
    }
}

void initializeMatrix(float *matrix, int n) {
    float sample[N][N] = {{3, 2, -4}, {2, 3, 3}, {5, -3, 1}};
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            matrix[i * n + j] = sample[i][j];
        }
    }
}

void initializeVector(float *vector, int n) {
    float sample[N] = {3, 15, 14};
    for (int i = 0; i < n; i++) {
        vector[i] = sample[i];
    }
}

void printMatrix(float *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

void printVector(float *vector, int n) {
    for (int i = 0; i < n; i++) {
        std::cout << vector[i] << " ";
    }
    std::cout << "\n";
}

// Solve the upper-triangular system produced by the elimination steps.
void backSubstitution(float *a, float *b, float *x, int n) {
    for (int i = n - 1; i >= 0; i--) {
        x[i] = b[i];
        for (int j = i + 1; j < n; j++) {
            x[i] -= a[i * n + j] * x[j];
        }
        x[i] /= a[i * n + i];
    }
}

int main() {
    int n = N;
    int sizeA = n * n * sizeof(float);
    int sizeB = n * sizeof(float);

    float *h_a = (float *)malloc(sizeA);
    float *h_b = (float *)malloc(sizeB);
    float *h_x = (float *)malloc(sizeB);

    initializeMatrix(h_a, n);
    initializeVector(h_b, n);

    std::cout << "Matrix A:\n";
    printMatrix(h_a, n);
    std::cout << "\nVector B:\n";
    printVector(h_b, n);

    float *d_a, *d_b;
    cudaMalloc((void **)&d_a, sizeA);
    cudaMalloc((void **)&d_b, sizeB);

    cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);

    // Forward elimination: one kernel launch per pivot row.
    for (int i = 0; i < n - 1; i++) {
        gaussianElimination<<<1, n>>>(d_a, d_b, n, i);
        cudaDeviceSynchronize();
    }

    cudaMemcpy(h_a, d_a, sizeA, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);

    backSubstitution(h_a, h_b, h_x, n);

    std::cout << "\nSolution Vector X:\n";
    printVector(h_x, n);

    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    free(h_x);

    return 0;
}
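As a quick check, substituting x = 3, y = 1, z = 2 into the sample system satisfies all three equations (3*3 + 2*1 - 4*2 = 3, 2*3 + 3*1 + 3*2 = 15, 5*3 - 3*1 + 2 = 14), so the program's printed solution vector can be verified against (3, 1, 2).
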

Experiment 6
Aim: Write and test CUDA program for Strassen Matrix
multiplication

Code:
#include <iostream>
#include <cstdlib>
#include <cuda.h>
#include <cmath>
#define N 4

// Element-wise addition of two n x n matrices (a sub-matrix building block for Strassen's method).
__global__ void matrixAdd(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && j < n) {
        C[i * n + j] = A[i * n + j] + B[i * n + j];
    }
}

// Element-wise subtraction of two n x n matrices (a sub-matrix building block for Strassen's method).
__global__ void matrixSub(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && j < n) {
        C[i * n + j] = A[i * n + j] - B[i * n + j];
    }
}

// Standard matrix multiplication kernel, used below to compute the full product.
__global__ void matrixMul(int *A, int *B, int *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (i < n && j < n) {
        for (int k = 0; k < n; k++) {
            sum += A[i * n + k] * B[k * n + j];
        }
        C[i * n + j] = sum;
    }
}

void initializeMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            matrix[i * n + j] = rand() % 10;
        }
    }
}

void printMatrix(int *matrix, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << matrix[i * n + j] << " ";
        }
        std::cout << "\n";
    }
}

int main() {
    int n = N;
    int size = n * n * sizeof(int);

    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(size);
    int *h_C = (int *)malloc(size);

    initializeMatrix(h_A, n);
    initializeMatrix(h_B, n);

    std::cout << "Matrix A:\n";
    printMatrix(h_A, n);
    std::cout << "\nMatrix B:\n";
    printMatrix(h_B, n);

    int *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x,
                 (n + dimBlock.y - 1) / dimBlock.y);

    matrixMul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    std::cout << "\nMatrix C (A * B):\n";
    printMatrix(h_C, n);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
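The listing above computes the product with the standard matrixMul kernel; matrixAdd and matrixSub provide the sub-matrix operations that Strassen's method is built from. For reference, a minimal host-side sketch of one level of Strassen's decomposition follows (illustrative only; the helper names quadrant, add, sub, mul and the sample matrices are assumptions, and the h x h products are done on the CPU where a GPU version would launch matrixMul on the device buffers):

#include <iostream>
#include <vector>

using Mat = std::vector<int>;

// Copy the h x h quadrant whose top-left corner is (r, c) out of an n x n matrix.
Mat quadrant(const Mat &M, int n, int r, int c) {
    int h = n / 2;
    Mat Q(h * h);
    for (int i = 0; i < h; i++)
        for (int j = 0; j < h; j++)
            Q[i * h + j] = M[(r + i) * n + (c + j)];
    return Q;
}

Mat add(const Mat &A, const Mat &B) {
    Mat C(A.size());
    for (size_t i = 0; i < A.size(); i++) C[i] = A[i] + B[i];
    return C;
}

Mat sub(const Mat &A, const Mat &B) {
    Mat C(A.size());
    for (size_t i = 0; i < A.size(); i++) C[i] = A[i] - B[i];
    return C;
}

// Plain h x h product; in a GPU version this is where matrixMul would be launched.
Mat mul(const Mat &A, const Mat &B, int h) {
    Mat C(h * h, 0);
    for (int i = 0; i < h; i++)
        for (int k = 0; k < h; k++)
            for (int j = 0; j < h; j++)
                C[i * h + j] += A[i * h + k] * B[k * h + j];
    return C;
}

int main() {
    const int n = 4, h = n / 2;
    Mat A = {1, 2, 3, 4,  5, 6, 7, 8,  9, 1, 2, 3,  4, 5, 6, 7};
    Mat B = {7, 6, 5, 4,  3, 2, 1, 9,  8, 7, 6, 5,  4, 3, 2, 1};

    Mat A11 = quadrant(A, n, 0, 0), A12 = quadrant(A, n, 0, h);
    Mat A21 = quadrant(A, n, h, 0), A22 = quadrant(A, n, h, h);
    Mat B11 = quadrant(B, n, 0, 0), B12 = quadrant(B, n, 0, h);
    Mat B21 = quadrant(B, n, h, 0), B22 = quadrant(B, n, h, h);

    // Strassen's seven products (7 multiplications instead of 8).
    Mat M1 = mul(add(A11, A22), add(B11, B22), h);
    Mat M2 = mul(add(A21, A22), B11, h);
    Mat M3 = mul(A11, sub(B12, B22), h);
    Mat M4 = mul(A22, sub(B21, B11), h);
    Mat M5 = mul(add(A11, A12), B22, h);
    Mat M6 = mul(sub(A21, A11), add(B11, B12), h);
    Mat M7 = mul(sub(A12, A22), add(B21, B22), h);

    // Recombine the quadrants of C = A * B.
    Mat C11 = add(sub(add(M1, M4), M5), M7);
    Mat C12 = add(M3, M5);
    Mat C21 = add(M2, M4);
    Mat C22 = add(add(sub(M1, M2), M3), M6);

    // Assemble and print the full result matrix.
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            const Mat &Q = (i < h) ? (j < h ? C11 : C12) : (j < h ? C21 : C22);
            std::cout << Q[(i % h) * h + (j % h)] << " ";
        }
        std::cout << "\n";
    }
    return 0;
}
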

Experiment 7
Aim: Write and test CUDA program to implement Monte Carlo algorithm

Code:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda.h>
#include <curand_kernel.h>

// Each thread generates one random point in the square [-1, 1] x [-1, 1] and counts
// it if it falls inside the unit circle. rand() is a host-only function, so the
// device-side cuRAND API is used for per-thread random numbers.
__global__ void monteCarloPi(int *insideCircle, int numPoints, unsigned long long seed) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < numPoints) {
        curandState state;
        curand_init(seed, tid, 0, &state);
        float x = curand_uniform(&state) * 2.0f - 1.0f;
        float y = curand_uniform(&state) * 2.0f - 1.0f;
        if (x * x + y * y <= 1.0f) {
            atomicAdd(insideCircle, 1);
        }
    }
}

int main() {
    int numPoints = 1000000;
    int *d_insideCircle, *h_insideCircle;
    h_insideCircle = (int *)malloc(sizeof(int));
    *h_insideCircle = 0;

    cudaMalloc((void **)&d_insideCircle, sizeof(int));
    cudaMemcpy(d_insideCircle, h_insideCircle, sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int numBlocks = (numPoints + threadsPerBlock - 1) / threadsPerBlock;
    monteCarloPi<<<numBlocks, threadsPerBlock>>>(d_insideCircle, numPoints, time(NULL));

    cudaMemcpy(h_insideCircle, d_insideCircle, sizeof(int), cudaMemcpyDeviceToHost);

    // The hit ratio approximates area(unit circle) / area(square) = pi / 4.
    float pi = 4.0f * (*h_insideCircle) / numPoints;
    std::cout << "Estimated value of Pi: " << pi << std::endl;

    cudaFree(d_insideCircle);
    free(h_insideCircle);

    return 0;
}

Experiment 8
Aim: Write and test CUDA program for DES encryption and
decryption

Code:
#include <iostream>
#include <cuda.h>
#include <openssl/des.h>

// NOTE: OpenSSL's DES_ecb_encrypt is a host library function and cannot actually be
// executed inside a __global__ kernel. The kernels below illustrate the intended
// structure of a GPU version; a working one would need a device-side DES implementation.
__global__ void desEncryptKernel(unsigned char *in, unsigned char *out,
                                 DES_key_schedule *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        DES_ecb_encrypt((DES_cblock *)&in[idx], (DES_cblock *)&out[idx],
                        keySchedule, DES_ENCRYPT);
    }
}

__global__ void desDecryptKernel(unsigned char *in, unsigned char *out,
                                 DES_key_schedule *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        DES_ecb_encrypt((DES_cblock *)&in[idx], (DES_cblock *)&out[idx],
                        keySchedule, DES_DECRYPT);
    }
}

int main() {
    unsigned char in[8] = {'1', '2', '3', '4', '5', '6', '7', '8'};
    unsigned char out[8];
    unsigned char key[8] = {'S', 'e', 'c', 'r', 'e', 't', 'K', 'e'};
    DES_cblock keyBlock;
    DES_key_schedule keySchedule;

    for (int i = 0; i < 8; i++) {
        keyBlock[i] = key[i];
    }

    DES_set_key(&keyBlock, &keySchedule);

    unsigned char *d_in, *d_out;
    DES_key_schedule *d_keySchedule;

    cudaMalloc((void **)&d_in, sizeof(in));
    cudaMalloc((void **)&d_out, sizeof(out));
    cudaMalloc((void **)&d_keySchedule, sizeof(DES_key_schedule));

    cudaMemcpy(d_in, in, sizeof(in), cudaMemcpyHostToDevice);
    cudaMemcpy(d_keySchedule, &keySchedule, sizeof(DES_key_schedule),
               cudaMemcpyHostToDevice);

    int threadsPerBlock = 1;
    int numBlocks = 1;

    // Encrypt the single 8-byte block, then copy the ciphertext back to the host.
    desEncryptKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out, d_keySchedule);
    cudaMemcpy(out, d_out, sizeof(out), cudaMemcpyDeviceToHost);
    std::cout << "Encrypted text: ";
    for (int i = 0; i < 8; i++) {
        std::cout << out[i];
    }
    std::cout << std::endl;

    // Decrypt the ciphertext and copy the recovered plaintext back to the host.
    desDecryptKernel<<<numBlocks, threadsPerBlock>>>(d_out, d_in, d_keySchedule);
    cudaMemcpy(in, d_in, sizeof(in), cudaMemcpyDeviceToHost);

    std::cout << "Decrypted text: ";
    for (int i = 0; i < 8; i++) {
        std::cout << in[i];
    }
    std::cout << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_keySchedule);

    return 0;
}

Experiment 9
Aim: Write and test CUDA program for AES encryption and
decryption

Code:
#include <iostream>
#include <openssl/aes.h>
#include <cuda.h>

// NOTE: OpenSSL's AES_encrypt/AES_decrypt are host library functions and cannot
// actually be executed inside a __global__ kernel. The kernels below illustrate the
// intended structure of a GPU version; a working one would need a device-side AES
// implementation.
__global__ void aesEncryptKernel(unsigned char *in, unsigned char *out,
                                 AES_KEY *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        AES_encrypt(&in[idx], &out[idx], keySchedule);
    }
}

__global__ void aesDecryptKernel(unsigned char *in, unsigned char *out,
                                 AES_KEY *keySchedule) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < 1) {
        AES_decrypt(&in[idx], &out[idx], keySchedule);
    }
}

int main() {
    unsigned char in[16] = {'T', 'h', 'i', 's', 'i', 's', 'a', 's',
                            'e', 'c', 'r', 'e', 't', 'k', 'e', 'y'};
    unsigned char out[16];
    unsigned char key[16] = {'S', 'e', 'c', 'r', 'e', 't', 'K', 'e',
                             'y', 'F', 'o', 'r', 'A', 'E', 'S', 'T'};
    AES_KEY encryptKey, decryptKey;

    AES_set_encrypt_key(key, 128, &encryptKey);
    AES_set_decrypt_key(key, 128, &decryptKey);

    unsigned char *d_in, *d_out;
    AES_KEY *d_encryptKey, *d_decryptKey;

    cudaMalloc((void **)&d_in, sizeof(in));
    cudaMalloc((void **)&d_out, sizeof(out));
    cudaMalloc((void **)&d_encryptKey, sizeof(AES_KEY));
    cudaMalloc((void **)&d_decryptKey, sizeof(AES_KEY));

    cudaMemcpy(d_in, in, sizeof(in), cudaMemcpyHostToDevice);
    cudaMemcpy(d_encryptKey, &encryptKey, sizeof(AES_KEY), cudaMemcpyHostToDevice);
    cudaMemcpy(d_decryptKey, &decryptKey, sizeof(AES_KEY), cudaMemcpyHostToDevice);

    int threadsPerBlock = 1;
    int numBlocks = 1;

    // Encrypt the single 16-byte block and copy the ciphertext back to the host.
    aesEncryptKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out, d_encryptKey);
    cudaMemcpy(out, d_out, sizeof(out), cudaMemcpyDeviceToHost);

    std::cout << "Encrypted text: ";
    for (int i = 0; i < 16; i++) {
        std::cout << out[i];
    }
    std::cout << std::endl;

    // Decrypt the ciphertext and copy the recovered plaintext back to the host.
    aesDecryptKernel<<<numBlocks, threadsPerBlock>>>(d_out, d_in, d_decryptKey);
    cudaMemcpy(in, d_in, sizeof(in), cudaMemcpyDeviceToHost);

    std::cout << "Decrypted text: ";
    for (int i = 0; i < 16; i++) {
        std::cout << in[i];
    }
    std::cout << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_encryptKey);
    cudaFree(d_decryptKey);

    return 0;
}

Experiment 10
Aim: Write and test CUDA program for random number generation

Code:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <curand_kernel.h>

// Each thread initializes its own cuRAND state and produces one random integer.
__global__ void generateRandomNumbers(int *randomNumbers, int numElements,
                                      unsigned long long seed) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < numElements) {
        curandState state;
        curand_init(seed, idx, 0, &state);
        randomNumbers[idx] = curand(&state);
    }
}

int main() {
    int numElements = 100;
    int *d_randomNumbers, *h_randomNumbers;
    h_randomNumbers = (int *)malloc(numElements * sizeof(int));

    cudaMalloc((void **)&d_randomNumbers, numElements * sizeof(int));

    int threadsPerBlock = 256;
    int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    generateRandomNumbers<<<numBlocks, threadsPerBlock>>>(d_randomNumbers,
                                                          numElements, time(NULL));

    cudaMemcpy(h_randomNumbers, d_randomNumbers, numElements * sizeof(int),
               cudaMemcpyDeviceToHost);

    std::cout << "Generated random numbers: \n";
    for (int i = 0; i < numElements; i++) {
        std::cout << h_randomNumbers[i] << " ";
    }
    std::cout << std::endl;

    cudaFree(d_randomNumbers);
    free(h_randomNumbers);

    return 0;
}
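If uniformly distributed floats are needed instead of raw integers, curand_uniform can be used inside the same kernel. A minimal variation of the kernel body (illustrative only; randomFloats is an assumed float* output array, not part of the listing above):

curandState state;
curand_init(seed, idx, 0, &state);
randomFloats[idx] = curand_uniform(&state);   // uniform float in (0, 1]
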
