0% found this document useful (0 votes)
17 views44 pages

Week 6 10

Uploaded by

Aditya Raj
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
17 views44 pages

Week 6 10

Uploaded by

Aditya Raj
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 44

Aditya Raj

210968136
Batch 4
Parallel Programming Lab
04 April 2024.
Week 6 – 16 February 2024.
1) Write a simple MPI program to find out pow (x, rank) for all the processes where 'x' is the integer
constant, and 'rank' is the rank of the process.

#include <stdio.h>

#include <math.h>

#include <mpi.h>

int main(int argc, char* argv[])

MPI_Init(&argc, &argv);

int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

// Assuming x is a constant value

int x = 40;

double result = pow(x, world_rank);

printf("Process %d: pow(%d, %d) = %f\n", world_rank, x, world_rank, result);

MPI_Finalize();

return 0;

Output:

Process 2: pow(40, 2) = 1600.000000

Process 1: pow(40, 1) = 40.000000

Process 0: pow(40, 0) = 1.000000

Process 4: pow(40, 4) = 2560000.000000

Process 3: pow(40, 3) = 64000.000000

2) Write a program in MPI where even ranked process prints "Hello" and odd ranked process prints "World".

#include <stdio.h>

#include <mpi.h>

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

if (world_rank % 2 == 0) {

printf(" EVEN Hello from process %d\n", world_rank);

else {

printf("ODD World from process %d\n", world_rank);

MPI_Finalize();

return 0;

Output:

Odd World from process 13

Even Hello from process 2

Odd World from process 9

Odd World from process 1

Even Hello from process 8

Even Hello from process 12

Odd World from process 5

Odd World from process 3

Odd World from process 11

Even Hello from process 6

Even Hello from process 0

Even Hello from process 4

Odd World from process 7

Even Hello from process 10

3) Write a program in MPI to simulate simple calculator. Perform each operation using diBerent process in
parallel.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);
int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

char operations[] = { '+', '-', '*', '/' };

char operation = operations[world_rank % 4];

double operand1 = 10.0, operand2 = 2.0;

double result = 0.0;

switch (operation) {

case '+':

result = operand1 + operand2;

break;

case '-':

result = operand1 - operand2;

break;

case '*':

result = operand1 * operand2;

break;

case '/':

if (operand2 != 0) {

result = operand1 / operand2;

else {

fprintf(stderr, "Error: Division by zero.\n");

MPI_Abort(MPI_COMM_WORLD, 1);

break;

default:

fprintf(stderr, "Error: Unknown operation.\n");

MPI_Abort(MPI_COMM_WORLD, 1);

// Allocate memory dynamically for gathered_results

double* gathered_results = (double*)malloc(world_size * sizeof(double));

MPI_Gather(&result, 1, MPI_DOUBLE, gathered_results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

if (world_rank == 0) {
printf("Results:\n");

for (int i = 0; i < world_size; ++i) {

printf("Process %d: %.2f %c %.2f = %.2f\n", i, operand1, operations[i % 4], operand2, gathered_results[i]);

// Free dynamically allocated memory

free(gathered_results);

MPI_Finalize();

return 0;

Output:

Process 0: 10.00 + 2.00 = 12.00

Process 1: 10.00 - 2.00 = 8.00

Process 2: 10.00 * 2.00 = 20.00

Process 3: 10.00 / 2.00 = 5.00

Process 4: 10.00 + 2.00 = 12.00

Process 5: 10.00 - 2.00 = 8.00

Process 6: 10.00 * 2.00 = 20.00

Process 7: 10.00 / 2.00 = 5.00

Process 8: 10.00 + 2.00 = 12.00

Process 9: 10.00 - 2.00 = 8.00

Process 10: 10.00 * 2.00 = 20.00

Process 11: 10.00 / 2.00 = 5.00

Process 12: 10.00 + 2.00 = 12.00

Process 13: 10.00 - 2.00 = 8.00

4) Write a program in MPI to toggle the character of a given string indexed by the rank of the process.

#include <stdio.h>

#include <string.h>

#include <mpi.h>

#define MAX_STRING_SIZE 100

int main(int argc, char *argv[]) {

int rank, size;


char str[MAX_STRING_SIZE] = "HeLLO";

int str_len;

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

str_len = strlen(str);

// Determine the character index to toggle based on process rank

int char_index = rank % str_len;

// Toggle the character

if (rank < str_len) {

if (str[char_index] >= 'a' && str[char_index] <= 'z') {

str[char_index] = str[char_index] - 32; // Convert to uppercase

} else if (str[char_index] >= 'A' && str[char_index] <= 'Z') {

str[char_index] = str[char_index] + 32; // Convert to lowercase

// Gather all modified strings to process 0

MPI_Gather(rank < str_len ? &str[char_index] : NULL, 1, MPI_CHAR,

str, 1, MPI_CHAR, 0, MPI_COMM_WORLD);

// Print the result in process 0

if (rank == 0) {

printf("Original String: HeLLO\n");

printf("Modified String: %s\n", str);

MPI_Finalize();

return 0;

Output:

Original String: ThisIsALongerString

Modified String: tHiSiSaLoNgErStRiNg


5) Write a program in MPI to reverse the digits of the following integer array of size 9 with 9 processes.
Initialize the Input array to the following values.

Input array : 18, 523, 301, 1234, 2, 14, 108, 150, 1928 Output array: 81, 325, 103, 4321, 2, 41, 801, 51, 8291

#include <stdio.h>

#include <mpi.h>

#define ARRAY_SIZE 9

int reverseDigits(int num) {

int reversed = 0;

while (num > 0) {

reversed = reversed * 10 + num % 10;

num /= 10;

return reversed;

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int rank, size;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

if (size != ARRAY_SIZE) {

if (rank == 0) {

fprintf(stderr, "Please run the program with exactly 9 processes.\n");

MPI_Finalize();

return 1;

int inputArray[ARRAY_SIZE] = { 18, 523, 301, 1234, 2, 14, 108, 150, 1928 };

int outputArray[ARRAY_SIZE];

int gatheredArray[ARRAY_SIZE];

// Scatter the input array among processes

MPI_Scatter(inputArray, 1, MPI_INT, &outputArray[rank], 1, MPI_INT, 0, MPI_COMM_WORLD);

// Reverse the digits

outputArray[rank] = reverseDigits(outputArray[rank]);
// Gather the reversed array using a separate buver

MPI_Gather(&outputArray[rank], 1, MPI_INT, gatheredArray, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Print the result

if (rank == 0) {

printf("Input array : ");

for (int i = 0; i < ARRAY_SIZE; i++) {

printf("%d ", inputArray[i]);

printf("\nOutput array: ");

for (int i = 0; i < ARRAY_SIZE; i++) {

printf("%d ", gatheredArray[i]);

printf("\n");

MPI_Finalize();

return 0;

Output:

Input array : 18 523 301 1234 2 14 108 150 1928

Output array: 81 325 103 4321 2 41 801 51 8291

6) Write a MPI program to find the prime numbers between 1 and 100 using 2 processes.

#include <stdio.h>

#include <stdbool.h>

#include <mpi.h>

#define RANGE_START 1

#define RANGE_END 100

bool isPrime(int num) {

if (num < 2) {

return false;

for (int i = 2; i * i <= num; i++) {


if (num % i == 0) {

return false;

return true;

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int rank, size;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

if (size != 2) {

if (rank == 0) {

fprintf(stderr, "Please run the program with exactly 2 processes.\n");

MPI_Finalize();

return 1;

int start, end;

int primesInRange = 0;

if (rank == 0) {

// Process 0 will check for primes in the first half of the range

start = RANGE_START;

end = RANGE_END / 2;

else {

// Process 1 will check for primes in the second half of the range

start = RANGE_END / 2 + 1;

end = RANGE_END;

// Each process checks for prime numbers in its assigned range

for (int num = start; num <= end; num++) {

if (isPrime(num)) {
primesInRange++;

// Process 0 receives the count of primes from Process 1 and adds them

if (rank == 0) {

int receivedPrimes;

MPI_Recv(&receivedPrimes, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

primesInRange += receivedPrimes;

printf("Prime numbers between %d and %d: %d\n", RANGE_START, RANGE_END, primesInRange);

else {

// Process 1 sends its count of primes to Process 0

MPI_Send(&primesInRange, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);

MPI_Finalize();

return 0;

Output:

Prime numbers between 1 and 100: 25


Week 7 – 23 February 2024.
1) Write a MPI program using synchronous send. The sender process sends a word to the receiver. The
second process receives the word, toggles each letter of the word and sends it back to the first process.
Both processes use synchronous send operations.

#include <stdio.h>

#include <string.h>

#include <ctype.h>

#include <mpi.h>

void toggle_case(char* word) {

for (int i = 0; word[i] != '\0'; i++) {

if (isupper(word[i]))

word[i] = tolower(word[i]);

else if (islower(word[i]))

word[i] = toupper(word[i]);

int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);

int rank;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

const int tag = 0;

const int root = 0;

if (rank == root) {

// Sender process

const char* word = "Hello";

printf("Process %d sends word: %s\n", rank, word);

MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 1, tag, MPI_COMM_WORLD);

// Receive the toggled word back from process 1

char received_word[100];

MPI_Recv(received_word, 100, MPI_CHAR, 1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Process %d received toggled word: %s\n", rank, received_word);

} else if (rank == 1) {

// Receiver process

char word[100];
MPI_Recv(word, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Process %d received word: %s\n", rank, word);

// Toggle the case of each letter in the word

toggle_case(word);

// Send it back to the sender process

MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 0, tag, MPI_COMM_WORLD);

MPI_Finalize();

return 0;

Output:

Process 1 received word: Hello

Process 0 sends word: Hello

Process 0 received toggled word: hELLO

2) Write a MPI program where the master process (process 0) sends a number to each of the slaves and
the slave processes receive the number and prints it. Use standard send.

#include <mpi.h>

#include <stdio.h>

int main(int argc, char** argv) {

MPI_Init(&argc, &argv);

int world_size;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

int world_rank;

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

int number;

if (world_rank == 0) {

// Master process

// Choose a number to send to all slave processes

number = 777; // You can change this number to any number you want to send

// Use MPI_Send to send it to all the other processes

for (int i = 1; i < world_size; i++) {


MPI_Send(&number, 1, MPI_INT, i, 0, MPI_COMM_WORLD);

printf("Master process sending number %d to process %d\n", number, i);

} else {

// Slave processes

MPI_Recv(&number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Slave process %d received number %d from master process\n", world_rank, number);

MPI_Finalize();

return 0;

Output:

Slave process 3 received number 123 from master process

Slave process 2 received number 123 from master process

Slave process 1 received number 123 from master process

Master process sending number 123 to process 1

Master process sending number 123 to process 2

Master process sending number 123 to process 3

3) Write a MPI program to read N elements of the array in the root process (process 0) where N is equal to the total
number of process. The root process sends one value to each of the slaves. Let even ranked process finds square of
the received element and odd ranked process finds cube of received element. Use BuKered send.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);

int world_size;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

int world_rank;

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

// The root process (process 0) will read N elements where N is equal to world_size

int *numbers = NULL;


if (world_rank == 0) {

numbers = (int*)malloc(sizeof(int) * world_size);

// Initialize the array with some values

for (int i = 0; i < world_size; i++) {

numbers[i] = i + 1; // Or any other logic to initialize the array

// Buver for buvered send

int buver_size = world_size * sizeof(int) + MPI_BSEND_OVERHEAD;

void *buver = malloc(buver_size);

MPI_Buver_attach(buver, buver_size);

// Distribute one number to each slave process

if (world_rank == 0) {

for (int i = 1; i < world_size; i++) {

MPI_Bsend(&numbers[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);

// Each slave process receives a number and performs its operation

int received_number;

if (world_rank != 0) {

MPI_Recv(&received_number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

if (world_rank % 2 == 0) {

// Even rank: square the number

printf("Process %d received %d, squaring it to %d\n", world_rank, received_number, received_number *


received_number);

} else {

// Odd rank: cube the number

printf("Process %d received %d, cubing it to %d\n", world_rank, received_number, received_number *


received_number * received_number);

// Detach and free the buver

MPI_Buver_detach(&buver, &buver_size);

free(buver);

// Free numbers array on root process


if(world_rank == 0) {

free(numbers);

MPI_Finalize();

return 0;

Output:

Process 1 received 2, cubing it to 8

Process 3 received 4, cubing it to 64

Process 2 received 3, squaring it to 9

4) Write a MPI program to read an integer value in the root process. Root process sends this value to
Process1, Process1 sends this value to Process2 and so on. Last process sends the value back to root
process. When sending the value each process will first increment the received value by one. Write the
program using point to point communication routines.

#include <stdio.h>

#include <mpi.h>

int main(int argc, char** argv) {

int value;

int world_rank, world_size;

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

if (world_rank == 0) {

// Root process reads the integer value

value = 10; // Example value, can be read from user input

printf("Root process starts with value: %d\n", value);

MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);

} else {

MPI_Recv(&value, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

value++; // Increment the value by one

printf("Process %d incremented value to: %d\n", world_rank, value);

if (world_rank < world_size - 1) {

MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);


} else {

// Last process sends it back to the root process

MPI_Send(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);

// Root process receives the value from the last process

if (world_rank == 0) {

MPI_Recv(&value, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Root process received final value: %d\n", value);

MPI_Finalize();

return 0;

Output:

Process 1 incremented value to: 11

Root process starts with value: 10

Root process received final value: 13

Process 3 incremented value to: 13

Process 2 incremented value to: 12

5) Write a MPI program to read N elements of an array in the master process. Let N processes including
master process check the array values are prime or not.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

// Function to check if a number is prime

int is_prime(int number) {

if (number <= 1) return 0;

for (int i = 2; i * i <= number; i++) {

if (number % i == 0) return 0;

return 1;

}
int main(int argc, char** argv) {

int world_rank, world_size, number_to_check;

int* array = NULL; // Initialize the pointer to NULL

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

if (world_rank == 0) {

array = (int*)malloc(sizeof(int) * world_size); // Allocate memory only on the root process

for (int i = 0; i < world_size; i++) {

array[i] = i + 2; // Example values, start from 2 (first prime number)

// Use MPI_Scatter to distribute the values

MPI_Scatter(array, 1, MPI_INT, &number_to_check, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Each process checks if the number received is prime

int result = is_prime(number_to_check);

printf("Process %d received %d, prime? %s\n", world_rank, number_to_check, result ? "Yes" : "No");

// Free the allocated memory on the root process

if (world_rank == 0) {

free(array);

MPI_Finalize();

return 0;

Output:

Process 1 received 3, prime? Yes

Process 2 received 4, prime? No

Process 0 received 2, prime? Yes

Process 3 received 5, prime? Yes


Week 8 – 1 March 2024.
1. Write a program in CUDA to add two vectors of length N using

a) block size as N b) N threads

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA kernel to add elements of two arrays

__global__ void add(int* x, int* y, int* z, int n) {

int index = threadIdx.x;

if (index < n)

z[index] = x[index] + y[index];

// Function to print the vector

void printVector(int* vector, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", vector[i]);

printf("\n");

int main() {

int N = 1024; // Example size, make sure it does not exceed your GPU's capability

int* x, * y, * z, * d_x, * d_y, * d_z;

// Allocate memory

x = (int*)malloc(N * sizeof(int));

y = (int*)malloc(N * sizeof(int));

z = (int*)malloc(N * sizeof(int));

cudaMalloc(&d_x, N * sizeof(int));

cudaMalloc(&d_y, N * sizeof(int));

cudaMalloc(&d_z, N * sizeof(int));

// Initialize arrays
for (int i = 0; i < N; i++) {

x[i] = 5;

y[i] = 2;

// Copy inputs to device

cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);

// Launch add() kernel on GPU

add << <1, N >> > (d_x, d_y, d_z, N);

// Copy result back to host

cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);

// Print the result

printf("Result: ");

printVector(z, N);

// Cleanup

cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);

free(x); free(y); free(z);

return 0;

Output:

Result: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>
#include <cstdlib>

__global__ void add(int* x, int* y, int* z, int n) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index < n)

z[index] = x[index] + y[index];

void printVector(int* vector, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", vector[i]);

printf("\n");

int main() {

int N = 2048; // Example size

int* x, * y, * z, * d_x, * d_y, * d_z;

x = (int*)malloc(N * sizeof(int));

y = (int*)malloc(N * sizeof(int));

z = (int*)malloc(N * sizeof(int));

cudaMalloc(&d_x, N * sizeof(int));

cudaMalloc(&d_y, N * sizeof(int));

cudaMalloc(&d_z, N * sizeof(int));

for (int i = 0; i < N; i++) {

x[i] = 5;

y[i] = 2;

cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 256; // Choose appropriate block size

int numBlocks = (N + blockSize - 1) / blockSize;

add << <numBlocks, blockSize >> > (d_x, d_y, d_z, N);

cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);

printf("Result: ");

printVector(z, N);

cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);


free(x); free(y); free(z);

return 0;

Output:

Results: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
77777777777777777777777777777777777777777777777777777777777

2) Implement a CUDA program to add two vectors of length N by keeping the number of threads per block
as 256 (constant) and vary the number of blocks to handle N elements.

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA kernel to add elements of two arrays

__global__ void add(int* a, int* b, int* c, int N) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

if (index < N)

c[index] = a[index] + b[index];

int main() {

int N = 1 << 20; // Example: Number of elements in each vector (1 Million elements)

int size = N * sizeof(int);

int threadsPerBlock = 256;

int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // Calculate needed blocks

// Allocate memory on the host

int* h_a, * h_b, * h_c;


h_a = (int*)malloc(size);

h_b = (int*)malloc(size);

h_c = (int*)malloc(size);

// Initialize vectors on the host

for (int i = 0; i < N; i++) {

h_a[i] = i;

h_b[i] = i;

// Allocate vectors in device memory

int* d_a, * d_b, * d_c;

cudaMalloc(&d_a, size);

cudaMalloc(&d_b, size);

cudaMalloc(&d_c, size);

// Copy vectors from host memory to device memory

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

// Invoke kernel

add << <blocksPerGrid, threadsPerBlock >> > (d_a, d_b, d_c, N);

// Copy result from device memory to host memory

cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

// Print the results: Print the first 10 elements

printf("Result of Vector Addition (First 10 Elements):\n");

for (int i = 0; i < 10; i++) {

printf("h_c[%d] = %d\n", i, h_c[i]);

// Free device memory

cudaFree(d_a);

cudaFree(d_b);

cudaFree(d_c);

// Free host memory

free(h_a);

free(h_b);

free(h_c);

return 0;
}

Output:

Result of Vector Addition (First 10 Elements):

h_c[0] = 0

h_c[1] = 2

h_c[2] = 4

h_c[3] = 6

h_c[4] = 8

h_c[5] = 10

h_c[6] = 12

h_c[7] = 14

h_c[8] = 16

h_c[9] = 18

3) Write a program in CUDA which performs convolution operation on one dimensional input array N of size width
using a mask array M of size mask_width to produce the resultant one-dimensional array P of size width.

#include <stdio.h>

#include <cuda_runtime.h>

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA Kernel for one-dimensional convolution

__global__ void convolution_1d(int* N, int* M, int* P, int width, int mask_width) {

int i = blockIdx.x * blockDim.x + threadIdx.x;

int k;

// Each thread computes one element of P

if (i < width) {

int pValue = 0;

// Convolution operation

for (k = 0; k < mask_width; ++k) {

int maskIndex = mask_width - 1 - k;

int nIndex = i - (mask_width / 2) + k;

if (nIndex >= 0 && nIndex < width) {

pValue += N[nIndex] * M[maskIndex];


}

P[i] = pValue;

// Function to print the array

void printArray(int* array, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", array[i]);

printf("\n");

int main() {

int width = 10; // Example array size

int mask_width = 3; // Example mask size

int N[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; // Example input array

int M[3] = { 1, 0, -1 }; // Example mask array

int P[10]; // Resultant array

int* d_N, * d_M, * d_P;

// Allocate memory on the device

cudaMalloc(&d_N, width * sizeof(int));

cudaMalloc(&d_M, mask_width * sizeof(int));

cudaMalloc(&d_P, width * sizeof(int));

// Copy inputs to device

cudaMemcpy(d_N, N, width * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_M, M, mask_width * sizeof(int), cudaMemcpyHostToDevice);

// Launch kernel on the GPU

convolution_1d << <(width + 255) / 256, 256 >> > (d_N, d_M, d_P, width, mask_width);

// Copy result back to host

cudaMemcpy(P, d_P, width * sizeof(int), cudaMemcpyDeviceToHost);

// Print the resultant array

printf("Resultant array: ");

printArray(P, width);

// Free device memory

cudaFree(d_N);
cudaFree(d_M);

cudaFree(d_P);

return 0;

Output:

Resultant array: 2 2 2 2 2 2 2 2 2 -9

4) Write a program in CUDA to process a ID array containing angles in radians to generate sine of the
angles in the output array. Use appropriate function.

#include <stdio.h>

#include <cuda_runtime.h>

#include <math.h>

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// Define M_PI if it's not defined by the math library

#ifndef M_PI

#define M_PI 3.14159265358979323846

#endif

// CUDA Kernel to compute sine of angles in radians

__global__ void compute_sine(float* input, float* output, int n) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index < n) {

output[index] = sinf(input[index]); // Use sinf for single precision float

// Function to print the array

void printArray(float* array, int size) {

for (int i = 0; i < size; i++) {

printf("%f ", array[i]);

printf("\n");
}

int main() {

int n = 5; // Example array size

float inputAngles[] = { 0.0, M_PI / 6, M_PI / 4, M_PI / 2, M_PI }; // Example angles in radians

// Allocate memory on the device

float* d_input, * d_output;

cudaMalloc((void**)&d_input, n * sizeof(float));

cudaMalloc((void**)&d_output, n * sizeof(float));

// Copy the input array from host to device

cudaMemcpy(d_input, inputAngles, n * sizeof(float), cudaMemcpyHostToDevice);

// Calculate the number of blocks and threads per block

int threadsPerBlock = 256;

int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

// Allocate memory for the output array on the host

float* output = new float[n]; // Use dynamic allocation

// Launch the CUDA Kernel

compute_sine << <blocksPerGrid, threadsPerBlock >> > (d_input, d_output, n);

// Copy the result back to the host

cudaMemcpy(output, d_output, n * sizeof(float), cudaMemcpyDeviceToHost);

// Print the resultant array

printf("Sine of angles: ");

printArray(output, n);

// Free device memory

cudaFree(d_input);

cudaFree(d_output);

// Free host memory

delete[] output;

return 0;

Output:

Sine of angles: 0.000000 0.500000 0.707107 1.000000 -0.000000


Week 9 – 8 March 2024.
1) Write a program in CUDA to count the number of times a given word is repeated in a sentence (Use
Atomic Function).

#include <stdio.h>

#include <cuda_runtime.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

#include <string.h>

#include <sm_20_atomic_functions.h>

__global__ void countWordOccurrences(char* sentence, char* word, int sentenceLength, int wordLength, int*
count) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

int localCount = 0;

// Each thread checks a part of the sentence for the word

for (int i = index; i <= sentenceLength - wordLength; i += stride) {

bool wordFound = true;

for (int j = 0; j < wordLength; j++) {

if (sentence[i + j] != word[j]) {

wordFound = false;

break;

if (wordFound) {

localCount++;

// Use atomicAdd to safely add the local count to the global total

atomicAdd(count, localCount);

int main() {

char* sentence = "hello hello world world world world"; // Example sentence
char* word = "world"; // Word to count in the sentence

int* count;

char* d_sentence, * d_word;

int* d_count;

int sentenceLength = strlen(sentence);

int wordLength = strlen(word);

// Allocate memory for device copies of sentence, word, count

cudaMalloc((void**)&d_sentence, sentenceLength);

cudaMalloc((void**)&d_word, wordLength);

cudaMalloc((void**)&d_count, sizeof(int));

// Copy inputs to device

cudaMemcpy(d_sentence, sentence, sentenceLength, cudaMemcpyHostToDevice);

cudaMemcpy(d_word, word, wordLength, cudaMemcpyHostToDevice);

cudaMemset(d_count, 0, sizeof(int));

// Launch countWordOccurrences() kernel on GPU with enough blocks and threads

countWordOccurrences << <1, 256 >> > (d_sentence, d_word, sentenceLength, wordLength, d_count);

// Copy result back to host

cudaMemcpy(count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

printf("The word '%s' appears %d times in the sentence.\n", word, *count);

// Cleanup

cudaFree(d_sentence); cudaFree(d_word); cudaFree(d_count);

return 0;

Output:

The word ‘world’ appears 4 times in the sentence.

2) Write a CUDA program that reads a string S and produces the string RS as follows:

Input string S: PCAP Output string RS: PCAPPCAPCP Note: Each work item copies required number of
characters from S in RS.

#include <stdio.h>

#include <cuda_runtime.h>
__global__ void generateString(char* inputString, char* outputString, int length) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

for (int i = index; i < length; i += stride) {

outputString[i] = inputString[i % 4]; // Modulo operation to repeat the string

int main() {

char inputString[] = "PCAP"; // Example input string

int length = 12; // Length of the output string (4 times the length of inputString)

char* d_inputString, * d_outputString;

// Allocate memory for device copies of inputString and outputString

cudaMalloc((void**)&d_inputString, strlen(inputString));

cudaMalloc((void**)&d_outputString, length);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, strlen(inputString), cudaMemcpyHostToDevice);

// Launch generateString kernel on GPU with enough blocks and threads

generateString << <1, 256 >> > (d_inputString, d_outputString, length);

// Copy result back to host

char* outputString = (char*)malloc(length);

cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);

printf("Input string S: %s\n", inputString);

printf("Output string RS: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string S: PCAP

Output string RS: PCAPPCAPPCAP


3) Write a CUDA program which reads a string consisting of N words and reverse each word of it in parallel.

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

__global__ void reverseWords(char* inputString, char* outputString, int* wordLengths, int numWords) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Iterate over each word

for (int i = index; i < numWords; i += stride) {

int wordStart = (i == 0) ? 0 : wordLengths[i - 1]; // Start index of the word

int wordEnd = wordLengths[i]; // End index of the word

// Reverse the word

for (int j = wordStart; j < (wordStart + wordEnd) / 2; j++) {

char temp = inputString[j];

outputString[j] = inputString[wordStart + wordEnd - j - 1];

outputString[wordStart + wordEnd - j - 1] = temp;

int main() {

char inputString[] = "Hello World CUDA Program"; // Example input string

int length = strlen(inputString);

int numWords = 0;

// Count the number of words

for (int i = 0; i < length; i++) {

if (inputString[i] == ' ') {

numWords++;

numWords++; // Increment for the last word

// Allocate memory for device copies of inputString, outputString, and wordLengths

char* d_inputString, * d_outputString;

int* d_wordLengths;

cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, length);

cudaMalloc((void**)&d_wordLengths, numWords * sizeof(int));

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Calculate word lengths

int wordStartIndex = 0;

int* wordLengths = (int*)malloc(numWords * sizeof(int));

for (int i = 0; i < length; i++) {

if (inputString[i] == ' ') {

wordLengths[numWords - 1] = i - wordStartIndex; // Store length of each word

wordStartIndex = i + 1; // Move to the start of next word

numWords--;

wordLegths[numWords - 1] = length - wordStartIndex; // Length of the last word

// Copy wordLengths to device

cudaMemcpy(d_wordLengths, wordLengths, numWords * sizeof(int), cudaMemcpyHostToDevice);

// Launch reverseWords kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (length + blockSize - 1) / blockSize;

reverseWords << <numBlocks, blockSize >> > (d_inputString, d_outputString, d_wordLengths, numWords);

// Copy result back to host

char* outputString = (char*)malloc(length);

cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

cudaFree(d_wordLengths);

free(wordLengths);

free(outputString);

return 0;

}
Output:

Input string: Hello World

Output string: olleH dlroW

4) Write a CUDA program that takes a string Sin as input and one integer value N and produces an output
string, Sout, in parallel by concatenating input string Sin, N times as shown below.

INPUT : Sin ="Hello" N =3 OUTPUT : Sout = "HelloHelloHello" Note: Every thread copies the same
character from the Input string S, N times to the required position.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void concatenateString(char* inputString, int inputLength, char* outputString, int repetitions) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Copy characters from input string to output string

for (int i = index; i < inputLength * repetitions; i += stride) {

outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly

int main() {

char inputString[] = "Hello"; // Example input string

int length = strlen(inputString);

int repetitions = 3; // Number of times input string should be repeated

// Allocate memory for device copies of inputString and outputString

char* d_inputString, * d_outputString;

int outputLength = length * repetitions; // Length of the output string

cudaMalloc((void**)&d_inputString, length);

cudaMalloc((void**)&d_outputString, outputLength);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Launch concatenateString kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (outputLength + blockSize - 1) / blockSize;

concatenateString <<<numBlocks, blockSize>>> (d_inputString, length, d_outputString, repetitions);

// Copy result back to host

char* outputString = (char*)malloc(outputLength);


cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string: Hello

Output string: HelloHelloHello

5) Write a CUDA program which reads a string Sin and produces an output string T as shownbelow.

Input: Sin: "Hai" Ouput: T: "Haaiii" Note:Every thread stores a character from input string Sin, required
number of times intooutput string T.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void repeatCharacters(char* inputString, int inputLength, char* outputString, int repetitions) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Copy characters from input string to output string

for (int i = index; i < inputLength * repetitions; i += stride) {

outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly

// Repeat the last character multiple times

if (index == 0) {

char lastChar = inputString[inputLength - 1];

for (int i = inputLength * repetitions - 1; i >= inputLength * (repetitions - 1); --i) {

outputString[i] = lastChar;

}
int main() {

char inputString[] = "Hai"; // Example input string

int length = strlen(inputString);

int repetitions = 3; // Number of times each character should be repeated, except for the last character

// Allocate memory for device copies of inputString and outputString

char* d_inputString, * d_outputString;

int outputLength = length * repetitions + 2; // Length of the output string

cudaMalloc((void**)&d_inputString, length);

cudaMalloc((void**)&d_outputString, outputLength);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Launch repeatCharacters kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (outputLength + blockSize - 1) / blockSize;

repeatCharacters<<<numBlocks, blockSize>>>(d_inputString, length, d_outputString, repetitions);

// Copy result back to host

char* outputString = (char*)malloc(outputLength);

cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string: Hello

Output string: Heelllllllooooo


Week 10 – 26 March 2024.
1) Write a program in CUDA to add two matrices for the following specifications:

• Each row of resultant matrix to be computed by one thread.

• Each column of resultant matrix to be computed by one thread.

• Each element of resultant matrix to be computed by one thread.

#include <stdio.h>

#include <cuda_runtime.h>

#define N 3 // Matrix size

// Kernel to add two matrices where each row of the resultant matrix is computed by one thread

__global__ void addMatrixRows(float* A, float* B, float* C) {

int row = blockIdx.x; // Each block computes one row

int col = threadIdx.x; // Each thread computes one element of the row

C[row * N + col] = A[row * N + col] + B[row * N + col];

// Kernel to add two matrices where each column of the resultant matrix is computed by one thread

__global__ void addMatrixColumns(float* A, float* B, float* C) {

int row = threadIdx.x; // Each thread computes one element of the column

int col = blockIdx.x; // Each block computes one column

C[row * N + col] = A[row * N + col] + B[row * N + col];

// Kernel to add two matrices where each element of the resultant matrix is computed by one thread

__global__ void addMatrixElements(float* A, float* B, float* C) {

int index = threadIdx.x + blockIdx.x * blockDim.x; // Each thread computes one element of the matrix

if (index < N * N) {

C[index] = A[index] + B[index];

// Helper function to initialize matrices with random values

void initializeMatrix(float* matrix) {

for (int i = 0; i < N * N; i++) {

matrix[i] = (float)rand() / RAND_MAX; // Random value between 0 and 1

}
// Helper function to print matrix

void printMatrix(float* matrix) {

for (int i = 0; i < N; i++) {

for (int j = 0; j < N; j++) {

printf("%.2f\t", matrix[i * N + j]);

printf("\n");

int main() {

// Allocate memory for host matrices

float *h_A, *h_B, *h_C;

h_A = (float*)malloc(N * N * sizeof(float));

h_B = (float*)malloc(N * N * sizeof(float));

h_C = (float*)malloc(N * N * sizeof(float));

// Initialize host matrices with random values

initializeMatrix(h_A);

initializeMatrix(h_B);

// Allocate memory for device matrices

float *d_A, *d_B, *d_C;

cudaMalloc((void**)&d_A, N * N * sizeof(float));

cudaMalloc((void**)&d_B, N * N * sizeof(float));

cudaMalloc((void**)&d_C, N * N * sizeof(float));

// Copy host matrices to device

cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, N * N * sizeof(float), cudaMemcpyHostToDevice);

// Launch kernel to add matrices by rows

addMatrixRows<<<N, N>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);

// Print result matrix

printf("Matrix C (Sum of A and B by rows):\n");

printMatrix(h_C);

// Launch kernel to add matrices by columns


addMatrixColumns<<<N, N>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);

// Print result matrix

printf("\nMatrix C (Sum of A and B by columns):\n");

printMatrix(h_C);

// Launch kernel to add matrices element-wise

addMatrixElements<<<(N * N + 255) / 256, 256>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);

// Print result matrix

printf("\nMatrix C (Sum of A and B element-wise):\n");

printMatrix(h_C);

// Free device and host memory

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

free(h_A);

free(h_B);

free(h_C);

return 0;

Output:

Matrix A:

1.00 2.00 3.00

4.00 5.00 6.00

7.00 8.00 9.00

Matrix B:

0.34 -0.67 -2.38

-3.33 -3.66 -4.60

-6.08 -7.32 -7.47


Matrix C (Sum of A and B by rows):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

Matrix C (Sum of A and B by columns):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

Matrix C (Sum of A and B element-wise):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

2) Write a program in CUDA to multiply two matrices for the following specifications:

• Each row of resultant matrix to be computed by one thread.

• Each column of resultant matrix to be computed by one thread.

• Each element of resultant matrix to be computed by one thread.

#include <stdio.h>

#include <cuda_runtime.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Kernel function to multiply two matrices

__global__ void matrixMultiplication(int *a, int *b, int *c, int width) {

int row = blockIdx.y * blockDim.y + threadIdx.y; // Row index

int col = blockIdx.x * blockDim.x + threadIdx.x; // Column index

// Check if within matrix bounds

if (row < width && col < width) {

int sum = 0;

for (int i = 0; i < width; i++) {

sum += a[row * width + i] * b[i * width + col];

c[row * width + col] = sum;


}

int main() {

int width = 4; // Width of matrices

int size = width * width * sizeof(int); // Size of matrices in bytes

// Host matrices and result matrix

int *h_a, *h_b, *h_c;

h_a = (int*)malloc(size);

h_b = (int*)malloc(size);

h_c = (int*)malloc(size);

// Initialize host matrices with sample data

for (int i = 0; i < width * width; i++) {

h_a[i] = i;

h_b[i] = i * 2;

// Device matrices

int *d_a, *d_b, *d_c;

cudaMalloc(&d_a, size);

cudaMalloc(&d_b, size);

cudaMalloc(&d_c, size);

// Copy host matrices to device

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

// Define kernel launch configuration

dim3 threadsPerBlock(2, 2);

dim3 numBlocks(width / threadsPerBlock.x, width / threadsPerBlock.y);

// Launch kernel

matrixMultiplication<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, width);

// Copy result matrix from device to host

cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

// Print result matrix

printf("Result Matrix:\n");

for (int i = 0; i < width; i++) {

for (int j = 0; j < width; j++) {


printf("%d ", h_c[i * width + j]);

printf("\n");

// Free device memory

cudaFree(d_a);

cudaFree(d_b);

cudaFree(d_c);

// Free host memory

free(h_a);

free(h_b);

free(h_c);

return 0;

Output:

Matrix A:

0123

4567

8 9 10 11

12 13 14 15

Matrix B:

0246

8 10 12 14

16 18 20 22

24 26 28 30

Result Matrix:

28 34 40 46

76 98 120 142

124 162 200 238

172 226 280 334


3) Write a CUDA program to perform linear algebra function of the form y=(alpha)x+y, where x and y are
vectors and "alpha" ia scalar value.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void vectorAdd(float *x, float *y, float alpha, int N) {

int idx = threadIdx.x + blockDim.x * blockIdx.x;

if (idx < N) {

y[idx] = alpha * x[idx] + y[idx];

int main() {

const int N = 5;

float x[N] = {1, 2, 3, 4, 5};

float y[N] = {6, 7, 8, 9, 10};

float alpha = 2;

float *d_x, *d_y;

cudaMalloc(&d_x, N * sizeof(float));

cudaMalloc(&d_y, N * sizeof(float));

cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);

int blockSize = 256;

int numBlocks = (N + blockSize - 1) / blockSize;

vectorAdd<<<numBlocks, blockSize>>>(d_x, d_y, alpha, N);

cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(d_x);

cudaFree(d_y);

printf("Resulting vector y:\n");

for (int i = 0; i < N; ++i) {

printf("%f ", y[i]);

printf("\n");

return 0;

}
Output:

Input vector x:

12345

Input vector y:

6 7 8 9 10

Scalar alpha: 2

Resulting vector y:

8. 11 14 17 20

4) Write a CUDA program to sort every row of a matrix using selection sort.

#include <stdio.h>

#include <cuda_runtime.h>

__device__ void selectionSort(int *row, int size) {

for (int i = 0; i < size - 1; ++i) {

int minIndex = i;

for (int j = i + 1; j < size; ++j) {

if (row[j] < row[minIndex]) {

minIndex = j;

int temp = row[i];

row[i] = row[minIndex];

row[minIndex] = temp;

__global__ void sortRows(int *matrix, int rows, int cols) {

int row = blockIdx.x * blockDim.x + threadIdx.x;

if (row < rows) {

int *rowPtr = matrix + row * cols;

selectionSort(rowPtr, cols);

int main() {
int rows = 3;

int cols = 4;

int matrix[3][4] = {{9, 7, 5, 8},

{4, 6, 2, 1},

{3, 0, 2, 5}};

int *d_matrix;

cudaMalloc((void**)&d_matrix, rows * cols * sizeof(int));

cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 4; // Number of threads per block

int numBlocks = (rows + blockSize - 1) / blockSize;

sortRows<<<numBlocks, blockSize>>>(d_matrix, rows, cols);

cudaMemcpy(matrix, d_matrix, rows * cols * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(d_matrix);

printf("Sorted Matrix:\n");

for (int i = 0; i < rows; ++i) {

for (int j = 0; j < cols; ++j) {

printf("%d ", matrix[i][j]);

printf("\n");

return 0;

Output:

Input matrix:

97351

64820

59247

Sorted matrix (each row):

13579

02468

24579
5) Write a CUDA program to perform odd even transposition sort in parallel.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void oddEvenSort(int *arr, int n) {

int tid = threadIdx.x;

int ovset = blockDim.x;

for (int i = 0; i < n; ++i) {

int partner_tid = tid ^ 1;

int temp;

if ((i + tid) % 2 == 0 && partner_tid < n) {

if (arr[tid] > arr[partner_tid]) {

temp = arr[tid];

arr[tid] = arr[partner_tid];

arr[partner_tid] = temp; } }

__syncthreads(); }

int main() {

int n = 10, arr[] = {9, 4, 6, 2, 8, 5, 7, 1, 3, 0};

int *d_arr;

cudaMalloc((void**)&d_arr, n * sizeof(int));

cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 10; // Number of threads per block

oddEvenSort<<<1, blockSize>>>(d_arr, n);

cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(d_arr);

printf("Sorted Array:\n");

for (int i = 0; i < n; ++i)

printf("%d ", arr[i]);

return 0;

Output:

Sorted Array:

0123456789

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy