0% found this document useful (0 votes)

17 views44 pages

Week 6 10

Uploaded by

Aditya Raj

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

17 views44 pages

Week 6 10

Uploaded by

Aditya Raj

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 44

Aditya Raj

210968136
Batch 4
Parallel Programming Lab
04 April 2024.
Week 6 – 16 February 2024.
1) Write a simple MPI program to ﬁnd out pow (x, rank) for all the processes where 'x' is the integer
constant, and 'rank' is the rank of the process.

#include <stdio.h>

#include <math.h>

#include <mpi.h>

int main(int argc, char* argv[])

MPI_Init(&argc, &argv);

int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

// Assuming x is a constant value

int x = 40;

double result = pow(x, world_rank);

printf("Process %d: pow(%d, %d) = %f\n", world_rank, x, world_rank, result);

MPI_Finalize();

return 0;

Output:

Process 2: pow(40, 2) = 1600.000000

Process 1: pow(40, 1) = 40.000000

Process 0: pow(40, 0) = 1.000000

Process 4: pow(40, 4) = 2560000.000000

Process 3: pow(40, 3) = 64000.000000

2) Write a program in MPI where even ranked process prints "Hello" and odd ranked process prints "World".

#include <stdio.h>

#include <mpi.h>

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

if (world_rank % 2 == 0) {

printf(" EVEN Hello from process %d\n", world_rank);

else {

printf("ODD World from process %d\n", world_rank);

MPI_Finalize();

return 0;

Output:

Odd World from process 13

Even Hello from process 2

Odd World from process 9

Odd World from process 1

Even Hello from process 8

Even Hello from process 12

Odd World from process 5

Odd World from process 3

Odd World from process 11

Even Hello from process 6

Even Hello from process 0

Even Hello from process 4

Odd World from process 7

Even Hello from process 10

3) Write a program in MPI to simulate simple calculator. Perform each operation using diBerent process in
parallel.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);
int world_size, world_rank;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

char operations[] = { '+', '-', '*', '/' };

char operation = operations[world_rank % 4];

double operand1 = 10.0, operand2 = 2.0;

double result = 0.0;

switch (operation) {

case '+':

result = operand1 + operand2;

break;

case '-':

result = operand1 - operand2;

break;

case '*':

result = operand1 * operand2;

break;

case '/':

if (operand2 != 0) {

result = operand1 / operand2;

else {

fprintf(stderr, "Error: Division by zero.\n");

MPI_Abort(MPI_COMM_WORLD, 1);

break;

default:

fprintf(stderr, "Error: Unknown operation.\n");

MPI_Abort(MPI_COMM_WORLD, 1);

// Allocate memory dynamically for gathered_results

double* gathered_results = (double)malloc(world_size sizeof(double));

MPI_Gather(&result, 1, MPI_DOUBLE, gathered_results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

if (world_rank == 0) {
printf("Results:\n");

for (int i = 0; i < world_size; ++i) {

printf("Process %d: %.2f %c %.2f = %.2f\n", i, operand1, operations[i % 4], operand2, gathered_results[i]);

// Free dynamically allocated memory

free(gathered_results);

MPI_Finalize();

return 0;

Output:

Process 0: 10.00 + 2.00 = 12.00

Process 1: 10.00 - 2.00 = 8.00

Process 2: 10.00 * 2.00 = 20.00

Process 3: 10.00 / 2.00 = 5.00

Process 4: 10.00 + 2.00 = 12.00

Process 5: 10.00 - 2.00 = 8.00

Process 6: 10.00 * 2.00 = 20.00

Process 7: 10.00 / 2.00 = 5.00

Process 8: 10.00 + 2.00 = 12.00

Process 9: 10.00 - 2.00 = 8.00

Process 10: 10.00 * 2.00 = 20.00

Process 11: 10.00 / 2.00 = 5.00

Process 12: 10.00 + 2.00 = 12.00

Process 13: 10.00 - 2.00 = 8.00

4) Write a program in MPI to toggle the character of a given string indexed by the rank of the process.

#include <stdio.h>

#include <string.h>

#include <mpi.h>

#deﬁne MAX_STRING_SIZE 100

int main(int argc, char *argv[]) {

int rank, size;

char str[MAX_STRING_SIZE] = "HeLLO";

int str_len;

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

str_len = strlen(str);

// Determine the character index to toggle based on process rank

int char_index = rank % str_len;

// Toggle the character

if (rank < str_len) {

if (str[char_index] >= 'a' && str[char_index] <= 'z') {

str[char_index] = str[char_index] - 32; // Convert to uppercase

} else if (str[char_index] >= 'A' && str[char_index] <= 'Z') {

str[char_index] = str[char_index] + 32; // Convert to lowercase

// Gather all modiﬁed strings to process 0

MPI_Gather(rank < str_len ? &str[char_index] : NULL, 1, MPI_CHAR,

str, 1, MPI_CHAR, 0, MPI_COMM_WORLD);

// Print the result in process 0

if (rank == 0) {

printf("Original String: HeLLO\n");

printf("Modiﬁed String: %s\n", str);

MPI_Finalize();

return 0;

Output:

Original String: ThisIsALongerString

Modiﬁed String: tHiSiSaLoNgErStRiNg

5) Write a program in MPI to reverse the digits of the following integer array of size 9 with 9 processes.
Initialize the Input array to the following values.

Input array : 18, 523, 301, 1234, 2, 14, 108, 150, 1928 Output array: 81, 325, 103, 4321, 2, 41, 801, 51, 8291

#include <stdio.h>

#include <mpi.h>

#deﬁne ARRAY_SIZE 9

int reverseDigits(int num) {

int reversed = 0;

while (num > 0) {

reversed = reversed * 10 + num % 10;

num /= 10;

return reversed;

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int rank, size;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

if (size != ARRAY_SIZE) {

if (rank == 0) {

fprintf(stderr, "Please run the program with exactly 9 processes.\n");

MPI_Finalize();

return 1;

int inputArray[ARRAY_SIZE] = { 18, 523, 301, 1234, 2, 14, 108, 150, 1928 };

int outputArray[ARRAY_SIZE];

int gatheredArray[ARRAY_SIZE];

// Scatter the input array among processes

MPI_Scatter(inputArray, 1, MPI_INT, &outputArray[rank], 1, MPI_INT, 0, MPI_COMM_WORLD);

// Reverse the digits

outputArray[rank] = reverseDigits(outputArray[rank]);
// Gather the reversed array using a separate buver

MPI_Gather(&outputArray[rank], 1, MPI_INT, gatheredArray, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Print the result

if (rank == 0) {

printf("Input array : ");

for (int i = 0; i < ARRAY_SIZE; i++) {

printf("%d ", inputArray[i]);

printf("\nOutput array: ");

for (int i = 0; i < ARRAY_SIZE; i++) {

printf("%d ", gatheredArray[i]);

printf("\n");

MPI_Finalize();

return 0;

Output:

Input array : 18 523 301 1234 2 14 108 150 1928

Output array: 81 325 103 4321 2 41 801 51 8291

6) Write a MPI program to ﬁnd the prime numbers between 1 and 100 using 2 processes.

#include <stdio.h>

#include <stdbool.h>

#include <mpi.h>

#deﬁne RANGE_START 1

#deﬁne RANGE_END 100

bool isPrime(int num) {

if (num < 2) {

return false;

for (int i = 2; i * i <= num; i++) {

if (num % i == 0) {

return false;

return true;

int main(int argc, char* argv[]) {

MPI_Init(&argc, &argv);

int rank, size;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

MPI_Comm_size(MPI_COMM_WORLD, &size);

if (size != 2) {

if (rank == 0) {

fprintf(stderr, "Please run the program with exactly 2 processes.\n");

MPI_Finalize();

return 1;

int start, end;

int primesInRange = 0;

if (rank == 0) {

// Process 0 will check for primes in the ﬁrst half of the range

start = RANGE_START;

end = RANGE_END / 2;

else {

// Process 1 will check for primes in the second half of the range

start = RANGE_END / 2 + 1;

end = RANGE_END;

// Each process checks for prime numbers in its assigned range

for (int num = start; num <= end; num++) {

if (isPrime(num)) {
primesInRange++;

// Process 0 receives the count of primes from Process 1 and adds them

if (rank == 0) {

int receivedPrimes;

MPI_Recv(&receivedPrimes, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

primesInRange += receivedPrimes;

printf("Prime numbers between %d and %d: %d\n", RANGE_START, RANGE_END, primesInRange);

else {

// Process 1 sends its count of primes to Process 0

MPI_Send(&primesInRange, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);

MPI_Finalize();

return 0;

Output:

Prime numbers between 1 and 100: 25

Week 7 – 23 February 2024.
1) Write a MPI program using synchronous send. The sender process sends a word to the receiver. The
second process receives the word, toggles each letter of the word and sends it back to the ﬁrst process.
Both processes use synchronous send operations.

#include <stdio.h>

#include <string.h>

#include <ctype.h>

#include <mpi.h>

void toggle_case(char* word) {

for (int i = 0; word[i] != '\0'; i++) {

if (isupper(word[i]))

word[i] = tolower(word[i]);

else if (islower(word[i]))

word[i] = toupper(word[i]);

int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);

int rank;

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

const int tag = 0;

const int root = 0;

if (rank == root) {

// Sender process

const char* word = "Hello";

printf("Process %d sends word: %s\n", rank, word);

MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 1, tag, MPI_COMM_WORLD);

// Receive the toggled word back from process 1

char received_word[100];

MPI_Recv(received_word, 100, MPI_CHAR, 1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Process %d received toggled word: %s\n", rank, received_word);

} else if (rank == 1) {

// Receiver process

char word[100];
MPI_Recv(word, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Process %d received word: %s\n", rank, word);

// Toggle the case of each letter in the word

toggle_case(word);

// Send it back to the sender process

MPI_Ssend(word, strlen(word) + 1, MPI_CHAR, 0, tag, MPI_COMM_WORLD);

MPI_Finalize();

return 0;

Output:

Process 1 received word: Hello

Process 0 sends word: Hello

Process 0 received toggled word: hELLO

2) Write a MPI program where the master process (process 0) sends a number to each of the slaves and
the slave processes receive the number and prints it. Use standard send.

#include <mpi.h>

#include <stdio.h>

int main(int argc, char** argv) {

MPI_Init(&argc, &argv);

int world_size;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

int world_rank;

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

int number;

if (world_rank == 0) {

// Master process

// Choose a number to send to all slave processes

number = 777; // You can change this number to any number you want to send

// Use MPI_Send to send it to all the other processes

for (int i = 1; i < world_size; i++) {

MPI_Send(&number, 1, MPI_INT, i, 0, MPI_COMM_WORLD);

printf("Master process sending number %d to process %d\n", number, i);

} else {

// Slave processes

MPI_Recv(&number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Slave process %d received number %d from master process\n", world_rank, number);

MPI_Finalize();

return 0;

Output:

Slave process 3 received number 123 from master process

Slave process 2 received number 123 from master process

Slave process 1 received number 123 from master process

Master process sending number 123 to process 1

Master process sending number 123 to process 2

Master process sending number 123 to process 3

3) Write a MPI program to read N elements of the array in the root process (process 0) where N is equal to the total
number of process. The root process sends one value to each of the slaves. Let even ranked process ﬁnds square of
the received element and odd ranked process ﬁnds cube of received element. Use BuKered send.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

int main(int argc, char *argv[]) {

MPI_Init(&argc, &argv);

int world_size;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

int world_rank;

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

// The root process (process 0) will read N elements where N is equal to world_size

int *numbers = NULL;

if (world_rank == 0) {

numbers = (int)malloc(sizeof(int) world_size);

// Initialize the array with some values

for (int i = 0; i < world_size; i++) {

numbers[i] = i + 1; // Or any other logic to initialize the array

// Buver for buvered send

int buver_size = world_size * sizeof(int) + MPI_BSEND_OVERHEAD;

void *buver = malloc(buver_size);

MPI_Buver_attach(buver, buver_size);

// Distribute one number to each slave process

if (world_rank == 0) {

for (int i = 1; i < world_size; i++) {

MPI_Bsend(&numbers[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD);

// Each slave process receives a number and performs its operation

int received_number;

if (world_rank != 0) {

MPI_Recv(&received_number, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

if (world_rank % 2 == 0) {

// Even rank: square the number

printf("Process %d received %d, squaring it to %d\n", world_rank, received_number, received_number *

received_number);

} else {

// Odd rank: cube the number

printf("Process %d received %d, cubing it to %d\n", world_rank, received_number, received_number *

received_number * received_number);

// Detach and free the buver

MPI_Buver_detach(&buver, &buver_size);

free(buver);

// Free numbers array on root process

if(world_rank == 0) {

free(numbers);

MPI_Finalize();

return 0;

Output:

Process 1 received 2, cubing it to 8

Process 3 received 4, cubing it to 64

Process 2 received 3, squaring it to 9

4) Write a MPI program to read an integer value in the root process. Root process sends this value to
Process1, Process1 sends this value to Process2 and so on. Last process sends the value back to root
process. When sending the value each process will ﬁrst increment the received value by one. Write the
program using point to point communication routines.

#include <stdio.h>

#include <mpi.h>

int main(int argc, char** argv) {

int value;

int world_rank, world_size;

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

if (world_rank == 0) {

// Root process reads the integer value

value = 10; // Example value, can be read from user input

printf("Root process starts with value: %d\n", value);

MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);

} else {

MPI_Recv(&value, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

value++; // Increment the value by one

printf("Process %d incremented value to: %d\n", world_rank, value);

if (world_rank < world_size - 1) {

MPI_Send(&value, 1, MPI_INT, world_rank + 1, 0, MPI_COMM_WORLD);

} else {

// Last process sends it back to the root process

MPI_Send(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);

// Root process receives the value from the last process

if (world_rank == 0) {

MPI_Recv(&value, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

printf("Root process received ﬁnal value: %d\n", value);

MPI_Finalize();

return 0;

Output:

Process 1 incremented value to: 11

Root process starts with value: 10

Root process received ﬁnal value: 13

Process 3 incremented value to: 13

Process 2 incremented value to: 12

5) Write a MPI program to read N elements of an array in the master process. Let N processes including
master process check the array values are prime or not.

#include <stdio.h>

#include <stdlib.h>

#include <mpi.h>

// Function to check if a number is prime

int is_prime(int number) {

if (number <= 1) return 0;

for (int i = 2; i * i <= number; i++) {

if (number % i == 0) return 0;

return 1;

}
int main(int argc, char** argv) {

int world_rank, world_size, number_to_check;

int* array = NULL; // Initialize the pointer to NULL

MPI_Init(&argc, &argv);

MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

if (world_rank == 0) {

array = (int)malloc(sizeof(int) world_size); // Allocate memory only on the root process

for (int i = 0; i < world_size; i++) {

array[i] = i + 2; // Example values, start from 2 (ﬁrst prime number)

// Use MPI_Scatter to distribute the values

MPI_Scatter(array, 1, MPI_INT, &number_to_check, 1, MPI_INT, 0, MPI_COMM_WORLD);

// Each process checks if the number received is prime

int result = is_prime(number_to_check);

printf("Process %d received %d, prime? %s\n", world_rank, number_to_check, result ? "Yes" : "No");

// Free the allocated memory on the root process

if (world_rank == 0) {

free(array);

MPI_Finalize();

return 0;

Output:

Process 1 received 3, prime? Yes

Process 2 received 4, prime? No

Process 0 received 2, prime? Yes

Process 3 received 5, prime? Yes

Week 8 – 1 March 2024.
1. Write a program in CUDA to add two vectors of length N using

a) block size as N b) N threads

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA kernel to add elements of two arrays

global void add(int* x, int* y, int* z, int n) {

int index = threadIdx.x;

if (index < n)

z[index] = x[index] + y[index];

// Function to print the vector

void printVector(int* vector, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", vector[i]);

printf("\n");

int main() {

int N = 1024; // Example size, make sure it does not exceed your GPU's capability

int* x, * y, * z, * d_x, * d_y, * d_z;

// Allocate memory

x = (int*)malloc(N * sizeof(int));

y = (int*)malloc(N * sizeof(int));

z = (int*)malloc(N * sizeof(int));

cudaMalloc(&d_x, N * sizeof(int));

cudaMalloc(&d_y, N * sizeof(int));

cudaMalloc(&d_z, N * sizeof(int));

// Initialize arrays
for (int i = 0; i < N; i++) {

x[i] = 5;

y[i] = 2;

// Copy inputs to device

cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);

// Launch add() kernel on GPU

add << <1, N >> > (d_x, d_y, d_z, N);

// Copy result back to host

cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);

// Print the result

printf("Result: ");

printVector(z, N);

// Cleanup

cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);

free(x); free(y); free(z);

return 0;

Output:

Result: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>
#include <cstdlib>

global void add(int* x, int* y, int* z, int n) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index < n)

z[index] = x[index] + y[index];

void printVector(int* vector, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", vector[i]);

printf("\n");

int main() {

int N = 2048; // Example size

int* x, * y, * z, * d_x, * d_y, * d_z;

x = (int*)malloc(N * sizeof(int));

y = (int*)malloc(N * sizeof(int));

z = (int*)malloc(N * sizeof(int));

cudaMalloc(&d_x, N * sizeof(int));

cudaMalloc(&d_y, N * sizeof(int));

cudaMalloc(&d_z, N * sizeof(int));

for (int i = 0; i < N; i++) {

x[i] = 5;

y[i] = 2;

cudaMemcpy(d_x, x, N * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 256; // Choose appropriate block size

int numBlocks = (N + blockSize - 1) / blockSize;

add << <numBlocks, blockSize >> > (d_x, d_y, d_z, N);

cudaMemcpy(z, d_z, N * sizeof(int), cudaMemcpyDeviceToHost);

printf("Result: ");

printVector(z, N);

cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);

free(x); free(y); free(z);

return 0;

Output:

Results: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
77777777777777777777777777777777777777777777777777777777777

2) Implement a CUDA program to add two vectors of length N by keeping the number of threads per block
as 256 (constant) and vary the number of blocks to handle N elements.

#include <stdio.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA kernel to add elements of two arrays

global void add(int* a, int* b, int* c, int N) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

if (index < N)

c[index] = a[index] + b[index];

int main() {

int N = 1 << 20; // Example: Number of elements in each vector (1 Million elements)

int size = N * sizeof(int);

int threadsPerBlock = 256;

int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // Calculate needed blocks

// Allocate memory on the host

int* h_a, * h_b, * h_c;

h_a = (int*)malloc(size);

h_b = (int*)malloc(size);

h_c = (int*)malloc(size);

// Initialize vectors on the host

for (int i = 0; i < N; i++) {

h_a[i] = i;

h_b[i] = i;

// Allocate vectors in device memory

int* d_a, * d_b, * d_c;

cudaMalloc(&d_a, size);

cudaMalloc(&d_b, size);

cudaMalloc(&d_c, size);

// Copy vectors from host memory to device memory

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

// Invoke kernel

add << <blocksPerGrid, threadsPerBlock >> > (d_a, d_b, d_c, N);

// Copy result from device memory to host memory

cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

// Print the results: Print the ﬁrst 10 elements

printf("Result of Vector Addition (First 10 Elements):\n");

for (int i = 0; i < 10; i++) {

printf("h_c[%d] = %d\n", i, h_c[i]);

// Free device memory

cudaFree(d_a);

cudaFree(d_b);

cudaFree(d_c);

// Free host memory

free(h_a);

free(h_b);

free(h_c);

return 0;
}

Output:

Result of Vector Addition (First 10 Elements):

h_c[0] = 0

h_c[1] = 2

h_c[2] = 4

h_c[3] = 6

h_c[4] = 8

h_c[5] = 10

h_c[6] = 12

h_c[7] = 14

h_c[8] = 16

h_c[9] = 18

3) Write a program in CUDA which performs convolution operation on one dimensional input array N of size width
using a mask array M of size mask_width to produce the resultant one-dimensional array P of size width.

#include <stdio.h>

#include <cuda_runtime.h>

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// CUDA Kernel for one-dimensional convolution

global void convolution_1d(int* N, int* M, int* P, int width, int mask_width) {

int i = blockIdx.x * blockDim.x + threadIdx.x;

int k;

// Each thread computes one element of P

if (i < width) {

int pValue = 0;

// Convolution operation

for (k = 0; k < mask_width; ++k) {

int maskIndex = mask_width - 1 - k;

int nIndex = i - (mask_width / 2) + k;

if (nIndex >= 0 && nIndex < width) {

pValue += N[nIndex] * M[maskIndex];

}

P[i] = pValue;

// Function to print the array

void printArray(int* array, int size) {

for (int i = 0; i < size; i++) {

printf("%d ", array[i]);

printf("\n");

int main() {

int width = 10; // Example array size

int mask_width = 3; // Example mask size

int N[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; // Example input array

int M[3] = { 1, 0, -1 }; // Example mask array

int P[10]; // Resultant array

int* d_N, * d_M, * d_P;

// Allocate memory on the device

cudaMalloc(&d_N, width * sizeof(int));

cudaMalloc(&d_M, mask_width * sizeof(int));

cudaMalloc(&d_P, width * sizeof(int));

// Copy inputs to device

cudaMemcpy(d_N, N, width * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_M, M, mask_width * sizeof(int), cudaMemcpyHostToDevice);

// Launch kernel on the GPU

convolution_1d << <(width + 255) / 256, 256 >> > (d_N, d_M, d_P, width, mask_width);

// Copy result back to host

cudaMemcpy(P, d_P, width * sizeof(int), cudaMemcpyDeviceToHost);

// Print the resultant array

printf("Resultant array: ");

printArray(P, width);

// Free device memory

cudaFree(d_N);
cudaFree(d_M);

cudaFree(d_P);

return 0;

Output:

Resultant array: 2 2 2 2 2 2 2 2 2 -9

4) Write a program in CUDA to process a ID array containing angles in radians to generate sine of the
angles in the output array. Use appropriate function.

#include <stdio.h>

#include <cuda_runtime.h>

#include <math.h>

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

// Deﬁne M_PI if it's not deﬁned by the math library

#ifndef M_PI

#deﬁne M_PI 3.14159265358979323846

#endif

// CUDA Kernel to compute sine of angles in radians

global void compute_sine(ﬂoat* input, ﬂoat* output, int n) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index < n) {

output[index] = sinf(input[index]); // Use sinf for single precision ﬂoat

// Function to print the array

void printArray(ﬂoat* array, int size) {

for (int i = 0; i < size; i++) {

printf("%f ", array[i]);

printf("\n");
}

int main() {

int n = 5; // Example array size

ﬂoat inputAngles[] = { 0.0, M_PI / 6, M_PI / 4, M_PI / 2, M_PI }; // Example angles in radians

// Allocate memory on the device

ﬂoat* d_input, * d_output;

cudaMalloc((void**)&d_input, n * sizeof(ﬂoat));

cudaMalloc((void**)&d_output, n * sizeof(ﬂoat));

// Copy the input array from host to device

cudaMemcpy(d_input, inputAngles, n * sizeof(ﬂoat), cudaMemcpyHostToDevice);

// Calculate the number of blocks and threads per block

int threadsPerBlock = 256;

int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

// Allocate memory for the output array on the host

ﬂoat* output = new ﬂoat[n]; // Use dynamic allocation

// Launch the CUDA Kernel

compute_sine << <blocksPerGrid, threadsPerBlock >> > (d_input, d_output, n);

// Copy the result back to the host

cudaMemcpy(output, d_output, n * sizeof(ﬂoat), cudaMemcpyDeviceToHost);

// Print the resultant array

printf("Sine of angles: ");

printArray(output, n);

// Free device memory

cudaFree(d_input);

cudaFree(d_output);

// Free host memory

delete[] output;

return 0;

Output:

Sine of angles: 0.000000 0.500000 0.707107 1.000000 -0.000000

Week 9 – 8 March 2024.
1) Write a program in CUDA to count the number of times a given word is repeated in a sentence (Use
Atomic Function).

#include <stdio.h>

#include <cuda_runtime.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdlib>

#include <string.h>

#include <sm_20_atomic_functions.h>

__global__ void countWordOccurrences(char* sentence, char* word, int sentenceLength, int wordLength, int*
count) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

int localCount = 0;

// Each thread checks a part of the sentence for the word

for (int i = index; i <= sentenceLength - wordLength; i += stride) {

bool wordFound = true;

for (int j = 0; j < wordLength; j++) {

if (sentence[i + j] != word[j]) {

wordFound = false;

break;

if (wordFound) {

localCount++;

// Use atomicAdd to safely add the local count to the global total

atomicAdd(count, localCount);

int main() {

char* sentence = "hello hello world world world world"; // Example sentence
char* word = "world"; // Word to count in the sentence

int* count;

char* d_sentence, * d_word;

int* d_count;

int sentenceLength = strlen(sentence);

int wordLength = strlen(word);

// Allocate memory for device copies of sentence, word, count

cudaMalloc((void**)&d_sentence, sentenceLength);

cudaMalloc((void**)&d_word, wordLength);

cudaMalloc((void**)&d_count, sizeof(int));

// Copy inputs to device

cudaMemcpy(d_sentence, sentence, sentenceLength, cudaMemcpyHostToDevice);

cudaMemcpy(d_word, word, wordLength, cudaMemcpyHostToDevice);

cudaMemset(d_count, 0, sizeof(int));

// Launch countWordOccurrences() kernel on GPU with enough blocks and threads

countWordOccurrences << <1, 256 >> > (d_sentence, d_word, sentenceLength, wordLength, d_count);

// Copy result back to host

cudaMemcpy(count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

printf("The word '%s' appears %d times in the sentence.\n", word, *count);

// Cleanup

cudaFree(d_sentence); cudaFree(d_word); cudaFree(d_count);

return 0;

Output:

The word ‘world’ appears 4 times in the sentence.

2) Write a CUDA program that reads a string S and produces the string RS as follows:

Input string S: PCAP Output string RS: PCAPPCAPCP Note: Each work item copies required number of
characters from S in RS.

#include <stdio.h>

#include <cuda_runtime.h>
__global__ void generateString(char* inputString, char* outputString, int length) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

for (int i = index; i < length; i += stride) {

outputString[i] = inputString[i % 4]; // Modulo operation to repeat the string

int main() {

char inputString[] = "PCAP"; // Example input string

int length = 12; // Length of the output string (4 times the length of inputString)

char* d_inputString, * d_outputString;

// Allocate memory for device copies of inputString and outputString

cudaMalloc((void**)&d_inputString, strlen(inputString));

cudaMalloc((void**)&d_outputString, length);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, strlen(inputString), cudaMemcpyHostToDevice);

// Launch generateString kernel on GPU with enough blocks and threads

generateString << <1, 256 >> > (d_inputString, d_outputString, length);

// Copy result back to host

char* outputString = (char*)malloc(length);

cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);

printf("Input string S: %s\n", inputString);

printf("Output string RS: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string S: PCAP

Output string RS: PCAPPCAPPCAP

3) Write a CUDA program which reads a string consisting of N words and reverse each word of it in parallel.

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

__global__ void reverseWords(char* inputString, char* outputString, int* wordLengths, int numWords) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Iterate over each word

for (int i = index; i < numWords; i += stride) {

int wordStart = (i == 0) ? 0 : wordLengths[i - 1]; // Start index of the word

int wordEnd = wordLengths[i]; // End index of the word

// Reverse the word

for (int j = wordStart; j < (wordStart + wordEnd) / 2; j++) {

char temp = inputString[j];

outputString[j] = inputString[wordStart + wordEnd - j - 1];

outputString[wordStart + wordEnd - j - 1] = temp;

int main() {

char inputString[] = "Hello World CUDA Program"; // Example input string

int length = strlen(inputString);

int numWords = 0;

// Count the number of words

for (int i = 0; i < length; i++) {

if (inputString[i] == ' ') {

numWords++;

numWords++; // Increment for the last word

// Allocate memory for device copies of inputString, outputString, and wordLengths

char* d_inputString, * d_outputString;

int* d_wordLengths;

cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, length);

cudaMalloc((void**)&d_wordLengths, numWords * sizeof(int));

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Calculate word lengths

int wordStartIndex = 0;

int* wordLengths = (int)malloc(numWords sizeof(int));

for (int i = 0; i < length; i++) {

if (inputString[i] == ' ') {

wordLengths[numWords - 1] = i - wordStartIndex; // Store length of each word

wordStartIndex = i + 1; // Move to the start of next word

numWords--;

wordLegths[numWords - 1] = length - wordStartIndex; // Length of the last word

// Copy wordLengths to device

cudaMemcpy(d_wordLengths, wordLengths, numWords * sizeof(int), cudaMemcpyHostToDevice);

// Launch reverseWords kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (length + blockSize - 1) / blockSize;

reverseWords << <numBlocks, blockSize >> > (d_inputString, d_outputString, d_wordLengths, numWords);

// Copy result back to host

char* outputString = (char*)malloc(length);

cudaMemcpy(outputString, d_outputString, length, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

cudaFree(d_wordLengths);

free(wordLengths);

free(outputString);

return 0;

}
Output:

Input string: Hello World

Output string: olleH dlroW

4) Write a CUDA program that takes a string Sin as input and one integer value N and produces an output
string, Sout, in parallel by concatenating input string Sin, N times as shown below.

INPUT : Sin ="Hello" N =3 OUTPUT : Sout = "HelloHelloHello" Note: Every thread copies the same
character from the Input string S, N times to the required position.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void concatenateString(char* inputString, int inputLength, char* outputString, int repetitions) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Copy characters from input string to output string

for (int i = index; i < inputLength * repetitions; i += stride) {

outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly

int main() {

char inputString[] = "Hello"; // Example input string

int length = strlen(inputString);

int repetitions = 3; // Number of times input string should be repeated

// Allocate memory for device copies of inputString and outputString

char* d_inputString, * d_outputString;

int outputLength = length * repetitions; // Length of the output string

cudaMalloc((void**)&d_inputString, length);

cudaMalloc((void**)&d_outputString, outputLength);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Launch concatenateString kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (outputLength + blockSize - 1) / blockSize;

concatenateString <<<numBlocks, blockSize>>> (d_inputString, length, d_outputString, repetitions);

// Copy result back to host

char* outputString = (char*)malloc(outputLength);

cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string: Hello

Output string: HelloHelloHello

5) Write a CUDA program which reads a string Sin and produces an output string T as shownbelow.

Input: Sin: "Hai" Ouput: T: "Haaiii" Note:Every thread stores a character from input string Sin, required
number of times intooutput string T.

#include <stdio.h>

#include <cuda_runtime.h>

__global__ void repeatCharacters(char* inputString, int inputLength, char* outputString, int repetitions) {

int index = threadIdx.x + blockIdx.x * blockDim.x;

int stride = blockDim.x * gridDim.x;

// Copy characters from input string to output string

for (int i = index; i < inputLength * repetitions; i += stride) {

outputString[i] = inputString[i % inputLength]; // Copy characters from input string repeatedly

// Repeat the last character multiple times

if (index == 0) {

char lastChar = inputString[inputLength - 1];

for (int i = inputLength * repetitions - 1; i >= inputLength * (repetitions - 1); --i) {

outputString[i] = lastChar;

}
int main() {

char inputString[] = "Hai"; // Example input string

int length = strlen(inputString);

int repetitions = 3; // Number of times each character should be repeated, except for the last character

// Allocate memory for device copies of inputString and outputString

char* d_inputString, * d_outputString;

int outputLength = length * repetitions + 2; // Length of the output string

cudaMalloc((void**)&d_inputString, length);

cudaMalloc((void**)&d_outputString, outputLength);

// Copy inputString to device

cudaMemcpy(d_inputString, inputString, length, cudaMemcpyHostToDevice);

// Launch repeatCharacters kernel on GPU with enough blocks and threads

int blockSize = 256;

int numBlocks = (outputLength + blockSize - 1) / blockSize;

repeatCharacters<<<numBlocks, blockSize>>>(d_inputString, length, d_outputString, repetitions);

// Copy result back to host

char* outputString = (char*)malloc(outputLength);

cudaMemcpy(outputString, d_outputString, outputLength, cudaMemcpyDeviceToHost);

printf("Input string: %s\n", inputString);

printf("Output string: %s\n", outputString);

// Cleanup

cudaFree(d_inputString);

cudaFree(d_outputString);

free(outputString);

return 0;

Output:

Input string: Hello

Output string: Heelllllllooooo

Week 10 – 26 March 2024.
1) Write a program in CUDA to add two matrices for the following speciﬁcations:

• Each row of resultant matrix to be computed by one thread.

• Each column of resultant matrix to be computed by one thread.

• Each element of resultant matrix to be computed by one thread.

#include <stdio.h>

#include <cuda_runtime.h>

#deﬁne N 3 // Matrix size

// Kernel to add two matrices where each row of the resultant matrix is computed by one thread

global void addMatrixRows(float* A, float* B, float* C) {

int row = blockIdx.x; // Each block computes one row

int col = threadIdx.x; // Each thread computes one element of the row

C[row * N + col] = A[row * N + col] + B[row * N + col];

// Kernel to add two matrices where each column of the resultant matrix is computed by one thread

global void addMatrixColumns(float* A, float* B, float* C) {

int row = threadIdx.x; // Each thread computes one element of the column

int col = blockIdx.x; // Each block computes one column

C[row * N + col] = A[row * N + col] + B[row * N + col];

// Kernel to add two matrices where each element of the resultant matrix is computed by one thread

global void addMatrixElements(float* A, float* B, float* C) {

int index = threadIdx.x + blockIdx.x * blockDim.x; // Each thread computes one element of the matrix

if (index < N * N) {

C[index] = A[index] + B[index];

// Helper function to initialize matrices with random values

void initializeMatrix(ﬂoat* matrix) {

for (int i = 0; i < N * N; i++) {

matrix[i] = (ﬂoat)rand() / RAND_MAX; // Random value between 0 and 1

}
// Helper function to print matrix

void printMatrix(ﬂoat* matrix) {

for (int i = 0; i < N; i++) {

for (int j = 0; j < N; j++) {

printf("%.2f\t", matrix[i * N + j]);

printf("\n");

int main() {

// Allocate memory for host matrices

ﬂoat h_A, h_B, *h_C;

h_A = (ﬂoat)malloc(N N * sizeof(ﬂoat));

h_B = (ﬂoat)malloc(N N * sizeof(ﬂoat));

h_C = (ﬂoat)malloc(N N * sizeof(ﬂoat));

// Initialize host matrices with random values

initializeMatrix(h_A);

initializeMatrix(h_B);

// Allocate memory for device matrices

ﬂoat d_A, d_B, *d_C;

cudaMalloc((void**)&d_A, N * N * sizeof(ﬂoat));

cudaMalloc((void**)&d_B, N * N * sizeof(ﬂoat));

cudaMalloc((void**)&d_C, N * N * sizeof(ﬂoat));

// Copy host matrices to device

cudaMemcpy(d_A, h_A, N * N * sizeof(ﬂoat), cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, N * N * sizeof(ﬂoat), cudaMemcpyHostToDevice);

// Launch kernel to add matrices by rows

addMatrixRows<<<N, N>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(ﬂoat), cudaMemcpyDeviceToHost);

// Print result matrix

printf("Matrix C (Sum of A and B by rows):\n");

printMatrix(h_C);

// Launch kernel to add matrices by columns

addMatrixColumns<<<N, N>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(ﬂoat), cudaMemcpyDeviceToHost);

// Print result matrix

printf("\nMatrix C (Sum of A and B by columns):\n");

printMatrix(h_C);

// Launch kernel to add matrices element-wise

addMatrixElements<<<(N * N + 255) / 256, 256>>>(d_A, d_B, d_C);

// Copy result matrix from device to host

cudaMemcpy(h_C, d_C, N * N * sizeof(ﬂoat), cudaMemcpyDeviceToHost);

// Print result matrix

printf("\nMatrix C (Sum of A and B element-wise):\n");

printMatrix(h_C);

// Free device and host memory

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

free(h_A);

free(h_B);

free(h_C);

return 0;

Output:

Matrix A:

1.00 2.00 3.00

4.00 5.00 6.00

7.00 8.00 9.00

Matrix B:

0.34 -0.67 -2.38

-3.33 -3.66 -4.60

-6.08 -7.32 -7.47

Matrix C (Sum of A and B by rows):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

Matrix C (Sum of A and B by columns):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

Matrix C (Sum of A and B element-wise):

1.34 1.33 0.62

0.67 1.34 1.40

0.92 1.68 1.53

2) Write a program in CUDA to multiply two matrices for the following speciﬁcations:

• Each row of resultant matrix to be computed by one thread.

• Each column of resultant matrix to be computed by one thread.

• Each element of resultant matrix to be computed by one thread.

#include <stdio.h>

#include <cuda_runtime.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Kernel function to multiply two matrices

__global__ void matrixMultiplication(int *a, int *b, int *c, int width) {

int row = blockIdx.y * blockDim.y + threadIdx.y; // Row index

int col = blockIdx.x * blockDim.x + threadIdx.x; // Column index

// Check if within matrix bounds

if (row < width && col < width) {

int sum = 0;

for (int i = 0; i < width; i++) {

sum += a[row * width + i] * b[i * width + col];

c[row * width + col] = sum;

}

int main() {

int width = 4; // Width of matrices

int size = width * width * sizeof(int); // Size of matrices in bytes

// Host matrices and result matrix

int h_a, h_b, *h_c;

h_a = (int*)malloc(size);

h_b = (int*)malloc(size);

h_c = (int*)malloc(size);

// Initialize host matrices with sample data

for (int i = 0; i < width * width; i++) {

h_a[i] = i;

h_b[i] = i * 2;

// Device matrices

int d_a, d_b, *d_c;

cudaMalloc(&d_a, size);

cudaMalloc(&d_b, size);

cudaMalloc(&d_c, size);

// Copy host matrices to device

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);

cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

// Deﬁne kernel launch conﬁguration

dim3 threadsPerBlock(2, 2);

dim3 numBlocks(width / threadsPerBlock.x, width / threadsPerBlock.y);

// Launch kernel

matrixMultiplication<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, width);

// Copy result matrix from device to host

cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

// Print result matrix

printf("Result Matrix:\n");

for (int i = 0; i < width; i++) {

for (int j = 0; j < width; j++) {

printf("%d ", h_c[i * width + j]);

printf("\n");

// Free device memory

cudaFree(d_a);

cudaFree(d_b);

cudaFree(d_c);

// Free host memory

free(h_a);

free(h_b);

free(h_c);

return 0;

Output:

Matrix A:

0123

4567

8 9 10 11

12 13 14 15

Matrix B:

0246

8 10 12 14

16 18 20 22

24 26 28 30

Result Matrix:

28 34 40 46

76 98 120 142

124 162 200 238

172 226 280 334

3) Write a CUDA program to perform linear algebra function of the form y=(alpha)x+y, where x and y are
vectors and "alpha" ia scalar value.

#include <stdio.h>

#include <cuda_runtime.h>

global void vectorAdd(float x, float y, float alpha, int N) {

int idx = threadIdx.x + blockDim.x * blockIdx.x;

if (idx < N) {

y[idx] = alpha * x[idx] + y[idx];

int main() {

const int N = 5;

ﬂoat x[N] = {1, 2, 3, 4, 5};

ﬂoat y[N] = {6, 7, 8, 9, 10};

ﬂoat alpha = 2;

ﬂoat d_x, d_y;

cudaMalloc(&d_x, N * sizeof(ﬂoat));

cudaMalloc(&d_y, N * sizeof(ﬂoat));

cudaMemcpy(d_x, x, N * sizeof(ﬂoat), cudaMemcpyHostToDevice);

cudaMemcpy(d_y, y, N * sizeof(ﬂoat), cudaMemcpyHostToDevice);

int blockSize = 256;

int numBlocks = (N + blockSize - 1) / blockSize;

vectorAdd<<<numBlocks, blockSize>>>(d_x, d_y, alpha, N);

cudaMemcpy(y, d_y, N * sizeof(ﬂoat), cudaMemcpyDeviceToHost);

cudaFree(d_x);

cudaFree(d_y);

printf("Resulting vector y:\n");

for (int i = 0; i < N; ++i) {

printf("%f ", y[i]);

printf("\n");

return 0;

}
Output:

Input vector x:

12345

Input vector y:

6 7 8 9 10

Scalar alpha: 2

Resulting vector y:

8. 11 14 17 20

4) Write a CUDA program to sort every row of a matrix using selection sort.

#include <stdio.h>

#include <cuda_runtime.h>

device void selectionSort(int *row, int size) {

for (int i = 0; i < size - 1; ++i) {

int minIndex = i;

for (int j = i + 1; j < size; ++j) {

if (row[j] < row[minIndex]) {

minIndex = j;

int temp = row[i];

row[i] = row[minIndex];

row[minIndex] = temp;

global void sortRows(int *matrix, int rows, int cols) {

int row = blockIdx.x * blockDim.x + threadIdx.x;

if (row < rows) {

int rowPtr = matrix + row cols;

selectionSort(rowPtr, cols);

int main() {
int rows = 3;

int cols = 4;

int matrix[3][4] = {{9, 7, 5, 8},

{4, 6, 2, 1},

{3, 0, 2, 5}};

int *d_matrix;

cudaMalloc((void**)&d_matrix, rows * cols * sizeof(int));

cudaMemcpy(d_matrix, matrix, rows * cols * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 4; // Number of threads per block

int numBlocks = (rows + blockSize - 1) / blockSize;

sortRows<<<numBlocks, blockSize>>>(d_matrix, rows, cols);

cudaMemcpy(matrix, d_matrix, rows * cols * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(d_matrix);

printf("Sorted Matrix:\n");

for (int i = 0; i < rows; ++i) {

for (int j = 0; j < cols; ++j) {

printf("%d ", matrix[i][j]);

printf("\n");

return 0;

Output:

Input matrix:

97351

64820

59247

Sorted matrix (each row):

13579

02468

24579
5) Write a CUDA program to perform odd even transposition sort in parallel.

#include <stdio.h>

#include <cuda_runtime.h>

global void oddEvenSort(int *arr, int n) {

int tid = threadIdx.x;

int ovset = blockDim.x;

for (int i = 0; i < n; ++i) {

int partner_tid = tid ^ 1;

int temp;

if ((i + tid) % 2 == 0 && partner_tid < n) {

if (arr[tid] > arr[partner_tid]) {

temp = arr[tid];

arr[tid] = arr[partner_tid];

arr[partner_tid] = temp; } }

__syncthreads(); }

int main() {

int n = 10, arr[] = {9, 4, 6, 2, 8, 5, 7, 1, 3, 0};

int *d_arr;

cudaMalloc((void**)&d_arr, n * sizeof(int));

cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 10; // Number of threads per block

oddEvenSort<<<1, blockSize>>>(d_arr, n);

cudaMemcpy(arr, d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(d_arr);

printf("Sorted Array:\n");

for (int i = 0; i < n; ++i)

printf("%d ", arr[i]);

return 0;

Output:

Sorted Array:

0123456789

Sybex - Maya - Secrets of The Pros - 2003 (PDF)
No ratings yet
Sybex - Maya - Secrets of The Pros - 2003 (PDF)
384 pages
Course Plan - Linux Lab
No ratings yet
Course Plan - Linux Lab
12 pages
670 Series 2.0 ANSI: DNP3 Communication Protocol Manual
No ratings yet
670 Series 2.0 ANSI: DNP3 Communication Protocol Manual
74 pages
Realtime Festival Overview
No ratings yet
Realtime Festival Overview
28 pages
3M Versaflo Respirator Systems Are Easy To Select: Modular Means Versatile
No ratings yet
3M Versaflo Respirator Systems Are Easy To Select: Modular Means Versatile
2 pages
Bus Times
No ratings yet
Bus Times
2 pages
InDesign 100 Real Shortcuts Hinglish
No ratings yet
InDesign 100 Real Shortcuts Hinglish
3 pages
Solution of Project
No ratings yet
Solution of Project
5 pages
HV Assignment 1 (Section 2a) (1 47)
No ratings yet
HV Assignment 1 (Section 2a) (1 47)
5 pages
Kunuba Prliminary Pages 8
No ratings yet
Kunuba Prliminary Pages 8
9 pages
Oracle Cloud Enterprise Resource Planning
No ratings yet
Oracle Cloud Enterprise Resource Planning
8 pages
Gree Vireo Gen3 Submittal 9mbh 230v A
No ratings yet
Gree Vireo Gen3 Submittal 9mbh 230v A
6 pages
CP4292 Mcap
No ratings yet
CP4292 Mcap
15 pages
MECH0023 Week 01 Notes
No ratings yet
MECH0023 Week 01 Notes
24 pages
9852 2340 01b Manual Cement Unit Boltec M & L RCS 4.5
No ratings yet
9852 2340 01b Manual Cement Unit Boltec M & L RCS 4.5
56 pages
Using MPI With Fortran - Research Computing University of Colorado Boulder Documentation
No ratings yet
Using MPI With Fortran - Research Computing University of Colorado Boulder Documentation
8 pages
Key Concepts in MPI Programming: Processes
No ratings yet
Key Concepts in MPI Programming: Processes
6 pages
Vaixell Teseu
No ratings yet
Vaixell Teseu
5 pages
Maths Links 8c Homework Book Answers
100% (1)
Maths Links 8c Homework Book Answers
4 pages
Program 1: To Display Message
No ratings yet
Program 1: To Display Message
6 pages
Math Homework Tic Tac Toe
100% (1)
Math Homework Tic Tac Toe
8 pages
What Is The Message Passing Interface (MPI) ?: Standardization
No ratings yet
What Is The Message Passing Interface (MPI) ?: Standardization
5 pages
Hoeganaes Corporation
No ratings yet
Hoeganaes Corporation
11 pages
Casio G-Shock Watch - URBANHUG
No ratings yet
Casio G-Shock Watch - URBANHUG
1 page
Untitled Document
No ratings yet
Untitled Document
23 pages
Mpi 1
No ratings yet
Mpi 1
20 pages
Twelve Tips For Enhancing Anatomy Teaching and Learning Using Radiology
No ratings yet
Twelve Tips For Enhancing Anatomy Teaching and Learning Using Radiology
5 pages
Exercise - 4
No ratings yet
Exercise - 4
8 pages
Exercise - 4
No ratings yet
Exercise - 4
8 pages
MPI Programming Lab
No ratings yet
MPI Programming Lab
3 pages
Roller Chains Catalogue en Kettenwulf
No ratings yet
Roller Chains Catalogue en Kettenwulf
146 pages
Ultratech Cement: Particulars Test Results Requirements of
100% (1)
Ultratech Cement: Particulars Test Results Requirements of
1 page
PDC Lab 8
No ratings yet
PDC Lab 8
7 pages
Mpi Codes
No ratings yet
Mpi Codes
18 pages
High School Students' Perceptions of Motivations For Cyberbullying An Exploratory Study
No ratings yet
High School Students' Perceptions of Motivations For Cyberbullying An Exploratory Study
6 pages
Cluster Lab Session 04 (Examples)
No ratings yet
Cluster Lab Session 04 (Examples)
3 pages
C Language Programming Codes
From Everand
C Language Programming Codes
Durgesh
No ratings yet
2 Mpi
No ratings yet
2 Mpi
13 pages
Article
No ratings yet
Article
16 pages
03 MPIProgramStructure
No ratings yet
03 MPIProgramStructure
42 pages
Department of Civil Engineering (Bbit B.Tech Wing) A.Y. 2019-2020 Even Semester Faculty Database For Online Internal Exam in May 2020
No ratings yet
Department of Civil Engineering (Bbit B.Tech Wing) A.Y. 2019-2020 Even Semester Faculty Database For Online Internal Exam in May 2020
1 page
Manual - Pdms Support Design
No ratings yet
Manual - Pdms Support Design
84 pages
Mpi Assignment
No ratings yet
Mpi Assignment
7 pages
‎⁨تقرير⁩
No ratings yet
‎⁨تقرير⁩
16 pages
MPP Exercises
No ratings yet
MPP Exercises
8 pages
Prime
No ratings yet
Prime
2 pages
Intro MPI
No ratings yet
Intro MPI
60 pages
Week 6 10
No ratings yet
Week 6 10
43 pages
Subash Kumar Rawat - 230155 - WPS02
No ratings yet
Subash Kumar Rawat - 230155 - WPS02
12 pages
Introduction To C MPI PM
No ratings yet
Introduction To C MPI PM
50 pages
Specifications Guide Electric Range EN
No ratings yet
Specifications Guide Electric Range EN
2 pages
DB Lab Manual
No ratings yet
DB Lab Manual
13 pages
Introduction MPI - Chap2 - Slide 3
No ratings yet
Introduction MPI - Chap2 - Slide 3
16 pages
Problemes MPI
No ratings yet
Problemes MPI
4 pages
In3200 Chap09
No ratings yet
In3200 Chap09
56 pages
Chapter 8-Vector Control of Induction Motors PDF
No ratings yet
Chapter 8-Vector Control of Induction Motors PDF
18 pages
MPI Pacheco Ch3
No ratings yet
MPI Pacheco Ch3
124 pages
Mpi 1
No ratings yet
Mpi 1
38 pages
Solutions Midterm 1 March 72020
No ratings yet
Solutions Midterm 1 March 72020
7 pages
Final PDC Exam
No ratings yet
Final PDC Exam
10 pages
ECE 1747H: Parallel Programming: Message Passing (MPI)
No ratings yet
ECE 1747H: Parallel Programming: Message Passing (MPI)
67 pages
Lecture07 MPI by Example
No ratings yet
Lecture07 MPI by Example
27 pages
1.hello World Programme in Mpi
No ratings yet
1.hello World Programme in Mpi
11 pages
Kenwood TRC 80 User Manual PDF
No ratings yet
Kenwood TRC 80 User Manual PDF
33 pages
CSE4001 Parallel and Distributed Computing: Lab Assignment 6
No ratings yet
CSE4001 Parallel and Distributed Computing: Lab Assignment 6
8 pages
Pcap Cse 3263 Lab Manual 2023
No ratings yet
Pcap Cse 3263 Lab Manual 2023
70 pages
Mpi Basic Operations
No ratings yet
Mpi Basic Operations
6 pages
Code: First Method:: (1) Write A C Program Using Open MP To Estimate The Value of PI (Use Minimum Two Methods)
No ratings yet
Code: First Method:: (1) Write A C Program Using Open MP To Estimate The Value of PI (Use Minimum Two Methods)
8 pages
PDC 5 PDF
No ratings yet
PDC 5 PDF
7 pages
Name:Bhumika Verma REG NO:19BCE1418 COURSE:CSE4001 (L49+50) Faculty:Dr.R.Gayathri
No ratings yet
Name:Bhumika Verma REG NO:19BCE1418 COURSE:CSE4001 (L49+50) Faculty:Dr.R.Gayathri
3 pages
Parallel and Distributed Computing Lab Digital Assignment - 5
No ratings yet
Parallel and Distributed Computing Lab Digital Assignment - 5
7 pages
ICT in Education
No ratings yet
ICT in Education
26 pages
Lab 7 PP
No ratings yet
Lab 7 PP
2 pages
PDCLabMan Updated
No ratings yet
PDCLabMan Updated
46 pages
Message Passing Interface (MPI) : Steve Lantz Center For Advanced Computing Cornell University
No ratings yet
Message Passing Interface (MPI) : Steve Lantz Center For Advanced Computing Cornell University
53 pages
MPI Plamen Krastev
No ratings yet
MPI Plamen Krastev
49 pages
#Include #Include Int Main (Int Argc, Char Argv ) (MPI - Init (&argc,&argv) Printf ("Hello World/n") MPI - Finalize Return 0 )
No ratings yet
#Include #Include Int Main (Int Argc, Char Argv ) (MPI - Init (&argc,&argv) Printf ("Hello World/n") MPI - Finalize Return 0 )
6 pages
Computer Structures - MPI
No ratings yet
Computer Structures - MPI
16 pages
SERC IntroMPI 2019-09-14 v0
No ratings yet
SERC IntroMPI 2019-09-14 v0
43 pages
Structure of A MPI Program
No ratings yet
Structure of A MPI Program
26 pages
Case Study 1
No ratings yet
Case Study 1
12 pages
10 MPI Programmes
No ratings yet
10 MPI Programmes
26 pages
Sunil Kumar L 24
No ratings yet
Sunil Kumar L 24
21 pages
Parallel & Distributed Computing: MPI - Message Passing Interface
No ratings yet
Parallel & Distributed Computing: MPI - Message Passing Interface
49 pages
Distributed Memory Programming With MPI: Peter Pacheco
No ratings yet
Distributed Memory Programming With MPI: Peter Pacheco
121 pages
Intro To MPI: Hpc-Support@duke - Edu
No ratings yet
Intro To MPI: Hpc-Support@duke - Edu
56 pages
Mpi
No ratings yet
Mpi
30 pages
The Message Passing Interface (MPI)
No ratings yet
The Message Passing Interface (MPI)
18 pages
ECE 1747H: Parallel Programming: Message Passing (MPI)
No ratings yet
ECE 1747H: Parallel Programming: Message Passing (MPI)
67 pages
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Week 6 10

Uploaded by

Week 6 10

Uploaded by

Aditya Raj

int main(int argc, char* argv[])

int world_size, world_rank;

// Assuming x is a constant value

double result = pow(x, world_rank);

printf("Process %d: pow(%d, %d) = %f\n", world_rank, x, world_rank, result);

Process 2: pow(40, 2) = 1600.000000

Process 1: pow(40, 1) = 40.000000

Process 0: pow(40, 0) = 1.000000

Process 4: pow(40, 4) = 2560000.000000

Process 3: pow(40, 3) = 64000.000000

int main(int argc, char* argv[]) {

int world_size, world_rank;

printf(" EVEN Hello from process %d\n", world_rank);

printf("ODD World from process %d\n", world_rank);

Odd World from process 13

Even Hello from process 2

Odd World from process 9

Odd World from process 1

Even Hello from process 8

Even Hello from process 12

Odd World from process 5

Odd World from process 3

Odd World from process 11

Even Hello from process 6

Even Hello from process 0

Even Hello from process 4

Odd World from process 7

Even Hello from process 10

int main(int argc, char* argv[]) {

char operations[] = { '+', '-', '*', '/' };

char operation = operations[world_rank % 4];

double operand1 = 10.0, operand2 = 2.0;

double result = 0.0;

result = operand1 + operand2;

result = operand1 - operand2;

result = operand1 * operand2;

result = operand1 / operand2;

fprintf(stderr, "Error: Division by zero.\n");

fprintf(stderr, "Error: Unknown operation.\n");

// Allocate memory dynamically for gathered_results

double* gathered_results = (double*)malloc(world_size * sizeof(double));

MPI_Gather(&result, 1, MPI_DOUBLE, gathered_results, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

for (int i = 0; i < world_size; ++i) {

// Free dynamically allocated memory

Process 0: 10.00 + 2.00 = 12.00

Process 1: 10.00 - 2.00 = 8.00

Process 2: 10.00 * 2.00 = 20.00

Process 3: 10.00 / 2.00 = 5.00

Process 4: 10.00 + 2.00 = 12.00

Process 5: 10.00 - 2.00 = 8.00

Process 6: 10.00 * 2.00 = 20.00

Process 7: 10.00 / 2.00 = 5.00

Process 8: 10.00 + 2.00 = 12.00

Process 9: 10.00 - 2.00 = 8.00

Process 10: 10.00 * 2.00 = 20.00

Process 11: 10.00 / 2.00 = 5.00

Process 12: 10.00 + 2.00 = 12.00

Process 13: 10.00 - 2.00 = 8.00

#deﬁne MAX_STRING_SIZE 100

int main(int argc, char *argv[]) {

int rank, size;

// Determine the character index to toggle based on process rank

int char_index = rank % str_len;

// Toggle the character

if (rank < str_len) {

if (str[char_index] >= 'a' && str[char_index] <= 'z') {

str[char_index] = str[char_index] - 32; // Convert to uppercase

} else if (str[char_index] >= 'A' && str[char_index] <= 'Z') {

str[char_index] = str[char_index] + 32; // Convert to lowercase

// Gather all modiﬁed strings to process 0

MPI_Gather(rank < str_len ? &str[char_index] : NULL, 1, MPI_CHAR,

str, 1, MPI_CHAR, 0, MPI_COMM_WORLD);

// Print the result in process 0

printf("Original String: HeLLO\n");

printf("Modiﬁed String: %s\n", str);

double* gathered_results = (double)malloc(world_size sizeof(double));

numbers = (int)malloc(sizeof(int) world_size);