Week 6 10
Week 6 10
210968136
Batch 4
Parallel Programming Lab
04 April 2024.
Week 6 – 16 February 2024.
1) Write a simple MPI program to find out pow (x, rank) for all the processes where 'x' is the integer
constant, and 'rank' is the rank of the process.
#include <stdio.h>
#include <math.h>
#include <mpi.h>
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int x = 40;
MPI_Finalize();
return 0;
Output:
2) Write a program in MPI where each even-ranked process prints "Hello" and each odd-ranked process prints "World".
#include <stdio.h>
#include <mpi.h>
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if (world_rank % 2 == 0) {
else {
MPI_Finalize();
return 0;
Output:
3) Write a program in MPI to simulate a simple calculator. Perform each operation using a different process in
parallel.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
MPI_Init(&argc, &argv);
int world_size, world_rank;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
switch (operation) {
case '+':
break;
case '-':
break;
case '*':
break;
case '/':
if (operand2 != 0) {
else {
MPI_Abort(MPI_COMM_WORLD, 1);
break;
default:
MPI_Abort(MPI_COMM_WORLD, 1);
if (world_rank == 0) {
printf("Results:\n");
printf("Process %d: %.2f %c %.2f = %.2f\n", i, operand1, operations[i % 4], operand2, gathered_results[i]);
free(gathered_results);
MPI_Finalize();
return 0;
Output:
4) Write a program in MPI to toggle the character of a given string indexed by the rank of the process.
#include <stdio.h>
#include <string.h>
#include <mpi.h>
int str_len;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
str_len = strlen(str);
if (rank == 0) {
MPI_Finalize();
return 0;
Output:
Input array : 18, 523, 301, 1234, 2, 14, 108, 150, 1928 Output array: 81, 325, 103, 4321, 2, 41, 801, 51, 8291
#include <stdio.h>
#include <mpi.h>
#define ARRAY_SIZE 9
int reversed = 0;
num /= 10;
return reversed;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (size != ARRAY_SIZE) {
if (rank == 0) {
MPI_Finalize();
return 1;
int inputArray[ARRAY_SIZE] = { 18, 523, 301, 1234, 2, 14, 108, 150, 1928 };
int outputArray[ARRAY_SIZE];
int gatheredArray[ARRAY_SIZE];
outputArray[rank] = reverseDigits(outputArray[rank]);
// Gather the reversed array using a separate buffer
if (rank == 0) {
printf("\n");
MPI_Finalize();
return 0;
Output:
6) Write a MPI program to find the prime numbers between 1 and 100 using 2 processes.
#include <stdio.h>
#include <stdbool.h>
#include <mpi.h>
#define RANGE_START 1
if (num < 2) {
return false;
return false;
return true;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (size != 2) {
if (rank == 0) {
MPI_Finalize();
return 1;
int primesInRange = 0;
if (rank == 0) {
// Process 0 will check for primes in the first half of the range
start = RANGE_START;
end = RANGE_END / 2;
else {
// Process 1 will check for primes in the second half of the range
start = RANGE_END / 2 + 1;
end = RANGE_END;
if (isPrime(num)) {
primesInRange++;
// Process 0 receives the count of primes from Process 1 and adds them
if (rank == 0) {
int receivedPrimes;
primesInRange += receivedPrimes;
else {
MPI_Finalize();
return 0;
Output:
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <mpi.h>
if (isupper(word[i]))
word[i] = tolower(word[i]);
else if (islower(word[i]))
word[i] = toupper(word[i]);
MPI_Init(&argc, &argv);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == root) {
// Sender process
char received_word[100];
} else if (rank == 1) {
// Receiver process
char word[100];
MPI_Recv(word, 100, MPI_CHAR, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
toggle_case(word);
MPI_Finalize();
return 0;
Output:
2) Write an MPI program where the master process (process 0) sends a number to each of the slaves, and
the slave processes receive the number and print it. Use standard send.
#include <mpi.h>
#include <stdio.h>
MPI_Init(&argc, &argv);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int number;
if (world_rank == 0) {
// Master process
number = 777; // You can change this number to any number you want to send
} else {
// Slave processes
MPI_Finalize();
return 0;
Output:
3) Write an MPI program to read N elements of the array in the root process (process 0), where N is equal to the total
number of processes. The root process sends one value to each of the slaves. Let each even-ranked process find the square of
the received element and each odd-ranked process find the cube of the received element. Use buffered send.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
MPI_Init(&argc, &argv);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
// The root process (process 0) will read N elements where N is equal to world_size
MPI_Buffer_attach(buffer, buffer_size);
if (world_rank == 0) {
int received_number;
if (world_rank != 0) {
if (world_rank % 2 == 0) {
} else {
MPI_Buffer_detach(&buffer, &buffer_size);
free(buffer);
free(numbers);
MPI_Finalize();
return 0;
Output:
4) Write a MPI program to read an integer value in the root process. Root process sends this value to
Process1, Process1 sends this value to Process2 and so on. Last process sends the value back to root
process. When sending the value each process will first increment the received value by one. Write the
program using point to point communication routines.
#include <stdio.h>
#include <mpi.h>
int value;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (world_rank == 0) {
} else {
if (world_rank == 0) {
MPI_Finalize();
return 0;
Output:
5) Write an MPI program to read N elements of an array in the master process. Let N processes, including
the master process, check whether the array values are prime or not.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
if (number % i == 0) return 0;
return 1;
}
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (world_rank == 0) {
printf("Process %d received %d, prime? %s\n", world_rank, number_to_check, result ? "Yes" : "No");
if (world_rank == 0) {
free(array);
MPI_Finalize();
return 0;
Output:
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
if (index < n)
printf("\n");
int main() {
int N = 1024; // Example size, make sure it does not exceed your GPU's capability
// Allocate memory
x = (int*)malloc(N * sizeof(int));
y = (int*)malloc(N * sizeof(int));
z = (int*)malloc(N * sizeof(int));
cudaMalloc(&d_x, N * sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMalloc(&d_z, N * sizeof(int));
// Initialize arrays
for (int i = 0; i < N; i++) {
x[i] = 5;
y[i] = 2;
printf("Result: ");
printVector(z, N);
// Cleanup
return 0;
Output:
Result: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
if (index < n)
printf("\n");
int main() {
x = (int*)malloc(N * sizeof(int));
y = (int*)malloc(N * sizeof(int));
z = (int*)malloc(N * sizeof(int));
cudaMalloc(&d_x, N * sizeof(int));
cudaMalloc(&d_y, N * sizeof(int));
cudaMalloc(&d_z, N * sizeof(int));
x[i] = 5;
y[i] = 2;
add << <numBlocks, blockSize >> > (d_x, d_y, d_z, N);
printf("Result: ");
printVector(z, N);
return 0;
Output:
Results: 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
7777777777777777777777777777777777777777777777777777777777777777
77777777777777777777777777777777777777777777777777777777777
2) Implement a CUDA program to add two vectors of length N by keeping the number of threads per block
as 256 (constant) and vary the number of blocks to handle N elements.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
if (index < N)
int main() {
int N = 1 << 20; // Example: Number of elements in each vector (1 Million elements)
h_b = (int*)malloc(size);
h_c = (int*)malloc(size);
h_a[i] = i;
h_b[i] = i;
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
// Invoke kernel
add << <blocksPerGrid, threadsPerBlock >> > (d_a, d_b, d_c, N);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Output:
h_c[0] = 0
h_c[1] = 2
h_c[2] = 4
h_c[3] = 6
h_c[4] = 8
h_c[5] = 10
h_c[6] = 12
h_c[7] = 14
h_c[8] = 16
h_c[9] = 18
3) Write a program in CUDA which performs convolution operation on one dimensional input array N of size width
using a mask array M of size mask_width to produce the resultant one-dimensional array P of size width.
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
int k;
if (i < width) {
int pValue = 0;
// Convolution operation
P[i] = pValue;
printf("\n");
int main() {
convolution_1d << <(width + 255) / 256, 256 >> > (d_N, d_M, d_P, width, mask_width);
printArray(P, width);
cudaFree(d_N);
cudaFree(d_M);
cudaFree(d_P);
return 0;
Output:
Resultant array: 2 2 2 2 2 2 2 2 2 -9
4) Write a program in CUDA to process a 1D array containing angles in radians to generate the sine of the
angles in the output array. Use an appropriate function.
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#ifndef M_PI
#endif
if (index < n) {
printf("\n");
}
int main() {
float inputAngles[] = { 0.0, M_PI / 6, M_PI / 4, M_PI / 2, M_PI }; // Example angles in radians
cudaMalloc((void**)&d_input, n * sizeof(float));
cudaMalloc((void**)&d_output, n * sizeof(float));
printArray(output, n);
cudaFree(d_input);
cudaFree(d_output);
delete[] output;
return 0;
Output:
#include <stdio.h>
#include <cuda_runtime.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#include <string.h>
#include <sm_20_atomic_functions.h>
__global__ void countWordOccurrences(char* sentence, char* word, int sentenceLength, int wordLength, int*
count) {
int localCount = 0;
if (sentence[i + j] != word[j]) {
wordFound = false;
break;
if (wordFound) {
localCount++;
// Use atomicAdd to safely add the local count to the global total
atomicAdd(count, localCount);
int main() {
char* sentence = "hello hello world world world world"; // Example sentence
char* word = "world"; // Word to count in the sentence
int* count;
int* d_count;
cudaMalloc((void**)&d_sentence, sentenceLength);
cudaMalloc((void**)&d_word, wordLength);
cudaMalloc((void**)&d_count, sizeof(int));
cudaMemset(d_count, 0, sizeof(int));
countWordOccurrences << <1, 256 >> > (d_sentence, d_word, sentenceLength, wordLength, d_count);
// Cleanup
return 0;
Output:
2) Write a CUDA program that reads a string S and produces the string RS as follows:
Input string S: PCAP Output string RS: PCAPPCAPCP Note: Each work item copies required number of
characters from S in RS.
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void generateString(char* inputString, char* outputString, int length) {
int main() {
int length = 12; // Length of the output string (4 times the length of inputString)
cudaMalloc((void**)&d_inputString, strlen(inputString));
cudaMalloc((void**)&d_outputString, length);
// Cleanup
cudaFree(d_inputString);
cudaFree(d_outputString);
free(outputString);
return 0;
Output:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__global__ void reverseWords(char* inputString, char* outputString, int* wordLengths, int numWords) {
int main() {
int numWords = 0;
numWords++;
int* d_wordLengths;
cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, length);
int wordStartIndex = 0;
numWords--;
reverseWords << <numBlocks, blockSize >> > (d_inputString, d_outputString, d_wordLengths, numWords);
// Cleanup
cudaFree(d_inputString);
cudaFree(d_outputString);
cudaFree(d_wordLengths);
free(wordLengths);
free(outputString);
return 0;
}
Output:
4) Write a CUDA program that takes a string Sin as input and one integer value N and produces an output
string, Sout, in parallel by concatenating input string Sin, N times as shown below.
INPUT : Sin ="Hello" N =3 OUTPUT : Sout = "HelloHelloHello" Note: Every thread copies the same
character from the Input string S, N times to the required position.
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void concatenateString(char* inputString, int inputLength, char* outputString, int repetitions) {
int main() {
cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, outputLength);
// Cleanup
cudaFree(d_inputString);
cudaFree(d_outputString);
free(outputString);
return 0;
Output:
5) Write a CUDA program which reads a string Sin and produces an output string T as shown below.
Input: Sin: "Hai" Output: T: "Haaiii" Note: Every thread stores a character from the input string Sin the required
number of times into the output string T.
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void repeatCharacters(char* inputString, int inputLength, char* outputString, int repetitions) {
if (index == 0) {
outputString[i] = lastChar;
}
int main() {
int repetitions = 3; // Number of times each character should be repeated, except for the last character
cudaMalloc((void**)&d_inputString, length);
cudaMalloc((void**)&d_outputString, outputLength);
// Cleanup
cudaFree(d_inputString);
cudaFree(d_outputString);
free(outputString);
return 0;
Output:
#include <stdio.h>
#include <cuda_runtime.h>
// Kernel to add two matrices where each row of the resultant matrix is computed by one thread
int col = threadIdx.x; // Each thread computes one element of the row
// Kernel to add two matrices where each column of the resultant matrix is computed by one thread
int row = threadIdx.x; // Each thread computes one element of the column
// Kernel to add two matrices where each element of the resultant matrix is computed by one thread
int index = threadIdx.x + blockIdx.x * blockDim.x; // Each thread computes one element of the matrix
if (index < N * N) {
}
// Helper function to print matrix
printf("\n");
int main() {
initializeMatrix(h_A);
initializeMatrix(h_B);
cudaMalloc((void**)&d_A, N * N * sizeof(float));
cudaMalloc((void**)&d_B, N * N * sizeof(float));
cudaMalloc((void**)&d_C, N * N * sizeof(float));
printMatrix(h_C);
printMatrix(h_C);
printMatrix(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
return 0;
Output:
Matrix A:
Matrix B:
2) Write a program in CUDA to multiply two matrices for the following specifications:
#include <stdio.h>
#include <cuda_runtime.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void matrixMultiplication(int *a, int *b, int *c, int width) {
int sum = 0;
int main() {
h_a = (int*)malloc(size);
h_b = (int*)malloc(size);
h_c = (int*)malloc(size);
h_a[i] = i;
h_b[i] = i * 2;
// Device matrices
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
// Launch kernel
printf("Result Matrix:\n");
printf("\n");
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
return 0;
Output:
Matrix A:
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
Matrix B:
0 2 4 6
8 10 12 14
16 18 20 22
24 26 28 30
Result Matrix:
28 34 40 46
76 98 120 142
#include <stdio.h>
#include <cuda_runtime.h>
if (idx < N) {
int main() {
const int N = 5;
float alpha = 2;
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
cudaFree(d_x);
cudaFree(d_y);
printf("\n");
return 0;
}
Output:
Input vector x:
1 2 3 4 5
Input vector y:
6 7 8 9 10
Scalar alpha: 2
Resulting vector y:
8 11 14 17 20
4) Write a CUDA program to sort every row of a matrix using selection sort.
#include <stdio.h>
#include <cuda_runtime.h>
int minIndex = i;
minIndex = j;
row[i] = row[minIndex];
row[minIndex] = temp;
selectionSort(rowPtr, cols);
int main() {
int rows = 3;
int cols = 4;
{4, 6, 2, 1},
{3, 0, 2, 5}};
int *d_matrix;
cudaFree(d_matrix);
printf("Sorted Matrix:\n");
printf("\n");
return 0;
Output:
Input matrix:
9 7 3 5 1
6 4 8 2 0
5 9 2 4 7
1 3 5 7 9
0 2 4 6 8
2 4 5 7 9
5) Write a CUDA program to perform odd even transposition sort in parallel.
#include <stdio.h>
#include <cuda_runtime.h>
int temp;
temp = arr[tid];
arr[tid] = arr[partner_tid];
arr[partner_tid] = temp; } }
__syncthreads(); }
int main() {
int *d_arr;
cudaMalloc((void**)&d_arr, n * sizeof(int));
cudaFree(d_arr);
printf("Sorted Array:\n");
return 0;
Output:
Sorted Array:
0 1 2 3 4 5 6 7 8 9