HPC CODES

This document contains code snippets demonstrating parallel algorithms with OpenMP and CUDA for tree traversal, sorting, reductions, and matrix operations. It includes implementations of parallel breadth-first search (BFS), depth-first search (DFS), bubble sort, merge sort, parallel reductions (min, max, sum, average), vector addition, and matrix multiplication, along with timing comparisons between the sequential and parallel versions.
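
Several of the programs below time a sequential and a parallel version of the same algorithm and print the elapsed milliseconds. As a minimal sketch of that measurement pattern (the workload here, summing a large vector, is an illustrative assumption and not one of the original programs), the same comparison can be written with omp_get_wtime():

// Minimal timing-comparison sketch; compile with OpenMP enabled (e.g. g++ -fopenmp)
#include <iostream>
#include <vector>
#include <omp.h>

int main() {
    // Illustrative workload: sum a large vector sequentially and in parallel
    std::vector<int> data(10000000, 1);

    double t0 = omp_get_wtime();
    long long seqSum = 0;
    for (size_t i = 0; i < data.size(); ++i) seqSum += data[i];
    double seqTime = omp_get_wtime() - t0;

    t0 = omp_get_wtime();
    long long parSum = 0;
    #pragma omp parallel for reduction(+: parSum)
    for (long long i = 0; i < (long long)data.size(); ++i) parSum += data[i];
    double parTime = omp_get_wtime() - t0;

    std::cout << "Sequential: " << seqSum << " in " << seqTime * 1000 << " ms\n";
    std::cout << "Parallel:   " << parSum << " in " << parTime * 1000 << " ms\n";
    return 0;
}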


#include <iostream>
#include <vector>
#include <queue>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> children;
};

// Level-order (BFS) traversal that processes each level in parallel
void parallel_BFS(Node* root) {
    if (!root) return;

    queue<Node*> q;
    q.push(root);

    while (!q.empty()) {
        int level_size = q.size();
        vector<Node*> next_level;

        #pragma omp parallel for shared(q, next_level)
        for (int i = 0; i < level_size; ++i) {
            Node* current;

            // Synchronize access to the shared queue
            #pragma omp critical
            {
                current = q.front();
                q.pop();
            }

            // Print the node value (thread-safe)
            #pragma omp critical
            cout << current->data << " ";

            // Push children to a shared next-level buffer
            #pragma omp critical
            {
                for (Node* child : current->children)
                    next_level.push_back(child);
            }
        }

        // Enqueue the collected children for the next iteration
        for (Node* node : next_level)
            q.push(node);
    }
}

int main() {
    // Create a sample N-ary tree
    Node* root = new Node{1, {
        new Node{2, {}},
        new Node{3, {
            new Node{4, {}}
        }}
    }};

    cout << "Parallel BFS Output: ";
    parallel_BFS(root);
    cout << endl;

    return 0;
}

OUTPUT:
#include <iostream>
#include <vector>
#include <omp.h>

using namespace std;

struct Node {
    int data;
    vector<Node*> neighbors;
};

// Thread-safe DFS using OpenMP
void parallel_DFS(Node* node, vector<bool>& visited) {
    bool already_visited;

    // Check and mark the node in one critical section to avoid races on visited
    #pragma omp critical
    {
        already_visited = visited[node->data];
        if (!already_visited) {
            visited[node->data] = true;
            cout << node->data << " ";
        }
    }
    if (already_visited) return;

    // Traverse the neighbors in parallel
    #pragma omp parallel for
    for (int i = 0; i < (int)node->neighbors.size(); ++i) {
        Node* neighbor = node->neighbors[i];

        // Cheap pre-check; the recursive call re-validates atomically
        if (!visited[neighbor->data]) {
            parallel_DFS(neighbor, visited);
        }
    }
}

int main() {
    // Create a sample undirected graph (0-1-3, 0-2-4)
    vector<Node> graph(5);
    for (int i = 0; i < 5; ++i) {
        graph[i].data = i;
    }

    graph[0].neighbors = {&graph[1], &graph[2]};
    graph[1].neighbors = {&graph[0], &graph[3]};
    graph[2].neighbors = {&graph[0], &graph[4]};
    graph[3].neighbors = {&graph[1]};
    graph[4].neighbors = {&graph[2]};

    vector<bool> visited(graph.size(), false);

    cout << "Parallel DFS Output: ";
    #pragma omp parallel
    #pragma omp single
    parallel_DFS(&graph[0], visited);
    cout << endl;

    return 0;
}
INPUT:

OUTPUT:
Code: Parallel and Sequential Sorting Algorithms with Time Comparison
#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>
#include <cstdlib>

using namespace std;

// Swap two elements
void swap(int& a, int& b) {
    int temp = a;
    a = b;
    b = temp;
}

// Sequential Bubble Sort
void bubbleSortSequential(vector<int>& arr) {
    int n = arr.size();
    for (int i = 0; i < n - 1; ++i) {
        bool swapped = false;
        for (int j = 0; j < n - i - 1; ++j) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
                swapped = true;
            }
        }
        if (!swapped) break;
    }
}

// Parallel Bubble Sort (odd-even transposition, so no two threads touch the same pair)
void bubbleSortParallel(vector<int>& arr) {
    int n = arr.size();
    for (int i = 0; i < n; ++i) {
        int start = i % 2; // alternate between even and odd phases
        #pragma omp parallel for
        for (int j = start; j < n - 1; j += 2) {
            if (arr[j] > arr[j + 1]) {
                swap(arr[j], arr[j + 1]);
            }
        }
    }
}

// Merge two sorted halves [left, mid] and [mid + 1, right]
void merge(vector<int>& arr, int left, int mid, int right) {
    int n1 = mid - left + 1;
    int n2 = right - mid;
    vector<int> L(n1), R(n2);
    for (int i = 0; i < n1; i++) L[i] = arr[left + i];
    for (int j = 0; j < n2; j++) R[j] = arr[mid + 1 + j];

    int i = 0, j = 0, k = left;
    while (i < n1 && j < n2) {
        arr[k++] = (L[i] <= R[j]) ? L[i++] : R[j++];
    }
    while (i < n1) arr[k++] = L[i++];
    while (j < n2) arr[k++] = R[j++];
}

// Sequential Merge Sort
void mergeSortSequential(vector<int>& arr, int left, int right) {
    if (left < right) {
        int mid = (left + right) / 2;
        mergeSortSequential(arr, left, mid);
        mergeSortSequential(arr, mid + 1, right);
        merge(arr, left, mid, right);
    }
}

// Parallel Merge Sort using OpenMP tasks
void mergeSortParallel(vector<int>& arr, int left, int right) {
    if (right - left < 1000) { // Small ranges are sorted sequentially
        mergeSortSequential(arr, left, right);
    } else {
        int mid = (left + right) / 2;
        #pragma omp task shared(arr)
        mergeSortParallel(arr, left, mid);
        #pragma omp task shared(arr)
        mergeSortParallel(arr, mid + 1, right);
        #pragma omp taskwait
        merge(arr, left, mid, right);
    }
}

int main() {
    int N;
    cout << "Enter the size of the array: ";
    cin >> N;

    vector<int> original(N);
    cout << "Enter elements: ";
    for (int i = 0; i < N; ++i) {
        cin >> original[i];
    }

    // Sequential Bubble Sort
    vector<int> arr = original;
    auto start = chrono::high_resolution_clock::now();
    bubbleSortSequential(arr);
    auto end = chrono::high_resolution_clock::now();
    double seqBubbleTime = chrono::duration<double, milli>(end - start).count();
    cout << "Sequential Bubble Sort Time: " << seqBubbleTime << " ms\n";

    // Parallel Bubble Sort
    arr = original;
    start = chrono::high_resolution_clock::now();
    bubbleSortParallel(arr);
    end = chrono::high_resolution_clock::now();
    double parBubbleTime = chrono::duration<double, milli>(end - start).count();
    cout << "Parallel Bubble Sort Time: " << parBubbleTime << " ms\n";

    // Sequential Merge Sort
    arr = original;
    start = chrono::high_resolution_clock::now();
    mergeSortSequential(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double seqMergeTime = chrono::duration<double, milli>(end - start).count();
    cout << "Sequential Merge Sort Time: " << seqMergeTime << " ms\n";

    // Parallel Merge Sort (tasks must be created inside a parallel region)
    arr = original;
    start = chrono::high_resolution_clock::now();
    #pragma omp parallel
    #pragma omp single
    mergeSortParallel(arr, 0, N - 1);
    end = chrono::high_resolution_clock::now();
    double parMergeTime = chrono::duration<double, milli>(end - start).count();
    cout << "Parallel Merge Sort Time: " << parMergeTime << " ms\n";

    return 0;
}

Input:

Output:
#include <iostream>
#include <omp.h>
#include <climits>
#include <chrono>

using namespace std;
using namespace std::chrono;

// Find the minimum value in an array using parallel reduction
void min_reduction(int arr[], int n) {
    int min_value = INT_MAX;
    #pragma omp parallel for reduction(min: min_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] < min_value) {
            min_value = arr[i];
        }
    }
    cout << "Minimum value: " << min_value << endl;
}

// Find the maximum value in an array using parallel reduction
void max_reduction(int arr[], int n) {
    int max_value = INT_MIN;
    #pragma omp parallel for reduction(max: max_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] > max_value) {
            max_value = arr[i];
        }
    }
    cout << "Maximum value: " << max_value << endl;
}

// Calculate the sum of the elements using parallel reduction
void sum_reduction(int arr[], int n) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    cout << "Sum: " << sum << endl;
}

// Calculate the average of the elements using parallel reduction
void average_reduction(int arr[], int n) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    double average = (double)sum / n;
    cout << "Average: " << average << endl;
}

int main() {
    int n;
    cout << "\nEnter the total number of elements: ";
    cin >> n;

    int* arr = new int[n];
    cout << "\nEnter the elements: ";
    for (int i = 0; i < n; i++) {
        cin >> arr[i];
    }

    auto start = high_resolution_clock::now();
    min_reduction(arr, n);
    max_reduction(arr, n);
    sum_reduction(arr, n);
    average_reduction(arr, n);
    auto stop = high_resolution_clock::now();

    auto duration = duration_cast<milliseconds>(stop - start);
    cout << "Time taken for all reductions: " << duration.count() << " ms" << endl;

    delete[] arr;
    return 0;
}

Input:

Output:
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// CUDA kernel to add two vectors element-wise
__global__ void addVectors(int* A, int* B, int* C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int n = 1000000;
    int *A, *B, *h_C; // Host pointers
    int size = n * sizeof(int);

    // Allocate pinned memory on the host
    cudaMallocHost(&A, size);
    cudaMallocHost(&B, size);
    cudaMallocHost(&h_C, size);

    // Initialize vectors A and B
    for (int i = 0; i < n; i++) {
        A[i] = i;
        B[i] = i * 2;
    }

    // Allocate memory on the device
    int *dev_A, *dev_B, *dev_C;
    cudaMalloc(&dev_A, size);
    cudaMalloc(&dev_B, size);
    cudaMalloc(&dev_C, size);

    // Copy input data from host to device
    cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    // Kernel launch configuration
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;

    // Launch the CUDA kernel
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);

    // Wait for the GPU to finish
    cudaDeviceSynchronize();

    // Copy the result from device to host
    cudaMemcpy(h_C, dev_C, size, cudaMemcpyDeviceToHost);

    // Print the first 10 results
    cout << "First 10 results of vector addition:\n";
    for (int i = 0; i < 10; i++) {
        cout << h_C[i] << " ";
    }
    cout << endl;

    // Free device memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    // Free pinned host memory
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(h_C);

    return 0;
}

Output:
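
For brevity, the CUDA programs here do not check the return status of runtime calls or kernel launches. As a minimal sketch (the CUDA_CHECK macro name is an illustrative choice, not part of the original codes), each call can be wrapped so that failures are reported instead of silently ignored:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a readable message if a CUDA runtime call fails
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err = (call);                                          \
        if (err != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    cudaGetErrorString(err), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Example usage (illustrative only):
//   CUDA_CHECK(cudaMalloc(&dev_A, size));
//   addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
//   CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during kernel execution
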
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

using namespace std;

// CUDA kernel to multiply two square matrices
__global__ void multiply(int* A, int* B, int* C, int size) {
    // Each thread uses its block and thread indices to compute one element of C
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < size && col < size) {
        int sum = 0;
        for (int i = 0; i < size; i++) {
            sum += A[row * size + i] * B[i * size + col];
        }
        C[row * size + col] = sum;
    }
}

// Fill a matrix with random single-digit values
void initialize(int* matrix, int size) {
    for (int i = 0; i < size * size; i++) {
        matrix[i] = rand() % 10;
    }
}

// Print a size x size matrix stored in row-major order
void print(int* matrix, int size) {
    for (int row = 0; row < size; row++) {
        for (int col = 0; col < size; col++) {
            cout << matrix[row * size + col] << " ";
        }
        cout << '\n';
    }
    cout << '\n';
}

int main() {
    int *A, *B, *C;
    int N = 4; // Matrix dimension (N x N)

    int matrixSize = N * N;
    size_t matrixBytes = matrixSize * sizeof(int);

    A = new int[matrixSize];
    B = new int[matrixSize];
    C = new int[matrixSize];

    initialize(A, N);
    initialize(B, N);

    cout << "Matrix A: \n";
    print(A, N);
    cout << "Matrix B: \n";
    print(B, N);

    int *X, *Y, *Z;

    // Allocate space on the GPU
    cudaMalloc(&X, matrixBytes);
    cudaMalloc(&Y, matrixBytes);
    cudaMalloc(&Z, matrixBytes);

    // Copy matrices A and B to the GPU
    cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice);

    // Set the number of threads per block and blocks per grid
    int THREADS = 2;                           // Each block has THREADS x THREADS threads
    int BLOCKS = (N + THREADS - 1) / THREADS;  // Blocks needed per grid dimension

    // Launch the kernel
    dim3 threads(THREADS, THREADS);
    dim3 blocks(BLOCKS, BLOCKS);
    multiply<<<blocks, threads>>>(X, Y, Z, N);

    // Copy the result back to the host (this copy also synchronizes with the kernel)
    cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost);

    cout << "Multiplication of matrix A and B: \n";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}
Output:
