KMM
KMM is a lightweight C++ middleware for accelerated computing, with native support for CUDA.
Highlights of KMM:
- KMM manages the application memory
Allocation of host and device memory is automated
Data transfers to and from the device are performed transparently when necessary
- No need to rewrite your kernels
KMM can schedule and execute native C++ and CUDA functions
Basic Example
This example shows how to run a CUDA kernel implementing a vector add operation with KMM.
1#include <deque>
2#include <iostream>
3
4#include "spdlog/spdlog.h"
5
6#include "kmm/array.hpp"
7#include "kmm/cuda/cuda.hpp"
8#include "kmm/host/host.hpp"
9#include "kmm/runtime_handle.hpp"
10
11#define SIZE 65536000
12
13__global__ void vector_add(const float* A, const float* B, float* C, int size) {
14 int item = (blockDim.x * blockIdx.x) + threadIdx.x;
15
16 if (item < size) {
17 C[item] = A[item] + B[item];
18 }
19}
20
21void initialize(float* A, float* B) {
22#pragma omp parallel for
23 for (unsigned int item = 0; item < SIZE; item++) {
24 A[item] = 1.0;
25 B[item] = 2.0;
26 }
27
28 std::cout << "initialize\n";
29}
30
31void verify(const float* C) {
32#pragma omp parallel for
33 for (unsigned int item = 0; item < SIZE; item++) {
34 if (fabsf(C[item] - 3.0f) > 1.0e-9) {
35 std::cout << "ERROR" << std::endl;
36 break;
37 }
38 }
39
40 std::cout << "SUCCESS" << std::endl;
41}
42
43int main(void) {
44 spdlog::set_level(spdlog::level::debug);
45
46 unsigned int threads_per_block = 256;
47 unsigned int n_blocks = ceil((1.0 * SIZE) / threads_per_block);
48 int n = SIZE;
49
50 // Create manager
51 auto manager = kmm::build_runtime();
52 std::deque<kmm::EventId> events;
53
54 for (size_t i = 0; i < 20; i++) {
55 // Request 3 memory areas of a certain size
56 auto A = kmm::Array<float>(n);
57 auto B = kmm::Array<float>(n);
58 auto C = kmm::Array<float>(n);
59
60 // Initialize array A and B on the host
61 manager.submit(kmm::Host(), initialize, write(A), write(B));
62
63 // Execute the function on the device.
64 manager.submit(kmm::CudaKernel(n_blocks, threads_per_block), vector_add, A, B, write(C), n);
65
66 // Verify the result on the host.
67 auto verify_event = manager.submit(kmm::Host(), verify, C);
68
69 events.push_back(verify_event);
70 if (events.size() >= 5) {
71 manager.wait(events[events.size() - 5]);
72 }
73 }
74
75 manager.synchronize();
76 std::cout << "done\n";
77
78 return 0;
79}