KMM

KMM is a lightweight C++ middleware for accelerated computing, with native support for CUDA.

Highlights of KMM:

  • KMM manages the application memory
    • Allocation of host and device memory is automated

    • Data transfers to and from the device are performed transparently when necessary

  • No need to rewrite your kernels
    • KMM can schedule and execute native C++ and CUDA functions

Basic Example

This example shows how to use KMM to run a CUDA kernel that implements vector addition.

 1#include <deque>
 2#include <iostream>
 3
 4#include "spdlog/spdlog.h"
 5
 6#include "kmm/array.hpp"
 7#include "kmm/cuda/cuda.hpp"
 8#include "kmm/host/host.hpp"
 9#include "kmm/runtime_handle.hpp"
10
11#define SIZE 65536000
12
13__global__ void vector_add(const float* A, const float* B, float* C, int size) {
14    int item = (blockDim.x * blockIdx.x) + threadIdx.x;
15
16    if (item < size) {
17        C[item] = A[item] + B[item];
18    }
19}
20
21void initialize(float* A, float* B) {
22#pragma omp parallel for
23    for (unsigned int item = 0; item < SIZE; item++) {
24        A[item] = 1.0;
25        B[item] = 2.0;
26    }
27
28    std::cout << "initialize\n";
29}
30
31void verify(const float* C) {
32#pragma omp parallel for
33    for (unsigned int item = 0; item < SIZE; item++) {
34        if (fabsf(C[item] - 3.0f) > 1.0e-9) {
35            std::cout << "ERROR" << std::endl;
36            break;
37        }
38    }
39
40    std::cout << "SUCCESS" << std::endl;
41}
42
43int main(void) {
44    spdlog::set_level(spdlog::level::debug);
45
46    unsigned int threads_per_block = 256;
47    unsigned int n_blocks = ceil((1.0 * SIZE) / threads_per_block);
48    int n = SIZE;
49
50    // Create manager
51    auto manager = kmm::build_runtime();
52    std::deque<kmm::EventId> events;
53
54    for (size_t i = 0; i < 20; i++) {
55        // Request 3 memory areas of a certain size
56        auto A = kmm::Array<float>(n);
57        auto B = kmm::Array<float>(n);
58        auto C = kmm::Array<float>(n);
59
60        // Initialize array A and B on the host
61        manager.submit(kmm::Host(), initialize, write(A), write(B));
62
63        // Execute the function on the device.
64        manager.submit(kmm::CudaKernel(n_blocks, threads_per_block), vector_add, A, B, write(C), n);
65
66        // Verify the result on the host.
67        auto verify_event = manager.submit(kmm::Host(), verify, C);
68
69        events.push_back(verify_event);
70        if (events.size() >= 5) {
71            manager.wait(events[events.size() - 5]);
72        }
73    }
74
75    manager.synchronize();
76    std::cout << "done\n";
77
78    return 0;
79}

Indices and tables