common.h 3.75 KB
Newer Older
Nianchen Deng's avatar
sync    
Nianchen Deng committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#pragma once
#include <memory>
#include <stdexcept>
#include <vector>
#include <string>
#include <sstream>
#include <GL/glew.h>
#include <cuda_gl_interop.h>
#include "../glm/glm.hpp"
#include "Logger.h"

// Returns the size, in bytes, of one element of the given data type.
//
// t - element type tag.
//
// Returns 4 for kINT32 and kFLOAT, 2 for kHALF, and 1 for kBOOL and kINT8.
// Throws std::runtime_error("Invalid DataType.") for any other value.
inline unsigned int getElementSize(nv::DataType t) {
    // 0 is never a valid size, so it doubles as the "no case matched" marker.
    unsigned int bytes = 0;
    switch (t) {
    case nv::DataType::kINT32:
    case nv::DataType::kFLOAT:
        bytes = 4;
        break;
    case nv::DataType::kHALF:
        bytes = 2;
        break;
    case nv::DataType::kBOOL:
    case nv::DataType::kINT8:
        bytes = 1;
        break;
    }
    if (bytes == 0)
        throw std::runtime_error("Invalid DataType.");
    return bytes;
}

// Writes n elements of buf to os on one line, separated by single spaces and
// terminated with std::endl (newline plus flush).
//
// os  - destination stream.
// buf - pointer to the first element; only dereferenced when n > 0.
// n   - number of elements to print. With n == 0 only the line terminator
//       is written.
template <typename T> void dumpRow(std::ostream &os, T *buf, size_t n) {
    // An empty row has no first element, so buf[0] must not be touched.
    if (n > 0) {
        os << buf[0];
        for (size_t i = 1; i < n; ++i) {
            os << " " << buf[i];
        }
    }
    os << std::endl;
}

// Prints a raw buffer as rows of T values using dumpRow().
//
// os          - destination stream.
// buf         - start of the buffer; reinterpreted as an array of T.
// bufSize     - buffer size in bytes; bufSize / sizeof(T) elements are printed
//               (a trailing partial element is ignored).
// rowCount    - number of elements per printed row (despite the name, this is
//               the row width, not the number of rows). The final row may be
//               shorter.
// maxDumpRows - 0 prints every row. Otherwise at most maxDumpRows rows are
//               printed: the first maxDumpRows / 2 rows, a "..." line, then
//               the remaining rows taken from the end of the buffer. If the
//               buffer has no more than maxDumpRows rows, everything is
//               printed without the "..." line.
//
// Nothing is written when buf is null, the buffer holds no complete element,
// or rowCount is 0.
template <typename T>
void dumpHostBuffer(std::ostream &os, void *buf, size_t bufSize, size_t rowCount,
                    size_t maxDumpRows = 0) {
    T *typedBuf = static_cast<T *>(buf);
    size_t numItems = bufSize / sizeof(T);
    // Bail out early: rowCount == 0 would divide by zero below, and an empty
    // buffer would make "rows - 1" wrap around.
    if (typedBuf == nullptr || numItems == 0 || rowCount == 0)
        return;

    size_t rows = numItems / rowCount;
    size_t nInLastRow = numItems % rowCount;
    if (nInLastRow == 0) {
        nInLastRow = rowCount; // buffer divides evenly: last row is full
    } else {
        ++rows; // account for the trailing partial row
    }
    T *lastRow = typedBuf + (rows - 1) * rowCount;

    if (maxDumpRows == 0 || maxDumpRows >= rows) {
        // Full dump. Also taken when the limit does not actually truncate,
        // so the head/tail arithmetic below can never underflow or read past
        // the end of the buffer.
        for (size_t i = 0; i + 1 < rows; ++i)
            dumpRow(os, typedBuf + i * rowCount, rowCount);
        dumpRow(os, lastRow, nInLastRow);
        return;
    }

    // Truncated dump: here 1 <= maxDumpRows < rows.
    size_t headRows = maxDumpRows / 2;
    size_t tailRows = maxDumpRows - headRows; // >= 1, includes the last row
    for (size_t i = 0; i < headRows; ++i)
        dumpRow(os, typedBuf + i * rowCount, rowCount);
    os << "..." << std::endl;
    for (size_t i = rows - tailRows; i + 1 < rows; ++i)
        dumpRow(os, typedBuf + i * rowCount, rowCount);
    dumpRow(os, lastRow, nInLastRow);
}

class CudaStream {
public:
    CudaStream() { cudaStreamCreate(&stream); }

    operator cudaStream_t() { return stream; }

    virtual ~CudaStream() { cudaStreamDestroy(stream); }

private:
    cudaStream_t stream;
};

class CudaEvent {
public:
    CudaEvent() { cudaEventCreate(&mEvent); }

    operator cudaEvent_t() { return mEvent; }

    virtual ~CudaEvent() { cudaEventDestroy(mEvent); }

private:
    cudaEvent_t mEvent;
};

struct CudaMapScope {
    std::vector<cudaGraphicsResource_t> resources_;
    cudaStream_t stream_;

    CudaMapScope(const std::vector<cudaGraphicsResource_t> &resources,
                 cudaStream_t stream = nullptr)
        : resources_(resources), stream_(stream) {}

    ~CudaMapScope() {
        if (!resources_.empty())
            cudaGraphicsUnmapResources(resources_.size(), resources_.data(), stream_);
    }

    cudaError_t map() {
        if (!resources_.empty())
            return cudaGraphicsMapResources(resources_.size(), resources_.data(), stream_);
        return cudaSuccess;
    }
};

// Deleter functor: releases an object by calling its destroy() member
// instead of using `delete`. Null pointers are ignored.
template <typename T> struct Destroy {
    // t - object to release; may be nullptr, in which case nothing happens.
    void operator()(T *t) {
        if (t == nullptr)
            return;
        t->destroy();
    }
};

// Evenly spaced sequence of floats described by a start value, a step and a
// sample count.
//
// For steps >= 2 the step is (bound.y - bound.x) / (steps - 1), so get(0) is
// bound.x and get(steps - 1) is bound.y.
class Range {
public:
    // bound - bound.x is the first sample, bound.y the last one.
    // steps - number of samples. With fewer than two samples there is no
    //         spacing to compute (steps - 1 would be zero, or wrap around for
    //         steps == 0), so the step is set to 0 and every get() returns
    //         the start value.
    Range(glm::vec2 bound, uint steps)
        : _start(bound.x), _step(steps > 1 ? (bound.y - bound.x) / (steps - 1) : 0.0f),
          _steps(steps) {}

    // Value of the i-th sample: start + i * step. i is not range-checked.
    __host__ __device__ float get(uint i) const { return _start + i * _step; }

    // First sample value.
    __host__ __device__ float start() const { return _start; }

    // Returns start + step * steps.
    // NOTE(review): this is one step beyond get(steps - 1), i.e. it reads as
    // an exclusive end; confirm that callers expect that rather than bound.y.
    __host__ __device__ float stop() const { return _start + _step * _steps; }

    // Number of samples, converted to float (the return type is kept as-is
    // for existing callers).
    __host__ __device__ float steps() const { return _steps; }

private:
    float _start;
    float _step;
    uint _steps;
};

// Unique-ownership pointer that uses ::Destroy<T> as its deleter, so the
// pointee is released through its destroy() member instead of `delete`.
template <class T> using uptr = std::unique_ptr<T, ::Destroy<T>>;
// Shorthand for std::shared_ptr<T>.
template <class T> using sptr = std::shared_ptr<T>;

// Elapsed time between two tick counts, in milliseconds:
// (end - start) / CLOCKS_PER_SEC * 1000, evaluated in float.
// NOTE(review): presumably both arguments are clock_t values obtained from
// clock(); confirm at the call sites. CLOCKS_PER_SEC comes from <ctime>, which
// this header does not include directly.
#define INTERVAL(__start__, __end__) (((__end__) - (__start__)) / (float)CLOCKS_PER_SEC * 1000)

#include "Resource.h"
#include "Formatter.h"