Commit 6294701e authored by Nianchen Deng's avatar Nianchen Deng
Browse files

sync

parent 2824f796
#pragma once
#include "../utils/common.h"
#include "../utils/Shader.h"
// Renders a cross marker (e.g. a gaze/fovea indicator) on top of the frame
// using a small dedicated GL shader program.
class CrossRenderer {
public:
// frameRes: output frame resolution in pixels; crossSize: size of the cross;
// crossColor: RGBA color used to draw the cross.
CrossRenderer(glm::vec2 frameRes, float crossSize, glm::vec4 crossColor);
// Draw the cross centered at p; globalShift is an additional horizontal
// offset (presumably in pixels -- TODO confirm against the shader source).
void render(glm::vec2 p, float globalShift);
private:
glm::vec2 _frameRes; // target frame resolution (shader uniform value)
float _crossSize; // cross size (shader uniform value)
glm::vec4 _crossColor; // cross color (shader uniform value)
sptr<Shader> _shader; // compiled cross shader program
GLuint _vertBuf; // vertex buffer holding the cross geometry
// Cached uniform locations of the cross shader:
GLuint _shaderProp_crossSize;
GLuint _shaderProp_crossColor;
GLuint _shaderProp_crossPos;
GLuint _shaderProp_frameRes;
GLuint _shaderProp_globalShift;
GLuint _loc_MVP;
};
\ No newline at end of file
#include "Encoder.h"
#include "../utils/cuda.h"
/// Copy the raw input coordinates into the leading `inChns` channels of the
/// encoded output (the "concatenate input" part of the positional encoding).
/// Full output layout per element (see cu_encode):
/// idx3.z = 0: x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z)
/// idx3.z = 1: sin(2x), sin(2y), sin(2z), cos(2x), cos(2y), cos(2z)
/// ...
/// idx3.z = n_freq-1: sin(2^(n_freq-1)x), sin(2^(n_freq-1)y), sin(2^(n_freq-1)z),
/// cos(2^(n_freq-1)x), cos(2^(n_freq-1)y), cos(2^(n_freq-1)z)
/// Dispatch (n, in_chns, n_freqs)
/// NOTE(review): idx3.z is never read, so a 3D launch with n_freqs in z makes
/// all z-threads redundantly write the same element -- harmless but wasteful.
/// The commented-out caller in Encoder::encode uses a 2D launch; confirm the
/// intended dispatch before reusing this kernel.
__global__ void cu_encode0(float *output, float *input, uint n, uint nFreqs) {
glm::uvec3 idx3 = IDX3;
if (idx3.x >= n)
return;
uint inChns = blockDim.y; // input channels = block y-dimension
uint outChns = inChns * (nFreqs * 2 + 1); // per-element output stride
uint i = idx3.x, chn = idx3.y;
output[i * outChns + chn] = input[i * inChns + chn];
}
/// Positional-encoding kernel: for each sample element and input channel,
/// write sin(freq * x) and cos(freq * x) for every frequency, optionally
/// prepending the raw input value when `catInput` is true.
///
/// Output layout per element (catInput = true, 3 channels):
///   x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z),
///   sin(2x), ..., cos(2^(nFreqs-1) z)
///
/// Dispatch (n, in_chns, n_freqs): blockDim.y = input channels,
/// blockDim.z = number of frequencies; one thread per (element, chn, freq).
/// `input`/`freqs` must not alias `output` (declared __restrict__ so the
/// compiler may use the read-only data cache).
__global__ void cu_encode(float *output, const float *__restrict__ input,
                          const float *__restrict__ freqs, uint n, bool catInput) {
    glm::uvec3 idx3 = IDX3;
    if (idx3.x >= n)
        return;
    // One extra leading channel group when the raw input is concatenated.
    uint offset = (uint)catInput;
    uint inChns = blockDim.y, nFreqs = blockDim.z;
    uint i = idx3.x, chn = idx3.y, freq = idx3.z;
    uint elem = i * inChns + chn;
    uint outChns = inChns * (nFreqs * 2 + offset);
    uint base = i * outChns + chn;
    // Only the freq==0 thread writes the raw value (avoids redundant stores).
    if (freq == 0 && catInput)
        output[base] = input[elem];
    float x = freqs[freq] * input[elem];
    float s, c;
    __sincosf(x, &s, &c); // fast-math sin/cos; adequate precision for encodings
    output[base + inChns * (freq * 2 + offset)] = s;
    output[base + inChns * (freq * 2 + offset + 1)] = c;
}
/// Positional-encoding kernel for 2D coordinates (one vec2 per element),
/// always concatenating the raw input:
///   out[i] = { p, sin(f0*p), cos(f0*p), sin(f1*p), cos(f1*p), ... }
/// Dispatch (n, n_freqs): blockDim.y = number of frequencies; one thread per
/// (element, freq). `input`/`freqs` must not alias `output` (declared
/// __restrict__ so the compiler may use the read-only data cache).
__global__ void cu_encode2(glm::vec2 *output, const glm::vec2 *__restrict__ input,
                           const float *__restrict__ freqs, uint n) {
    glm::uvec3 idx3 = IDX3;
    if (idx3.x >= n)
        return;
    uint nFreqs = blockDim.y;
    uint i = idx3.x, freq = idx3.y;
    uint outChns = nFreqs * 2 + 1; // raw value + (sin, cos) per frequency
    uint base = i * outChns;
    // Only the freq==0 thread writes the raw value.
    if (freq == 0)
        output[base] = input[i];
    glm::vec2 x = freqs[freq] * input[i];
    glm::vec2 s, c;
    __sincosf(x.x, &s.x, &c.x); // fast-math sin/cos
    __sincosf(x.y, &s.y, &c.y);
    output[base + (freq * 2 + 1)] = s;
    output[base + (freq * 2 + 2)] = c;
}
/**
 * @brief Positionally encode a batch of coordinates on the GPU (see cu_encode).
 *
 * @param output encoded data, (input->n() / _chns) x outDim()
 * @param input coord data, laid out as n x _chns
 */
void Encoder::encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input) {
    uint n = input->n() / _chns;
    // One thread per (element, channel, frequency). Guard against a zero x
    // dimension when _chns * _multires exceeds the 1024-threads-per-block limit.
    uint xThreads = 1024 / _chns / _multires;
    if (xThreads == 0) {
        Logger::instance.error("Encoder => _chns * _multires exceeds the block size limit");
        return;
    }
    // CU_INVOKE launches with the local `grdSize`/`blkSize`, so keep these names.
    dim3 blkSize(xThreads, _chns, _multires);
    dim3 grdSize(ceilDiv(n, blkSize.x), 1, 1);
    CU_INVOKE(cu_encode)(*output, *input, *_freqs, n, _catInput);
    // Debug alternative: copy-only kernel with a 2D launch.
    // blkSize = dim3(1024 / _chns, _chns);
    // grdSize = dim3(ceilDiv(n, blkSize.x), 1, 1);
    // CU_INVOKE(cu_encode0)(*output, *input, n, _multires);
    CHECK_EX(cudaGetLastError());
}
// Build the frequency table {1, 2, 4, ..., 2^(_multires-1)} on the host and
// upload it to the GPU as the CudaArray read by the encoding kernels.
// Assumes _multires >= 1 (arr[0] is always written).
void Encoder::_genFreqArray() {
    float *arr = new float[_multires];
    arr[0] = 1.0f;
    for (auto i = 1u; i < _multires; ++i) // unsigned index: matches _multires
        arr[i] = arr[i - 1] * 2.0f;
    _freqs.reset(new CudaArray<float>(_multires));
    auto err = cudaMemcpy(_freqs->getBuffer(), arr, _multires * sizeof(float),
                          cudaMemcpyHostToDevice);
    delete[] arr; // free before the check so CHECK_EX cannot leak the buffer
    // A silently failed upload would leave the kernels reading garbage.
    CHECK_EX(err);
}
#pragma once
#include "../utils/common.h"
// Positional encoder: expands each input coordinate channel into sin/cos
// features over `multires` octave frequencies (computed on the GPU).
class Encoder {
public:
    // multires: number of frequency octaves; chns: input channels per element;
    // catInput: when true, the raw input is concatenated ahead of the sin/cos
    // features. Defaults to true so existing two-argument callers (e.g.
    // InferPipeline) compile and get the conventional encoding layout.
    Encoder(unsigned int multires, unsigned int chns, bool catInput = true)
        : _multires(multires), _chns(chns), _catInput(catInput) {
        _genFreqArray();
    }
    // Encoded channels per element: chns * (catInput + 2 * multires).
    unsigned int outDim() const { return _chns * ((int)_catInput + _multires * 2); }
    // output: n x outDim(); input: n x chns (both device arrays).
    void encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input);

private:
    unsigned int _multires;        // number of frequency octaves
    unsigned int _chns;            // input channels per element
    bool _catInput;                // concatenate raw input before sin/cos features
    sptr<CudaArray<float>> _freqs; // device array {1, 2, ..., 2^(multires-1)}
    void _genFreqArray();          // build and upload _freqs
};
\ No newline at end of file
#include "FoveatedBlend.h"
#include <glm/gtx/transform.hpp>
// Full-screen quad: clip-space position (x, y) and texture coordinate (u, v)
// per corner. Note v is flipped (v = 1 at the bottom-left corner) to match
// the texture origin.
static const struct {
float x, y;
float u, v;
} vertices[4] = {
{-1.0f, -1.0f, 0.f, 1.f},
{1.0f, -1.0f, 1.f, 1.f},
{1.0f, 1.0f, 1.f, 0.f},
{-1.0f, 1.0f, 0.f, 0.f}
};
// Pass-through vertex shader: transforms the full-screen quad by MVP and
// forwards per-vertex UVs to the fragment stage.
static const char* vertex_shader_text =
"#version 300 es\n"
"uniform mat4 MVP;\n"
"in vec2 vUV;\n"
"in vec2 vPos;\n"
"out vec2 uv;\n"
"void main()\n"
"{\n"
" gl_Position = MVP * vec4(vPos, 0.0, 1.0);\n"
" uv = vUV;\n"
"}\n";
// Fragment shader for live (non-baked) foveated blending: remaps the layer
// texture around foveaCenter and fades its alpha out between innerR and
// outerR. When outerR ~ 0 the layer is treated as full-frame and sampled
// directly with the (shifted) UV.
// Two leftover debug writes to FragColor were removed: a sample taken with
// the un-shifted UV and a coord-visualization write; both were dead stores
// (every control path reassigned FragColor before the shader ended).
static const char* fragment_shader_text =
"#version 300 es\n"
"precision highp float;\n"
"out vec4 FragColor;\n"
"in vec2 uv;\n"
"uniform sampler2D tex;\n"
"uniform float innerR;\n"
"uniform float outerR;\n"
"uniform float shift;\n"
"uniform float globalShift;\n"
"uniform vec2 foveaCenter;\n"
"uniform vec2 frameRes;\n"
"void main()\n"
"{\n"
" vec2 u = uv;"
" u.x += (shift + globalShift) / frameRes.x;\n"
" if(outerR < 1e-2) {\n"
" FragColor = texture(tex, u);\n"
" return;\n"
" }\n"
" vec2 p = u * frameRes;\n"
" float r = distance(p, foveaCenter);\n"
" vec2 coord = (p - foveaCenter) / outerR / 2.0 + 0.5;\n"
" if(coord.x < 0.0 || coord.x > 1.0 || coord.y < 0.0 || coord.y > 1.0) {\n"
" FragColor = vec4(0, 0, 0, 0);\n"
" return;\n"
" }\n"
" vec4 c = texture(tex, coord);\n"
" float alpha = 1.0 - smoothstep((outerR - innerR) * 0.6 + innerR, outerR, r);\n"
" c.a = c.a * alpha;\n"
" FragColor = c;\n"
"}\n";
// Fragment shader for pre-baked layers: same radial alpha fade as the live
// variant, but samples with the shifted full-frame UV `u` rather than the
// remapped `coord` -- presumably because baked layers are already stored at
// full-frame resolution (TODO confirm; `coord` is only used for bounds/clip).
static const char* fragment_shader_text_baked =
"#version 300 es\n"
"precision highp float;\n"
"out vec4 FragColor;\n"
"in vec2 uv;\n"
"uniform sampler2D tex;\n"
"uniform float innerR;\n"
"uniform float outerR;\n"
"uniform float shift;\n"
"uniform float globalShift;\n"
"uniform vec2 foveaCenter;\n"
"uniform vec2 frameRes;\n"
"void main()\n"
"{\n"
" vec2 u = uv;"
" u.x += (shift + globalShift) / frameRes.x;\n"
" if(outerR < 1e-2) {\n"
" FragColor = texture(tex, u);\n"
" return;\n"
" }\n"
" vec2 p = u * frameRes;\n"
" float r = distance(p, foveaCenter);\n"
" vec2 coord = (p - foveaCenter) / outerR / 2.0 + 0.5;\n"
" if(coord.x < 0.0 || coord.x > 1.0 || coord.y < 0.0 || coord.y > 1.0) {\n"
" FragColor = vec4(0, 0, 0, 0);\n"
" return;\n"
" }\n"
" vec4 c = texture(tex, u);\n"
" float alpha = 1.0 - smoothstep((outerR - innerR) * 0.6 + innerR, outerR, r);\n"
" c.a = c.a * alpha;\n"
" FragColor = c;\n"
"}\n";
// Compile the blend shader (live or baked variant), upload the full-screen
// quad, cache uniform/attribute locations and create the GPU timer query.
// NOTE(review): no VAO is created/bound here -- assumes the GL context
// provides one (or a compatibility profile); confirm against the app setup.
FoveatedBlend::FoveatedBlend(sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams, bool forBaked) :
_cam(cam), _layerCams(layerCams), _forBaked(forBaked) {
_blendShader.reset(new Shader("FoveaBlend", vertex_shader_text,
forBaked ? fragment_shader_text_baked : fragment_shader_text));
_blendShader->compile();
// Upload the full-screen quad geometry.
glGenBuffers(1, &_vertBuf);
glBindBuffer(GL_ARRAY_BUFFER, _vertBuf);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
// Cache uniform locations once; run() uploads values every frame.
auto program = _blendShader->getId();
_shaderProp_tex = glGetUniformLocation(program, "tex");
_shaderProp_innerR = glGetUniformLocation(program, "innerR");
_shaderProp_outerR = glGetUniformLocation(program, "outerR");
_shaderProp_shift = glGetUniformLocation(program, "shift");
_shaderProp_globalShift = glGetUniformLocation(program, "globalShift");
_shaderProp_frameRes = glGetUniformLocation(program, "frameRes");
_shaderProp_foveaCenter = glGetUniformLocation(program, "foveaCenter");
_loc_MVP = glGetUniformLocation(program, "MVP");
// Bind quad attributes: position first, UV after (interleaved layout).
auto loc_vPos = glGetAttribLocation(program, "vPos");
auto loc_vUV = glGetAttribLocation(program, "vUV");
glEnableVertexAttribArray(loc_vPos);
glVertexAttribPointer(loc_vPos, 2, GL_FLOAT, GL_FALSE, sizeof(vertices[0]), (void*)0);
glEnableVertexAttribArray(loc_vUV);
glVertexAttribPointer(loc_vUV, 2, GL_FLOAT, GL_FALSE, sizeof(vertices[0]),
(void*)(sizeof(float) * 2));
// Timer query used by run(showPerf = true).
glGenQueries(1, &_glQuery);
}
// Blend the layer textures into the current framebuffer: draw the outermost
// (full-frame) layer first, then each foveal layer on top with a radial
// alpha fade. glTexs must hold one GL texture per layer camera.
// showPerf busy-waits on a GPU timer query and logs the elapsed time.
void FoveatedBlend::run(GLuint glTexs[], glm::vec2 foveaPos, float shift, float globalShift, bool showPerf) {
    glm::mat4 mvp = glm::ortho(-1.f, 1.f, -1.f, 1.f, 1.f, -1.f);
    glBeginQuery(GL_TIME_ELAPSED, _glQuery);
    glUseProgram(_blendShader->getId());
    glUniformMatrix4fv(_loc_MVP, 1, GL_FALSE, (float*)&mvp[0][0]);
    glUniform1i(_shaderProp_tex, 0);
    glEnable(GL_TEXTURE_2D);
    glActiveTexture(GL_TEXTURE0);
    // Outermost layer: outerR = 0 makes the shader sample full-frame directly.
    int i = _layerCams.size() - 1;
    glUniform1f(_shaderProp_outerR, 0.0f);
    glUniform1f(_shaderProp_shift, shift);
    glUniform1f(_shaderProp_globalShift, globalShift);
    glUniform2f(_shaderProp_frameRes, _cam->res().x, _cam->res().y);
    glUniform2f(_shaderProp_foveaCenter, _cam->res().x / 2.0f, _cam->res().y / 2.0f);
    glBindTexture(GL_TEXTURE_2D, glTexs[i]);
    glDrawArrays(GL_QUADS, 0, 4);
    // Remaining layers, from periphery towards the fovea.
    for (i -= 1; i >= 0; --i) {
        // Layer radii in full-frame pixels, derived from each layer camera's FOV.
        auto outerR = _layerCams[i]->res().y / _layerCams[i]->f().y * _cam->f().y * 0.5f;
        auto innerR = i == 0 ?
            0.0f :
            _layerCams[i - 1]->res().y / _layerCams[i - 1]->f().y * _cam->f().y * 0.5f;
        glUniform1f(_shaderProp_outerR, outerR);
        // Fix: innerR was computed but never uploaded, so the shader's fade
        // always started from innerR = 0 (the uniform's default value).
        glUniform1f(_shaderProp_innerR, innerR);
        glUniform1f(_shaderProp_shift, _forBaked ? shift : 0.0f);
        glUniform1f(_shaderProp_globalShift, globalShift);
        glUniform2f(_shaderProp_frameRes, _cam->res().x, _cam->res().y);
        glUniform2f(_shaderProp_foveaCenter, foveaPos.x, foveaPos.y);
        glBindTexture(GL_TEXTURE_2D, glTexs[i]);
        glDrawArrays(GL_QUADS, 0, 4);
    }
    glDisable(GL_TEXTURE_2D);
    glEndQuery(GL_TIME_ELAPSED);
    if (showPerf) {
        // Busy-wait until the timer query result is available.
        GLint available = 0;
        while (!available)
            glGetQueryObjectiv(_glQuery, GL_QUERY_RESULT_AVAILABLE, &available);
        // Timer queries can exceed 32 bits, so read the 64-bit result.
        GLuint64 timeElapsed = 0;
        glGetQueryObjectui64v(_glQuery, GL_QUERY_RESULT, &timeElapsed);
        Logger::instance.info("Blend: %fms", timeElapsed / 1000000.0f);
    }
}
\ No newline at end of file
#pragma once
#include "../utils/common.h"
#include "../utils/Shader.h"
#include "View.h"
// Blends multiple foveated layer textures into one image drawn as
// screen-aligned quads (shaders live in FoveatedBlend.cpp).
class FoveatedBlend {
public:
// cam: full-frame camera; layerCams: per-layer cameras (fovea -> periphery);
// forBaked: select the shader variant for pre-baked full-frame layers.
FoveatedBlend(sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams, bool forBaked = false);
// Draw all layers; glTexs holds one GL texture per layer camera. foveaPos is
// in full-frame pixels; shift/globalShift are horizontal offsets fed to the
// shader (presumably pixels, for stereo adjustment -- TODO confirm).
// showPerf logs the GPU blend time.
void run(GLuint glTexs[], glm::vec2 foveaPos, float shift, float globalShift, bool showPerf = false);
private:
bool _forBaked; // selects the baked fragment shader
sptr<Camera> _cam; // full-frame camera
std::vector<sptr<Camera>> _layerCams; // per-layer cameras
sptr<Shader> _blendShader; // compiled blend program
GLuint _vertBuf; // full-screen quad vertex buffer
// Cached uniform locations:
GLuint _shaderProp_tex;
GLuint _shaderProp_innerR;
GLuint _shaderProp_outerR;
GLuint _shaderProp_shift;
GLuint _shaderProp_globalShift;
GLuint _shaderProp_frameRes;
GLuint _shaderProp_foveaCenter;
GLuint _glQuery; // GL_TIME_ELAPSED query for perf logging
GLuint _loc_MVP;
};
\ No newline at end of file
#include "FoveatedSynthesis.h"
#include "InferPipeline.h"
#include "Enhancement.h"
#include "ImageGen.h"
constexpr auto NUM_LAYERS = 3u; // fovea + two periphery layers
constexpr auto STEREO_FOVEA_R = NUM_LAYERS; // extra slot for the right-eye fovea in stereo mode
constexpr auto NUM_NETS = 2u; // fovea net and periphery net
// Private implementation of FoveatedSynthesis: owns the TensorRT nets,
// per-layer cameras, inference pipelines, image generation / enhancement
// stages and the shared ray/color buffers.
class FoveatedSynthesis_Impl {
public:
FoveatedSynthesis_Impl(const std::string& dataDir, glm::vec2 depthRange, uint nSamples[],
uint encodeDim, uint coordChns, sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams,
bool stereo);
// Synthesize one frame; foveaPosR is only used in stereo mode.
void run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR);
// GL texture of layer `index` (STEREO_FOVEA_R = right-eye fovea).
GLuint getGlResultTexture(uint index);
private:
bool _stereo; // render a second fovea for the right eye
uint _nRays; // NOTE(review): never assigned in this file -- verify usage
uint _nSamples; // NOTE(review): never assigned in this file -- verify usage
sptr<Camera> _fullCam; // full-frame camera
sptr<Camera> _cams[NUM_LAYERS]; // per-layer cameras (fovea -> periphery)
sptr<Msl> _nets[NUM_NETS]; // fovea / periphery networks
sptr<InferPipeline> _infers[NUM_NETS]; // one pipeline per network
sptr<Enhancement> _enhancements[NUM_LAYERS];
sptr<ImageGen> _imageGens[NUM_LAYERS + 1]; // +1 for the stereo right fovea
sptr<CudaArray<glm::vec3>> _rays; // all layers' rays, packed
sptr<CudaArray<glm::vec4>> _clrs; // inferred colors, packed like _rays
sptr<CudaArray<glm::vec4>> _imageData[NUM_LAYERS + 1];
};
// Build the synthesis pipeline: load the fovea/periphery TensorRT nets, wire
// up per-layer inference, image generation and enhancement, and allocate the
// packed ray/color buffers (with an extra fovea's worth in stereo mode).
// Preconditions: layerCams.size() >= NUM_LAYERS; nSamples has >= NUM_NETS entries.
FoveatedSynthesis_Impl::FoveatedSynthesis_Impl(const std::string& dataDir, glm::vec2 depthRange,
    uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
    const std::vector<sptr<Camera>>& layerCams, bool stereo) :
    // Fix: initializer order now matches declaration order (_stereo is
    // declared before _fullCam), silencing -Wreorder.
    _stereo(stereo), _fullCam(cam) {
    // Load nets
    for (uint i = 0; i < NUM_NETS; ++i)
        _nets[i].reset(new Msl());
    _nets[0]->load(dataDir + "/fovea.trt");
    _nets[1]->load(dataDir + "/periph.trt");
    // Init cams
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _cams[i] = layerCams[i];
    // Count rays per layer; stereo renders the fovea layer twice.
    uint nRays[NUM_LAYERS];
    uint nTotRays = 0;
    for (uint i = 0; i < NUM_LAYERS; ++i)
        nTotRays += nRays[i] = _cams[i]->nRays();
    if (_stereo)
        nTotRays += nRays[0];
    // Init infers: net 0 handles the fovea, net 1 both periphery layers.
    _infers[0].reset(new InferPipeline(_nets[0], nRays[0], nSamples[0],
        depthRange, encodeDim, coordChns));
    _infers[1].reset(new InferPipeline(_nets[1], nRays[1] + nRays[2], nSamples[1],
        depthRange, encodeDim, coordChns));
    // Init image gens
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _imageGens[i].reset(new ImageGen(_cams[i]->res()));
    if (_stereo)
        _imageGens[STEREO_FOVEA_R].reset(new ImageGen(_cams[0]->res()));
    // Init enhancements (per-layer sharpening/contrast parameters).
    glm::vec2 enhancementParams[] = {
        {3.0f, 0.2f}, {5.0f, 0.2f}, {5.0f, 0.2f}
    };
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _enhancements[i].reset(new Enhancement(_cams[i]->res(), enhancementParams[i]));
    // Create the packed ray/color buffers shared by all layers.
    _rays.reset(new CudaArray<glm::vec3>(nTotRays));
    _clrs.reset(new CudaArray<glm::vec4>(nTotRays));
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _imageData[i].reset(new CudaArray<glm::vec4>(_cams[i]->nPixels()));
    if (_stereo)
        _imageData[STEREO_FOVEA_R].reset(new CudaArray<glm::vec4>(_cams[0]->nPixels()));
}
// Synthesize one frame: generate rays for every layer, run inference, restore
// the layer images from the packed color buffer, enhance and upload them.
// In stereo mode the fovea layer is rendered once per eye (foveaPos for the
// left, foveaPosR for the right) while periphery layers use the center view.
// Stage timings are taken with CUDA events and logged when showPerf is set.
void FoveatedSynthesis_Impl::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR) {
CudaEvent eStart, eGenRays, eInferred, eGenImage, eEnhance;
uint offset;
cudaEventRecord(eStart);
// Convert fovea positions from pixels to normalized camera-plane offsets.
glm::vec2 foveaOffset(foveaPos - (glm::vec2)_fullCam->res() / 2.0f);
foveaOffset /= _fullCam->f();
glm::vec3 foveaOffset3(foveaOffset.x, foveaOffset.y, 0.0f);
glm::vec2 foveaOffsetR(foveaPosR - (glm::vec2)_fullCam->res() / 2.0f);
foveaOffsetR /= _fullCam->f();
glm::vec3 foveaOffset3R(foveaOffsetR.x, foveaOffsetR.y, 0.0f);
// 0.06 = interpupillary distance in meters (hard-coded).
auto viewL = view.getStereoEye(0.06f, Eye_Left);
auto viewR = view.getStereoEye(0.06f, Eye_Right);
// Generate rays, packed into _rays in layer order (stereo appends a second
// fovea for the right eye at the end).
if (_stereo) {
offset = 0;
_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL, foveaOffset3);
offset += _cams[0]->nRays();
_cams[1]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, (foveaOffset3 + foveaOffset3R) / 2.0f);
offset += _cams[1]->nRays();
_cams[2]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, {});
offset += _cams[2]->nRays();
_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR, foveaOffset3R);
} else {
offset = 0;
for (uint i = 0; i < NUM_LAYERS; ++i) {
_cams[i]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)),
view, i == NUM_LAYERS - 1 ? glm::vec3() : foveaOffset3);
offset += _cams[i]->nRays();
}
}
cudaEventRecord(eGenRays);
// Inference: infer 0 = fovea net, infer 1 = periphery net (both periphery
// layers in one batch); stereo runs the fovea net once more for the right eye.
if (_stereo) {
offset = 0;
_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL.t(), showPerf);
offset += _infers[0]->nRays();
_infers[1]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
offset += _infers[1]->nRays();
_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR.t(), showPerf);
} else {
offset = 0;
for (uint i = 0; i < NUM_NETS; ++i) {
_infers[i]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
offset += _infers[i]->nRays();
}
}
cudaEventRecord(eInferred);
// Scatter the packed colors back into per-layer images.
offset = 0;
for (uint i = 0; i < NUM_LAYERS; ++i) {
_cams[i]->restoreImage(_imageData[i], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
offset += _cams[i]->nRays();
}
if (_stereo)
_cams[0]->restoreImage(_imageData[STEREO_FOVEA_R], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
cudaEventRecord(eGenImage);
// Per-layer post-process enhancement.
for (uint i = 0; i < NUM_LAYERS; ++i)
_enhancements[i]->run(_imageData[i]);
if (_stereo)
_enhancements[0]->run(_imageData[STEREO_FOVEA_R]);
cudaEventRecord(eEnhance);
// Sync so the event timings below are valid and the image data is complete
// before uploading to GL.
CHECK_EX(cudaDeviceSynchronize());
for (uint i = 0; i < NUM_LAYERS; ++i)
_imageGens[i]->run(_imageData[i]);
if (_stereo)
_imageGens[STEREO_FOVEA_R]->run(_imageData[STEREO_FOVEA_R]);
float timeTotal, timeGenRays, timeInfer, timeGenImage, timeEnhance;
cudaEventElapsedTime(&timeTotal, eStart, eGenImage);
cudaEventElapsedTime(&timeGenRays, eStart, eGenRays);
cudaEventElapsedTime(&timeInfer, eGenRays, eInferred);
cudaEventElapsedTime(&timeGenImage, eInferred, eGenImage);
cudaEventElapsedTime(&timeEnhance, eGenImage, eEnhance);
if (showPerf) {
std::ostringstream sout;
sout << "Synthesis => Total: " << timeTotal << "ms (Gen rays: " << timeGenRays
<< "ms, Infer: " << timeInfer << "ms, Gen image: " << timeGenImage
<< "ms, Enhance: " << timeEnhance << "ms)";
Logger::instance.info(sout.str().c_str());
}
}
// GL texture with the synthesized image of layer `index`
// (STEREO_FOVEA_R = right-eye fovea in stereo mode).
GLuint FoveatedSynthesis_Impl::getGlResultTexture(uint index) {
return _imageGens[index]->getGlResultTexture();
}
// Public facade: forwards construction to the private implementation (pimpl).
FoveatedSynthesis::FoveatedSynthesis(const std::string& dataDir, glm::vec2 depthRange,
uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
const std::vector<sptr<Camera>>& layerCams, bool stereo) :
_impl(new FoveatedSynthesis_Impl(dataDir, depthRange, nSamples, encodeDim, coordChns, cam, layerCams, stereo)) {
}
// Synthesize one frame (see FoveatedSynthesis_Impl::run).
void FoveatedSynthesis::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR) {
_impl->run(view, foveaPos, showPerf, foveaPosR);
}
// GL texture of the synthesized layer `index` (see the Impl for index semantics).
GLuint FoveatedSynthesis::getGlResultTexture(uint index) {
return _impl->getGlResultTexture(index);
}
#pragma once
#include "../utils/common.h"
#include "View.h"
class FoveatedSynthesis_Impl;
// Public facade for the foveated neural view synthesis pipeline (pimpl).
class FoveatedSynthesis {
public:
// dataDir must contain fovea.trt and periph.trt; layerCams are the per-layer
// cameras (fovea -> periphery); stereo renders a second right-eye fovea.
FoveatedSynthesis(const std::string& dataDir, glm::vec2 depthRange, uint nSamples[],
uint encodeDim, uint coordChns, sptr<Camera> cam,
const std::vector<sptr<Camera>>& layerCams, bool stereo = false);
// Synthesize one frame; foveaPosR is only used when stereo was enabled.
void run(View& view, glm::vec2 foveaPos, bool showPerf = false, glm::vec2 foveaPosR = {});
// GL texture of the synthesized layer `index`.
GLuint getGlResultTexture(uint index);
private:
sptr<FoveatedSynthesis_Impl> _impl; // private implementation
};
\ No newline at end of file
#include "InferPipeline.h"
#include "Nmsl2.h"
// Set up the sample -> encode -> infer -> render pipeline for one network:
// allocates all intermediate device buffers sized for nRays * nSamplesPerRay
// and binds them to the network once, so run() does no per-frame allocation.
// NOTE(review): Encoder is constructed with two arguments here while its
// declaration takes (multires, chns, catInput) -- verify the intended
// catInput setting against the Encoder header in use.
InferPipeline::InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay, glm::vec2 depthRange,
uint encodeDim, uint coordChns)
: _nRays(nRays),
_nSamplesPerRay(nSamplesPerRay),
_coordChns(coordChns),
_net(net),
_sampler(new Sampler(depthRange, nSamplesPerRay, coordChns == 3)),
_encoder(new Encoder(encodeDim, coordChns)),
_renderer(new Renderer()) {
auto nSamples = _nRays * _nSamplesPerRay;
// Per-sample buffers reused every frame.
_coords = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * coordChns));
_depths = sptr<CudaArray<float>>(new CudaArray<float>(nSamples));
_encoded = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * _encoder->outDim()));
_layeredColors = sptr<CudaArray<glm::vec4>>(new CudaArray<glm::vec4>(nSamples));
// The net reads _encoded/_depths and writes _layeredColors in infer().
_net->bindResources(_encoded.get(), _depths.get(), _layeredColors.get());
}
// Run the full pipeline for one batch of rays: sample points along each ray,
// positionally encode them, run network inference and composite the layered
// outputs into o_colors. Inputs are truncated to the first _nRays entries.
// Each stage is followed by a device sync for error checking and so the CUDA
// event timings below are meaningful (costs async overlap by design).
void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays,
glm::vec3 origin, bool showPerf) {
rays = sptr<CudaArray<glm::vec3>>(rays->subArray(0, _nRays));
o_colors = sptr<CudaArray<glm::vec4>>(o_colors->subArray(0, _nRays));
CudaEvent eStart, eSampled, eEncoded, eInferred, eRendered;
cudaEventRecord(eStart);
// Stage 1: sample coordinates and depths along each ray from `origin`.
_sampler->sampleOnRays(_coords, _depths, rays, origin);
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eSampled);
// Stage 2: positional-encode the sampled coordinates.
_encoder->encode(_encoded, _coords);
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eEncoded);
// Stage 3: network inference (buffers were bound in the constructor).
_net->infer();
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eInferred);
// Stage 4: composite layered colors into the final per-ray colors.
_renderer->render(o_colors, _layeredColors);
cudaEventRecord(eRendered);
if (showPerf) {
CHECK_EX(cudaDeviceSynchronize());
float timeTotal, timeSample, timeEncode, timeInfer, timeRender;
cudaEventElapsedTime(&timeTotal, eStart, eRendered);
cudaEventElapsedTime(&timeSample, eStart, eSampled);
cudaEventElapsedTime(&timeEncode, eSampled, eEncoded);
cudaEventElapsedTime(&timeInfer, eEncoded, eInferred);
cudaEventElapsedTime(&timeRender, eInferred, eRendered);
std::ostringstream sout;
sout << "Infer pipeline: " << timeTotal << "ms (Sample: " << timeSample
<< "ms, Encode: " << timeEncode << "ms, Infer: " << timeInfer
<< "ms, Render: " << timeRender << "ms)";
Logger::instance.info(sout.str().c_str());
}
// Debug dumps of each intermediate buffer (disabled).
/*
{
std::ostringstream sout;
sout << "Rays:" << std::endl;
dumpArray<glm::vec3, float>(sout, *rays, 10);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Spherical coords:" << std::endl;
dumpArray(sout, *_coords, 10, _coordChns * _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Depths:" << std::endl;
dumpArray(sout, *_depths, 10, _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Encoded:" << std::endl;
dumpArray(sout, *_encoded, 10, _encoder->outDim() * _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Color:" << std::endl;
dumpArray<glm::vec4, float>(sout, *o_colors, 10);
Logger::instance.info(sout.str());
}
*/
}
\ No newline at end of file
#pragma once
#include "../utils/common.h"
#include "Sampler.h"
#include "Encoder.h"
#include "Renderer.h"
#include "Msl.h"
// Drives one network's sample -> encode -> infer -> render pipeline with
// preallocated device buffers (see InferPipeline.cpp).
class InferPipeline {
public:
// net: network to run; nRays x nSamplesPerRay sizes all buffers;
// depthRange/encodeDim/coordChns configure sampling and encoding.
InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay,
glm::vec2 depthRange, uint encodeDim, uint coordChns);
// Process the first nRays() entries of `rays` from `origin` into o_colors.
void run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays, glm::vec3 origin,
bool showPerf = false);
uint nRays() const { return _nRays; }
private:
uint _nRays; // rays per batch
uint _nSamplesPerRay; // samples along each ray
uint _coordChns; // coordinate channels (3 = spherical/xyz sampling)
sptr<Msl> _net;
sptr<Sampler> _sampler;
sptr<Encoder> _encoder;
sptr<Renderer> _renderer;
// Intermediate device buffers, sized nRays * nSamplesPerRay (* channels):
sptr<CudaArray<float>> _coords;
sptr<CudaArray<float>> _depths;
sptr<CudaArray<float>> _encoded;
sptr<CudaArray<glm::vec4>> _layeredColors;
};
\ No newline at end of file
#include "../utils/half.h"
#include "Net.h"
#include <fstream>
#include <numeric>
#include <assert.h>
#include <time.h>
#include <NvOnnxParser.h>
// Deserialize a TensorRT engine from `path`, create an execution context and
// log the engine's bindings. Returns false if either step fails.
bool Net::load(const std::string &path)
{
_deserialize(path);
if (!mEngine)
{
Logger::instance.error("Failed to build net: failed to load engine.");
return false;
}
mContext = std::shared_ptr<nv::IExecutionContext>(
mEngine->createExecutionContext(), Destroy<nv::IExecutionContext>());
if (!mContext)
return false;
// Log every binding (name, direction, dimensions) for diagnostics.
std::ostringstream sout;
sout << "NbBindings: " << mEngine->getNbBindings() << std::endl;
for (auto i = 0; i < mEngine->getNbBindings(); ++i)
{
auto name = mEngine->getBindingName(i);
auto dims = mEngine->getBindingDimensions(i);
//nv::DataType type = mEngine->getBindingDataType(i);
auto isInput = mEngine->bindingIsInput(i);
sout << "Binding " << i << ": " << name << "("
<< (isInput ? "Input " : "Output ")
<< Formatter::toString(dims) << ")\n";
}
Logger::instance.info(sout.str().c_str());
return true;
}
// Register a named device resource; at inference time it is bound to the
// engine binding with the same name (unmatched names are ignored).
void Net::bindResource(const std::string &name, Resource *res)
{
mResources.addResource(name, res);
}
// Release all bound resources and drop the engine. Always returns true.
bool Net::dispose()
{
mResources.clear();
mEngine = nullptr;
return true;
}
// Enqueue inference on `stream` with all registered resources bound to their
// engine slots. Graphics resources are mapped for CUDA access for the
// duration of the call (unmapped when mapScope leaves scope). When
// dumpInputOutput is set, synchronizes and logs every buffer (debug aid).
// Returns false if enqueueing fails.
bool Net::infer(cudaStream_t stream, bool dumpInputOutput)
{
CudaMapScope mapScope(mResources.graphicsResources);
CHECK(mapScope.map());
auto bindings = _getBindings();
if (!mContext->enqueueV2(bindings.data(), stream, nullptr))
{
Logger::instance.error("Failed to enqueue inference");
return false;
}
// Older synchronous/timed execution path, kept for reference.
/*
if (stream == nullptr) {
auto inferStart = clock();
if (!mContext->executeV2(bindings.data())) {
Logger::instance.error("Failed to execute inference");
return false;
}
auto inferEnd = clock();
std::ostringstream sout;
sout << "Infer takes " << INTERVAL(inferStart, inferEnd) << "ms" << std::endl;
Logger::instance.info(sout.str());
} else {
if (!mContext->enqueueV2(bindings.data(), stream, nullptr)) {
Logger::instance.error("Failed to enqueue inference");
return false;
}
}
*/
if (dumpInputOutput)
{
// Ensure the enqueued work has finished before reading buffers back.
if (stream != nullptr)
CHECK(cudaStreamSynchronize(stream));
_dumpInputOutput();
}
return true;
}
void Net::_deserialize(const std::string &path)
{
std::ifstream fin(path, std::ios::in | std::ios::binary);
if (!fin.is_open())
return;
std::streampos begin, end;
begin = fin.tellg();
fin.seekg(0, std::ios::end);
end = fin.tellg();
std::size_t size = end - begin;
fin.seekg(0, std::ios::beg);
char *engine_data = new char[size];
fin.read(engine_data, size);
fin.close();
uptr<nv::IRuntime> runtime(nv::createInferRuntime(Logger::instance));
mEngine = std::shared_ptr<nv::ICudaEngine>(
runtime->deserializeCudaEngine(engine_data, size, nullptr),
Destroy<nv::ICudaEngine>());
delete[] engine_data;
Logger::instance.info("Engine is deserialized");
}
// Build the engine's binding-pointer array from the registered resources.
// Resources whose name matches no engine binding are skipped; binding slots
// with no matching resource stay null.
std::vector<void *> Net::_getBindings()
{
    std::vector<void *> bindings(mEngine->getNbBindings());
    for (auto &entry : mResources.resources)
    {
        auto slot = mEngine->getBindingIndex(entry.first.c_str());
        if (slot >= 0)
            bindings[slot] = entry.second->getBuffer();
    }
    return bindings;
}
// Log the shape and contents of every bound input/output buffer (debug aid).
void Net::_dumpInputOutput()
{
    auto bindings = _getBindings();
    for (auto it = mResources.resources.begin();
         it != mResources.resources.end(); ++it)
    {
        auto idx = mEngine->getBindingIndex(it->first.c_str());
        if (idx < 0)
            continue;
        // The original input/output branches were identical except for the
        // label; merged into one path.
        std::ostringstream sout;
        sout << (mEngine->bindingIsInput(idx) ? "Input" : "Output")
             << " Buffer " << it->first << ": ";
        _dumpBuffer(sout, bindings[idx], idx);
        Logger::instance.info(sout.str().c_str());
    }
}
// Convenience overload: dump a device buffer using the engine's recorded
// dimensions and data type for binding `index`.
bool Net::_dumpBuffer(std::ostream &os, void *deviceBuf, int index)
{
return _dumpBuffer(os, deviceBuf, mEngine->getBindingDimensions(index),
mEngine->getBindingDataType(index));
}
bool Net::_dumpBuffer(std::ostream &os, void *deviceBuf, nv::Dims bufDims, nv::DataType dataType)
{
auto size = std::accumulate(bufDims.d, bufDims.d + bufDims.nbDims, 1,
std::multiplies<int64_t>()) *
getElementSize(dataType);
char *hostBuf = new char[size];
CHECK(cudaMemcpyAsync(hostBuf, deviceBuf, size, cudaMemcpyDeviceToHost));
int mBatchSize = 0;
size_t rowCount = static_cast<size_t>(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize);
int leadDim = mBatchSize;
int *trailDims = bufDims.d;
int nbDims = bufDims.nbDims;
// Fix explicit Dimension networks
if (!leadDim && nbDims > 0)
{
leadDim = bufDims.d[0];
++trailDims;
--nbDims;
}
os << "[" << leadDim;
for (int i = 0; i < nbDims; i++)
os << ", " << trailDims[i];
os << "]" << std::endl;
switch (dataType)
{
case nv::DataType::kINT32:
dumpHostBuffer<int32_t>(os, (int32_t*)hostBuf, size, rowCount);
break;
case nv::DataType::kFLOAT:
dumpHostBuffer<float>(os, (float*)hostBuf, size, rowCount);
break;
case nv::DataType::kHALF:
dumpHostBuffer<half_float::half>(os, (half_float::half*)hostBuf, size, rowCount);
break;
case nv::DataType::kINT8:
assert(0 && "Int8 network-level input and output is not supported");
break;
case nv::DataType::kBOOL:
assert(0 && "Bool network-level input and output are not supported");
break;
}
return true;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment