Commit 6294701e authored by Nianchen Deng's avatar Nianchen Deng
Browse files

sync

parent 2824f796
#pragma once
#include "../utils/common.h"
#include "../utils/Shader.h"
// Renders a cross marker (e.g. a gaze/fovea indicator) on top of the frame
// using a small dedicated GL shader program.
class CrossRenderer {
public:
// frameRes: output frame resolution in pixels; crossSize: size of the cross;
// crossColor: RGBA color used to draw the cross.
CrossRenderer(glm::vec2 frameRes, float crossSize, glm::vec4 crossColor);
// Draw the cross centered at p; globalShift is an additional horizontal
// offset (presumably in pixels -- TODO confirm against the shader source).
void render(glm::vec2 p, float globalShift);
private:
glm::vec2 _frameRes; // target frame resolution (shader uniform value)
float _crossSize; // cross size (shader uniform value)
glm::vec4 _crossColor; // cross color (shader uniform value)
sptr<Shader> _shader; // compiled cross shader program
GLuint _vertBuf; // vertex buffer holding the cross geometry
// Cached uniform locations of the cross shader:
GLuint _shaderProp_crossSize;
GLuint _shaderProp_crossColor;
GLuint _shaderProp_crossPos;
GLuint _shaderProp_frameRes;
GLuint _shaderProp_globalShift;
GLuint _loc_MVP;
};
\ No newline at end of file
#include "Encoder.h"
#include "../utils/cuda.h"
/// Copy the raw input coordinates into the leading `inChns` channels of the
/// encoded output (the "concatenate input" part of the positional encoding).
/// Full output layout per element (see cu_encode):
/// idx3.z = 0: x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z)
/// idx3.z = 1: sin(2x), sin(2y), sin(2z), cos(2x), cos(2y), cos(2z)
/// ...
/// idx3.z = n_freq-1: sin(2^(n_freq-1)x), sin(2^(n_freq-1)y), sin(2^(n_freq-1)z),
/// cos(2^(n_freq-1)x), cos(2^(n_freq-1)y), cos(2^(n_freq-1)z)
/// Dispatch (n, in_chns, n_freqs)
/// NOTE(review): idx3.z is never read, so a 3D launch with n_freqs in z makes
/// all z-threads redundantly write the same element -- harmless but wasteful.
/// The commented-out caller in Encoder::encode uses a 2D launch; confirm the
/// intended dispatch before reusing this kernel.
__global__ void cu_encode0(float *output, float *input, uint n, uint nFreqs) {
glm::uvec3 idx3 = IDX3;
if (idx3.x >= n)
return;
uint inChns = blockDim.y; // input channels = block y-dimension
uint outChns = inChns * (nFreqs * 2 + 1); // per-element output stride
uint i = idx3.x, chn = idx3.y;
output[i * outChns + chn] = input[i * inChns + chn];
}
/// Positional-encoding kernel: for each sample element and input channel,
/// write sin(freq * x) and cos(freq * x) for every frequency, optionally
/// prepending the raw input value when `catInput` is true.
///
/// Output layout per element (catInput = true, 3 channels):
///   x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z),
///   sin(2x), ..., cos(2^(nFreqs-1) z)
///
/// Dispatch (n, in_chns, n_freqs): blockDim.y = input channels,
/// blockDim.z = number of frequencies; one thread per (element, chn, freq).
/// `input`/`freqs` must not alias `output` (declared __restrict__ so the
/// compiler may use the read-only data cache).
__global__ void cu_encode(float *output, const float *__restrict__ input,
                          const float *__restrict__ freqs, uint n, bool catInput) {
    glm::uvec3 idx3 = IDX3;
    if (idx3.x >= n)
        return;
    // One extra leading channel group when the raw input is concatenated.
    uint offset = (uint)catInput;
    uint inChns = blockDim.y, nFreqs = blockDim.z;
    uint i = idx3.x, chn = idx3.y, freq = idx3.z;
    uint elem = i * inChns + chn;
    uint outChns = inChns * (nFreqs * 2 + offset);
    uint base = i * outChns + chn;
    // Only the freq==0 thread writes the raw value (avoids redundant stores).
    if (freq == 0 && catInput)
        output[base] = input[elem];
    float x = freqs[freq] * input[elem];
    float s, c;
    __sincosf(x, &s, &c); // fast-math sin/cos; adequate precision for encodings
    output[base + inChns * (freq * 2 + offset)] = s;
    output[base + inChns * (freq * 2 + offset + 1)] = c;
}
/// Positional-encoding kernel for 2D coordinates (one vec2 per element),
/// always concatenating the raw input:
///   out[i] = { p, sin(f0*p), cos(f0*p), sin(f1*p), cos(f1*p), ... }
/// Dispatch (n, n_freqs): blockDim.y = number of frequencies; one thread per
/// (element, freq). `input`/`freqs` must not alias `output` (declared
/// __restrict__ so the compiler may use the read-only data cache).
__global__ void cu_encode2(glm::vec2 *output, const glm::vec2 *__restrict__ input,
                           const float *__restrict__ freqs, uint n) {
    glm::uvec3 idx3 = IDX3;
    if (idx3.x >= n)
        return;
    uint nFreqs = blockDim.y;
    uint i = idx3.x, freq = idx3.y;
    uint outChns = nFreqs * 2 + 1; // raw value + (sin, cos) per frequency
    uint base = i * outChns;
    // Only the freq==0 thread writes the raw value.
    if (freq == 0)
        output[base] = input[i];
    glm::vec2 x = freqs[freq] * input[i];
    glm::vec2 s, c;
    __sincosf(x.x, &s.x, &c.x); // fast-math sin/cos
    __sincosf(x.y, &s.y, &c.y);
    output[base + (freq * 2 + 1)] = s;
    output[base + (freq * 2 + 2)] = c;
}
/**
 * @brief Positionally encode a batch of coordinates on the GPU (see cu_encode).
 *
 * @param output encoded data, (input->n() / _chns) x outDim()
 * @param input coord data, laid out as n x _chns
 */
void Encoder::encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input) {
    uint n = input->n() / _chns;
    // One thread per (element, channel, frequency). Guard against a zero x
    // dimension when _chns * _multires exceeds the 1024-threads-per-block limit.
    uint xThreads = 1024 / _chns / _multires;
    if (xThreads == 0) {
        Logger::instance.error("Encoder => _chns * _multires exceeds the block size limit");
        return;
    }
    // CU_INVOKE launches with the local `grdSize`/`blkSize`, so keep these names.
    dim3 blkSize(xThreads, _chns, _multires);
    dim3 grdSize(ceilDiv(n, blkSize.x), 1, 1);
    CU_INVOKE(cu_encode)(*output, *input, *_freqs, n, _catInput);
    // Debug alternative: copy-only kernel with a 2D launch.
    // blkSize = dim3(1024 / _chns, _chns);
    // grdSize = dim3(ceilDiv(n, blkSize.x), 1, 1);
    // CU_INVOKE(cu_encode0)(*output, *input, n, _multires);
    CHECK_EX(cudaGetLastError());
}
// Build the frequency table {1, 2, 4, ..., 2^(_multires-1)} on the host and
// upload it to the GPU as the CudaArray read by the encoding kernels.
// Assumes _multires >= 1 (arr[0] is always written).
void Encoder::_genFreqArray() {
    float *arr = new float[_multires];
    arr[0] = 1.0f;
    for (auto i = 1u; i < _multires; ++i) // unsigned index: matches _multires
        arr[i] = arr[i - 1] * 2.0f;
    _freqs.reset(new CudaArray<float>(_multires));
    auto err = cudaMemcpy(_freqs->getBuffer(), arr, _multires * sizeof(float),
                          cudaMemcpyHostToDevice);
    delete[] arr; // free before the check so CHECK_EX cannot leak the buffer
    // A silently failed upload would leave the kernels reading garbage.
    CHECK_EX(err);
}
#pragma once
#include "../utils/common.h"
// Positional encoder: expands each input coordinate channel into sin/cos
// features over `multires` octave frequencies (computed on the GPU).
class Encoder {
public:
    // multires: number of frequency octaves; chns: input channels per element;
    // catInput: when true, the raw input is concatenated ahead of the sin/cos
    // features. Defaults to true so existing two-argument callers (e.g.
    // InferPipeline) compile and get the conventional encoding layout.
    Encoder(unsigned int multires, unsigned int chns, bool catInput = true)
        : _multires(multires), _chns(chns), _catInput(catInput) {
        _genFreqArray();
    }
    // Encoded channels per element: chns * (catInput + 2 * multires).
    unsigned int outDim() const { return _chns * ((int)_catInput + _multires * 2); }
    // output: n x outDim(); input: n x chns (both device arrays).
    void encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input);

private:
    unsigned int _multires;        // number of frequency octaves
    unsigned int _chns;            // input channels per element
    bool _catInput;                // concatenate raw input before sin/cos features
    sptr<CudaArray<float>> _freqs; // device array {1, 2, ..., 2^(multires-1)}
    void _genFreqArray();          // build and upload _freqs
};
\ No newline at end of file
#include "FoveatedBlend.h"
#include <glm/gtx/transform.hpp>
// Full-screen quad: clip-space position (x, y) and texture coordinate (u, v)
// per corner. Note v is flipped (v = 1 at the bottom-left corner) to match
// the texture origin.
static const struct {
float x, y;
float u, v;
} vertices[4] = {
{-1.0f, -1.0f, 0.f, 1.f},
{1.0f, -1.0f, 1.f, 1.f},
{1.0f, 1.0f, 1.f, 0.f},
{-1.0f, 1.0f, 0.f, 0.f}
};
// Pass-through vertex shader: transforms the full-screen quad by MVP and
// forwards per-vertex UVs to the fragment stage.
static const char* vertex_shader_text =
"#version 300 es\n"
"uniform mat4 MVP;\n"
"in vec2 vUV;\n"
"in vec2 vPos;\n"
"out vec2 uv;\n"
"void main()\n"
"{\n"
" gl_Position = MVP * vec4(vPos, 0.0, 1.0);\n"
" uv = vUV;\n"
"}\n";
// Fragment shader for live (non-baked) foveated blending: remaps the layer
// texture around foveaCenter and fades its alpha out between innerR and
// outerR. When outerR ~ 0 the layer is treated as full-frame and sampled
// directly with the (shifted) UV.
// Two leftover debug writes to FragColor were removed: a sample taken with
// the un-shifted UV and a coord-visualization write; both were dead stores
// (every control path reassigned FragColor before the shader ended).
static const char* fragment_shader_text =
"#version 300 es\n"
"precision highp float;\n"
"out vec4 FragColor;\n"
"in vec2 uv;\n"
"uniform sampler2D tex;\n"
"uniform float innerR;\n"
"uniform float outerR;\n"
"uniform float shift;\n"
"uniform float globalShift;\n"
"uniform vec2 foveaCenter;\n"
"uniform vec2 frameRes;\n"
"void main()\n"
"{\n"
" vec2 u = uv;"
" u.x += (shift + globalShift) / frameRes.x;\n"
" if(outerR < 1e-2) {\n"
" FragColor = texture(tex, u);\n"
" return;\n"
" }\n"
" vec2 p = u * frameRes;\n"
" float r = distance(p, foveaCenter);\n"
" vec2 coord = (p - foveaCenter) / outerR / 2.0 + 0.5;\n"
" if(coord.x < 0.0 || coord.x > 1.0 || coord.y < 0.0 || coord.y > 1.0) {\n"
" FragColor = vec4(0, 0, 0, 0);\n"
" return;\n"
" }\n"
" vec4 c = texture(tex, coord);\n"
" float alpha = 1.0 - smoothstep((outerR - innerR) * 0.6 + innerR, outerR, r);\n"
" c.a = c.a * alpha;\n"
" FragColor = c;\n"
"}\n";
// Fragment shader for pre-baked layers: same radial alpha fade as the live
// variant, but samples with the shifted full-frame UV `u` rather than the
// remapped `coord` -- presumably because baked layers are already stored at
// full-frame resolution (TODO confirm; `coord` is only used for bounds/clip).
static const char* fragment_shader_text_baked =
"#version 300 es\n"
"precision highp float;\n"
"out vec4 FragColor;\n"
"in vec2 uv;\n"
"uniform sampler2D tex;\n"
"uniform float innerR;\n"
"uniform float outerR;\n"
"uniform float shift;\n"
"uniform float globalShift;\n"
"uniform vec2 foveaCenter;\n"
"uniform vec2 frameRes;\n"
"void main()\n"
"{\n"
" vec2 u = uv;"
" u.x += (shift + globalShift) / frameRes.x;\n"
" if(outerR < 1e-2) {\n"
" FragColor = texture(tex, u);\n"
" return;\n"
" }\n"
" vec2 p = u * frameRes;\n"
" float r = distance(p, foveaCenter);\n"
" vec2 coord = (p - foveaCenter) / outerR / 2.0 + 0.5;\n"
" if(coord.x < 0.0 || coord.x > 1.0 || coord.y < 0.0 || coord.y > 1.0) {\n"
" FragColor = vec4(0, 0, 0, 0);\n"
" return;\n"
" }\n"
" vec4 c = texture(tex, u);\n"
" float alpha = 1.0 - smoothstep((outerR - innerR) * 0.6 + innerR, outerR, r);\n"
" c.a = c.a * alpha;\n"
" FragColor = c;\n"
"}\n";
// Compile the blend shader (live or baked variant), upload the full-screen
// quad, cache uniform/attribute locations and create the GPU timer query.
// NOTE(review): no VAO is created/bound here -- assumes the GL context
// provides one (or a compatibility profile); confirm against the app setup.
FoveatedBlend::FoveatedBlend(sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams, bool forBaked) :
_cam(cam), _layerCams(layerCams), _forBaked(forBaked) {
_blendShader.reset(new Shader("FoveaBlend", vertex_shader_text,
forBaked ? fragment_shader_text_baked : fragment_shader_text));
_blendShader->compile();
// Upload the full-screen quad geometry.
glGenBuffers(1, &_vertBuf);
glBindBuffer(GL_ARRAY_BUFFER, _vertBuf);
glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
// Cache uniform locations once; run() uploads values every frame.
auto program = _blendShader->getId();
_shaderProp_tex = glGetUniformLocation(program, "tex");
_shaderProp_innerR = glGetUniformLocation(program, "innerR");
_shaderProp_outerR = glGetUniformLocation(program, "outerR");
_shaderProp_shift = glGetUniformLocation(program, "shift");
_shaderProp_globalShift = glGetUniformLocation(program, "globalShift");
_shaderProp_frameRes = glGetUniformLocation(program, "frameRes");
_shaderProp_foveaCenter = glGetUniformLocation(program, "foveaCenter");
_loc_MVP = glGetUniformLocation(program, "MVP");
// Bind quad attributes: position first, UV after (interleaved layout).
auto loc_vPos = glGetAttribLocation(program, "vPos");
auto loc_vUV = glGetAttribLocation(program, "vUV");
glEnableVertexAttribArray(loc_vPos);
glVertexAttribPointer(loc_vPos, 2, GL_FLOAT, GL_FALSE, sizeof(vertices[0]), (void*)0);
glEnableVertexAttribArray(loc_vUV);
glVertexAttribPointer(loc_vUV, 2, GL_FLOAT, GL_FALSE, sizeof(vertices[0]),
(void*)(sizeof(float) * 2));
// Timer query used by run(showPerf = true).
glGenQueries(1, &_glQuery);
}
// Blend the layer textures into the current framebuffer: draw the outermost
// (full-frame) layer first, then each foveal layer on top with a radial
// alpha fade. glTexs must hold one GL texture per layer camera.
// showPerf busy-waits on a GPU timer query and logs the elapsed time.
void FoveatedBlend::run(GLuint glTexs[], glm::vec2 foveaPos, float shift, float globalShift, bool showPerf) {
    glm::mat4 mvp = glm::ortho(-1.f, 1.f, -1.f, 1.f, 1.f, -1.f);
    glBeginQuery(GL_TIME_ELAPSED, _glQuery);
    glUseProgram(_blendShader->getId());
    glUniformMatrix4fv(_loc_MVP, 1, GL_FALSE, (float*)&mvp[0][0]);
    glUniform1i(_shaderProp_tex, 0);
    glEnable(GL_TEXTURE_2D);
    glActiveTexture(GL_TEXTURE0);
    // Outermost layer: outerR = 0 makes the shader sample full-frame directly.
    int i = _layerCams.size() - 1;
    glUniform1f(_shaderProp_outerR, 0.0f);
    glUniform1f(_shaderProp_shift, shift);
    glUniform1f(_shaderProp_globalShift, globalShift);
    glUniform2f(_shaderProp_frameRes, _cam->res().x, _cam->res().y);
    glUniform2f(_shaderProp_foveaCenter, _cam->res().x / 2.0f, _cam->res().y / 2.0f);
    glBindTexture(GL_TEXTURE_2D, glTexs[i]);
    glDrawArrays(GL_QUADS, 0, 4);
    // Remaining layers, from periphery towards the fovea.
    for (i -= 1; i >= 0; --i) {
        // Layer radii in full-frame pixels, derived from each layer camera's FOV.
        auto outerR = _layerCams[i]->res().y / _layerCams[i]->f().y * _cam->f().y * 0.5f;
        auto innerR = i == 0 ?
            0.0f :
            _layerCams[i - 1]->res().y / _layerCams[i - 1]->f().y * _cam->f().y * 0.5f;
        glUniform1f(_shaderProp_outerR, outerR);
        // Fix: innerR was computed but never uploaded, so the shader's fade
        // always started from innerR = 0 (the uniform's default value).
        glUniform1f(_shaderProp_innerR, innerR);
        glUniform1f(_shaderProp_shift, _forBaked ? shift : 0.0f);
        glUniform1f(_shaderProp_globalShift, globalShift);
        glUniform2f(_shaderProp_frameRes, _cam->res().x, _cam->res().y);
        glUniform2f(_shaderProp_foveaCenter, foveaPos.x, foveaPos.y);
        glBindTexture(GL_TEXTURE_2D, glTexs[i]);
        glDrawArrays(GL_QUADS, 0, 4);
    }
    glDisable(GL_TEXTURE_2D);
    glEndQuery(GL_TIME_ELAPSED);
    if (showPerf) {
        // Busy-wait until the timer query result is available.
        GLint available = 0;
        while (!available)
            glGetQueryObjectiv(_glQuery, GL_QUERY_RESULT_AVAILABLE, &available);
        // Timer queries can exceed 32 bits, so read the 64-bit result.
        GLuint64 timeElapsed = 0;
        glGetQueryObjectui64v(_glQuery, GL_QUERY_RESULT, &timeElapsed);
        Logger::instance.info("Blend: %fms", timeElapsed / 1000000.0f);
    }
}
\ No newline at end of file
#pragma once
#include "../utils/common.h"
#include "../utils/Shader.h"
#include "View.h"
// Blends multiple foveated layer textures into one image drawn as
// screen-aligned quads (shaders live in FoveatedBlend.cpp).
class FoveatedBlend {
public:
// cam: full-frame camera; layerCams: per-layer cameras (fovea -> periphery);
// forBaked: select the shader variant for pre-baked full-frame layers.
FoveatedBlend(sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams, bool forBaked = false);
// Draw all layers; glTexs holds one GL texture per layer camera. foveaPos is
// in full-frame pixels; shift/globalShift are horizontal offsets fed to the
// shader (presumably pixels, for stereo adjustment -- TODO confirm).
// showPerf logs the GPU blend time.
void run(GLuint glTexs[], glm::vec2 foveaPos, float shift, float globalShift, bool showPerf = false);
private:
bool _forBaked; // selects the baked fragment shader
sptr<Camera> _cam; // full-frame camera
std::vector<sptr<Camera>> _layerCams; // per-layer cameras
sptr<Shader> _blendShader; // compiled blend program
GLuint _vertBuf; // full-screen quad vertex buffer
// Cached uniform locations:
GLuint _shaderProp_tex;
GLuint _shaderProp_innerR;
GLuint _shaderProp_outerR;
GLuint _shaderProp_shift;
GLuint _shaderProp_globalShift;
GLuint _shaderProp_frameRes;
GLuint _shaderProp_foveaCenter;
GLuint _glQuery; // GL_TIME_ELAPSED query for perf logging
GLuint _loc_MVP;
};
\ No newline at end of file
#include "FoveatedSynthesis.h"
#include "InferPipeline.h"
#include "Enhancement.h"
#include "ImageGen.h"
constexpr auto NUM_LAYERS = 3u; // fovea + two periphery layers
constexpr auto STEREO_FOVEA_R = NUM_LAYERS; // extra slot for the right-eye fovea in stereo mode
constexpr auto NUM_NETS = 2u; // fovea net and periphery net
// Private implementation of FoveatedSynthesis: owns the TensorRT nets,
// per-layer cameras, inference pipelines, image generation / enhancement
// stages and the shared ray/color buffers.
class FoveatedSynthesis_Impl {
public:
FoveatedSynthesis_Impl(const std::string& dataDir, glm::vec2 depthRange, uint nSamples[],
uint encodeDim, uint coordChns, sptr<Camera> cam, const std::vector<sptr<Camera>>& layerCams,
bool stereo);
// Synthesize one frame; foveaPosR is only used in stereo mode.
void run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR);
// GL texture of layer `index` (STEREO_FOVEA_R = right-eye fovea).
GLuint getGlResultTexture(uint index);
private:
bool _stereo; // render a second fovea for the right eye
uint _nRays; // NOTE(review): never assigned in this file -- verify usage
uint _nSamples; // NOTE(review): never assigned in this file -- verify usage
sptr<Camera> _fullCam; // full-frame camera
sptr<Camera> _cams[NUM_LAYERS]; // per-layer cameras (fovea -> periphery)
sptr<Msl> _nets[NUM_NETS]; // fovea / periphery networks
sptr<InferPipeline> _infers[NUM_NETS]; // one pipeline per network
sptr<Enhancement> _enhancements[NUM_LAYERS];
sptr<ImageGen> _imageGens[NUM_LAYERS + 1]; // +1 for the stereo right fovea
sptr<CudaArray<glm::vec3>> _rays; // all layers' rays, packed
sptr<CudaArray<glm::vec4>> _clrs; // inferred colors, packed like _rays
sptr<CudaArray<glm::vec4>> _imageData[NUM_LAYERS + 1];
};
// Build the synthesis pipeline: load the fovea/periphery TensorRT nets, wire
// up per-layer inference, image generation and enhancement, and allocate the
// packed ray/color buffers (with an extra fovea's worth in stereo mode).
// Preconditions: layerCams.size() >= NUM_LAYERS; nSamples has >= NUM_NETS entries.
FoveatedSynthesis_Impl::FoveatedSynthesis_Impl(const std::string& dataDir, glm::vec2 depthRange,
    uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
    const std::vector<sptr<Camera>>& layerCams, bool stereo) :
    // Fix: initializer order now matches declaration order (_stereo is
    // declared before _fullCam), silencing -Wreorder.
    _stereo(stereo), _fullCam(cam) {
    // Load nets
    for (uint i = 0; i < NUM_NETS; ++i)
        _nets[i].reset(new Msl());
    _nets[0]->load(dataDir + "/fovea.trt");
    _nets[1]->load(dataDir + "/periph.trt");
    // Init cams
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _cams[i] = layerCams[i];
    // Count rays per layer; stereo renders the fovea layer twice.
    uint nRays[NUM_LAYERS];
    uint nTotRays = 0;
    for (uint i = 0; i < NUM_LAYERS; ++i)
        nTotRays += nRays[i] = _cams[i]->nRays();
    if (_stereo)
        nTotRays += nRays[0];
    // Init infers: net 0 handles the fovea, net 1 both periphery layers.
    _infers[0].reset(new InferPipeline(_nets[0], nRays[0], nSamples[0],
        depthRange, encodeDim, coordChns));
    _infers[1].reset(new InferPipeline(_nets[1], nRays[1] + nRays[2], nSamples[1],
        depthRange, encodeDim, coordChns));
    // Init image gens
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _imageGens[i].reset(new ImageGen(_cams[i]->res()));
    if (_stereo)
        _imageGens[STEREO_FOVEA_R].reset(new ImageGen(_cams[0]->res()));
    // Init enhancements (per-layer sharpening/contrast parameters).
    glm::vec2 enhancementParams[] = {
        {3.0f, 0.2f}, {5.0f, 0.2f}, {5.0f, 0.2f}
    };
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _enhancements[i].reset(new Enhancement(_cams[i]->res(), enhancementParams[i]));
    // Create the packed ray/color buffers shared by all layers.
    _rays.reset(new CudaArray<glm::vec3>(nTotRays));
    _clrs.reset(new CudaArray<glm::vec4>(nTotRays));
    for (uint i = 0; i < NUM_LAYERS; ++i)
        _imageData[i].reset(new CudaArray<glm::vec4>(_cams[i]->nPixels()));
    if (_stereo)
        _imageData[STEREO_FOVEA_R].reset(new CudaArray<glm::vec4>(_cams[0]->nPixels()));
}
// Synthesize one frame: generate rays for every layer, run inference, restore
// the layer images from the packed color buffer, enhance and upload them.
// In stereo mode the fovea layer is rendered once per eye (foveaPos for the
// left, foveaPosR for the right) while periphery layers use the center view.
// Stage timings are taken with CUDA events and logged when showPerf is set.
void FoveatedSynthesis_Impl::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR) {
CudaEvent eStart, eGenRays, eInferred, eGenImage, eEnhance;
uint offset;
cudaEventRecord(eStart);
// Convert fovea positions from pixels to normalized camera-plane offsets.
glm::vec2 foveaOffset(foveaPos - (glm::vec2)_fullCam->res() / 2.0f);
foveaOffset /= _fullCam->f();
glm::vec3 foveaOffset3(foveaOffset.x, foveaOffset.y, 0.0f);
glm::vec2 foveaOffsetR(foveaPosR - (glm::vec2)_fullCam->res() / 2.0f);
foveaOffsetR /= _fullCam->f();
glm::vec3 foveaOffset3R(foveaOffsetR.x, foveaOffsetR.y, 0.0f);
// 0.06 = interpupillary distance in meters (hard-coded).
auto viewL = view.getStereoEye(0.06f, Eye_Left);
auto viewR = view.getStereoEye(0.06f, Eye_Right);
// Generate rays, packed into _rays in layer order (stereo appends a second
// fovea for the right eye at the end).
if (_stereo) {
offset = 0;
_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL, foveaOffset3);
offset += _cams[0]->nRays();
_cams[1]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, (foveaOffset3 + foveaOffset3R) / 2.0f);
offset += _cams[1]->nRays();
_cams[2]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, {});
offset += _cams[2]->nRays();
_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR, foveaOffset3R);
} else {
offset = 0;
for (uint i = 0; i < NUM_LAYERS; ++i) {
_cams[i]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)),
view, i == NUM_LAYERS - 1 ? glm::vec3() : foveaOffset3);
offset += _cams[i]->nRays();
}
}
cudaEventRecord(eGenRays);
// Inference: infer 0 = fovea net, infer 1 = periphery net (both periphery
// layers in one batch); stereo runs the fovea net once more for the right eye.
if (_stereo) {
offset = 0;
_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL.t(), showPerf);
offset += _infers[0]->nRays();
_infers[1]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
offset += _infers[1]->nRays();
_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR.t(), showPerf);
} else {
offset = 0;
for (uint i = 0; i < NUM_NETS; ++i) {
_infers[i]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
offset += _infers[i]->nRays();
}
}
cudaEventRecord(eInferred);
// Scatter the packed colors back into per-layer images.
offset = 0;
for (uint i = 0; i < NUM_LAYERS; ++i) {
_cams[i]->restoreImage(_imageData[i], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
offset += _cams[i]->nRays();
}
if (_stereo)
_cams[0]->restoreImage(_imageData[STEREO_FOVEA_R], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
cudaEventRecord(eGenImage);
// Per-layer post-process enhancement.
for (uint i = 0; i < NUM_LAYERS; ++i)
_enhancements[i]->run(_imageData[i]);
if (_stereo)
_enhancements[0]->run(_imageData[STEREO_FOVEA_R]);
cudaEventRecord(eEnhance);
// Sync so the event timings below are valid and the image data is complete
// before uploading to GL.
CHECK_EX(cudaDeviceSynchronize());
for (uint i = 0; i < NUM_LAYERS; ++i)
_imageGens[i]->run(_imageData[i]);
if (_stereo)
_imageGens[STEREO_FOVEA_R]->run(_imageData[STEREO_FOVEA_R]);
float timeTotal, timeGenRays, timeInfer, timeGenImage, timeEnhance;
cudaEventElapsedTime(&timeTotal, eStart, eGenImage);
cudaEventElapsedTime(&timeGenRays, eStart, eGenRays);
cudaEventElapsedTime(&timeInfer, eGenRays, eInferred);
cudaEventElapsedTime(&timeGenImage, eInferred, eGenImage);
cudaEventElapsedTime(&timeEnhance, eGenImage, eEnhance);
if (showPerf) {
std::ostringstream sout;
sout << "Synthesis => Total: " << timeTotal << "ms (Gen rays: " << timeGenRays
<< "ms, Infer: " << timeInfer << "ms, Gen image: " << timeGenImage
<< "ms, Enhance: " << timeEnhance << "ms)";
Logger::instance.info(sout.str().c_str());
}
}
// GL texture with the synthesized image of layer `index`
// (STEREO_FOVEA_R = right-eye fovea in stereo mode).
GLuint FoveatedSynthesis_Impl::getGlResultTexture(uint index) {
return _imageGens[index]->getGlResultTexture();
}
// Public facade: forwards construction to the private implementation (pimpl).
FoveatedSynthesis::FoveatedSynthesis(const std::string& dataDir, glm::vec2 depthRange,
uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
const std::vector<sptr<Camera>>& layerCams, bool stereo) :
_impl(new FoveatedSynthesis_Impl(dataDir, depthRange, nSamples, encodeDim, coordChns, cam, layerCams, stereo)) {
}
// Synthesize one frame (see FoveatedSynthesis_Impl::run).
void FoveatedSynthesis::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR) {
_impl->run(view, foveaPos, showPerf, foveaPosR);
}
// GL texture of the synthesized layer `index` (see the Impl for index semantics).
GLuint FoveatedSynthesis::getGlResultTexture(uint index) {
return _impl->getGlResultTexture(index);
}
#pragma once
#include "../utils/common.h"
#include "View.h"
class FoveatedSynthesis_Impl;
// Public facade for the foveated neural view synthesis pipeline (pimpl).
class FoveatedSynthesis {
public:
// dataDir must contain fovea.trt and periph.trt; layerCams are the per-layer
// cameras (fovea -> periphery); stereo renders a second right-eye fovea.
FoveatedSynthesis(const std::string& dataDir, glm::vec2 depthRange, uint nSamples[],
uint encodeDim, uint coordChns, sptr<Camera> cam,
const std::vector<sptr<Camera>>& layerCams, bool stereo = false);
// Synthesize one frame; foveaPosR is only used when stereo was enabled.
void run(View& view, glm::vec2 foveaPos, bool showPerf = false, glm::vec2 foveaPosR = {});
// GL texture of the synthesized layer `index`.
GLuint getGlResultTexture(uint index);
private:
sptr<FoveatedSynthesis_Impl> _impl; // private implementation
};
\ No newline at end of file
#include "InferPipeline.h"
#include "Nmsl2.h"
// Set up the sample -> encode -> infer -> render pipeline for one network:
// allocates all intermediate device buffers sized for nRays * nSamplesPerRay
// and binds them to the network once, so run() does no per-frame allocation.
// NOTE(review): Encoder is constructed with two arguments here while its
// declaration takes (multires, chns, catInput) -- verify the intended
// catInput setting against the Encoder header in use.
InferPipeline::InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay, glm::vec2 depthRange,
uint encodeDim, uint coordChns)
: _nRays(nRays),
_nSamplesPerRay(nSamplesPerRay),
_coordChns(coordChns),
_net(net),
_sampler(new Sampler(depthRange, nSamplesPerRay, coordChns == 3)),
_encoder(new Encoder(encodeDim, coordChns)),
_renderer(new Renderer()) {
auto nSamples = _nRays * _nSamplesPerRay;
// Per-sample buffers reused every frame.
_coords = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * coordChns));
_depths = sptr<CudaArray<float>>(new CudaArray<float>(nSamples));
_encoded = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * _encoder->outDim()));
_layeredColors = sptr<CudaArray<glm::vec4>>(new CudaArray<glm::vec4>(nSamples));
// The net reads _encoded/_depths and writes _layeredColors in infer().
_net->bindResources(_encoded.get(), _depths.get(), _layeredColors.get());
}
// Run the full pipeline for one batch of rays: sample points along each ray,
// positionally encode them, run network inference and composite the layered
// outputs into o_colors. Inputs are truncated to the first _nRays entries.
// Each stage is followed by a device sync for error checking and so the CUDA
// event timings below are meaningful (costs async overlap by design).
void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays,
glm::vec3 origin, bool showPerf) {
rays = sptr<CudaArray<glm::vec3>>(rays->subArray(0, _nRays));
o_colors = sptr<CudaArray<glm::vec4>>(o_colors->subArray(0, _nRays));
CudaEvent eStart, eSampled, eEncoded, eInferred, eRendered;
cudaEventRecord(eStart);
// Stage 1: sample coordinates and depths along each ray from `origin`.
_sampler->sampleOnRays(_coords, _depths, rays, origin);
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eSampled);
// Stage 2: positional-encode the sampled coordinates.
_encoder->encode(_encoded, _coords);
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eEncoded);
// Stage 3: network inference (buffers were bound in the constructor).
_net->infer();
CHECK_EX(cudaDeviceSynchronize());
cudaEventRecord(eInferred);
// Stage 4: composite layered colors into the final per-ray colors.
_renderer->render(o_colors, _layeredColors);
cudaEventRecord(eRendered);
if (showPerf) {
CHECK_EX(cudaDeviceSynchronize());
float timeTotal, timeSample, timeEncode, timeInfer, timeRender;
cudaEventElapsedTime(&timeTotal, eStart, eRendered);
cudaEventElapsedTime(&timeSample, eStart, eSampled);
cudaEventElapsedTime(&timeEncode, eSampled, eEncoded);
cudaEventElapsedTime(&timeInfer, eEncoded, eInferred);
cudaEventElapsedTime(&timeRender, eInferred, eRendered);
std::ostringstream sout;
sout << "Infer pipeline: " << timeTotal << "ms (Sample: " << timeSample
<< "ms, Encode: " << timeEncode << "ms, Infer: " << timeInfer
<< "ms, Render: " << timeRender << "ms)";
Logger::instance.info(sout.str().c_str());
}
// Debug dumps of each intermediate buffer (disabled).
/*
{
std::ostringstream sout;
sout << "Rays:" << std::endl;
dumpArray<glm::vec3, float>(sout, *rays, 10);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Spherical coords:" << std::endl;
dumpArray(sout, *_coords, 10, _coordChns * _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Depths:" << std::endl;
dumpArray(sout, *_depths, 10, _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Encoded:" << std::endl;
dumpArray(sout, *_encoded, 10, _encoder->outDim() * _nSamplesPerRay);
Logger::instance.info(sout.str());
}
{
std::ostringstream sout;
sout << "Color:" << std::endl;
dumpArray<glm::vec4, float>(sout, *o_colors, 10);
Logger::instance.info(sout.str());
}
*/
}
\ No newline at end of file
#pragma once
#include "../utils/common.h"
#include "Sampler.h"
#include "Encoder.h"
#include "Renderer.h"
#include "Msl.h"
// Drives one network's sample -> encode -> infer -> render pipeline with
// preallocated device buffers (see InferPipeline.cpp).
class InferPipeline {
public:
// net: network to run; nRays x nSamplesPerRay sizes all buffers;
// depthRange/encodeDim/coordChns configure sampling and encoding.
InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay,
glm::vec2 depthRange, uint encodeDim, uint coordChns);
// Process the first nRays() entries of `rays` from `origin` into o_colors.
void run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays, glm::vec3 origin,
bool showPerf = false);
uint nRays() const { return _nRays; }
private:
uint _nRays; // rays per batch
uint _nSamplesPerRay; // samples along each ray
uint _coordChns; // coordinate channels (3 = spherical/xyz sampling)
sptr<Msl> _net;
sptr<Sampler> _sampler;
sptr<Encoder> _encoder;
sptr<Renderer> _renderer;
// Intermediate device buffers, sized nRays * nSamplesPerRay (* channels):
sptr<CudaArray<float>> _coords;
sptr<CudaArray<float>> _depths;
sptr<CudaArray<float>> _encoded;
sptr<CudaArray<glm::vec4>> _layeredColors;
};
\ No newline at end of file
#include "../utils/half.h"
#include "Net.h"
#include <fstream>
#include <numeric>
#include <assert.h>
#include <time.h>
#include <NvOnnxParser.h>
// Deserialize a TensorRT engine from `path`, create an execution context and
// log the engine's bindings. Returns false if either step fails.
bool Net::load(const std::string &path)
{
_deserialize(path);
if (!mEngine)
{
Logger::instance.error("Failed to build net: failed to load engine.");
return false;
}
mContext = std::shared_ptr<nv::IExecutionContext>(
mEngine->createExecutionContext(), Destroy<nv::IExecutionContext>());
if (!mContext)
return false;
// Log every binding (name, direction, dimensions) for diagnostics.
std::ostringstream sout;
sout << "NbBindings: " << mEngine->getNbBindings() << std::endl;
for (auto i = 0; i < mEngine->getNbBindings(); ++i)
{
auto name = mEngine->getBindingName(i);
auto dims = mEngine->getBindingDimensions(i);
//nv::DataType type = mEngine->getBindingDataType(i);
auto isInput = mEngine->bindingIsInput(i);
sout << "Binding " << i << ": " << name << "("
<< (isInput ? "Input " : "Output ")
<< Formatter::toString(dims) << ")\n";
}
Logger::instance.info(sout.str().c_str());
return true;
}
// Register a named device resource; at inference time it is bound to the
// engine binding with the same name (unmatched names are ignored).
void Net::bindResource(const std::string &name, Resource *res)
{
mResources.addResource(name, res);
}
// Release all bound resources and drop the engine. Always returns true.
bool Net::dispose()
{
mResources.clear();
mEngine = nullptr;
return true;
}
// Enqueue inference on `stream` with all registered resources bound to their
// engine slots. Graphics resources are mapped for CUDA access for the
// duration of the call (unmapped when mapScope leaves scope). When
// dumpInputOutput is set, synchronizes and logs every buffer (debug aid).
// Returns false if enqueueing fails.
bool Net::infer(cudaStream_t stream, bool dumpInputOutput)
{
CudaMapScope mapScope(mResources.graphicsResources);
CHECK(mapScope.map());
auto bindings = _getBindings();
if (!mContext->enqueueV2(bindings.data(), stream, nullptr))
{
Logger::instance.error("Failed to enqueue inference");
return false;
}
// Older synchronous/timed execution path, kept for reference.
/*
if (stream == nullptr) {
auto inferStart = clock();
if (!mContext->executeV2(bindings.data())) {
Logger::instance.error("Failed to execute inference");
return false;
}
auto inferEnd = clock();
std::ostringstream sout;
sout << "Infer takes " << INTERVAL(inferStart, inferEnd) << "ms" << std::endl;
Logger::instance.info(sout.str());
} else {
if (!mContext->enqueueV2(bindings.data(), stream, nullptr)) {
Logger::instance.error("Failed to enqueue inference");
return false;
}
}
*/
if (dumpInputOutput)
{
// Ensure the enqueued work has finished before reading buffers back.
if (stream != nullptr)
CHECK(cudaStreamSynchronize(stream));
_dumpInputOutput();
}
return true;
}
void Net::_deserialize(const std::string &path)
{
std::ifstream fin(path, std::ios::in | std::ios::binary);
if (!fin.is_open())
return;
std::streampos begin, end;
begin = fin.tellg();
fin.seekg(0, std::ios::end);
end = fin.tellg();
std::size_t size = end - begin;
fin.seekg(0, std::ios::beg);
char *engine_data = new char[size];
fin.read(engine_data, size);
fin.close();
uptr<nv::IRuntime> runtime(nv::createInferRuntime(Logger::instance));
mEngine = std::shared_ptr<nv::ICudaEngine>(
runtime->deserializeCudaEngine(engine_data, size, nullptr),
Destroy<nv::ICudaEngine>());
delete[] engine_data;
Logger::instance.info("Engine is deserialized");
}
// Build the engine's binding-pointer array from the registered resources.
// Resources whose name matches no engine binding are skipped; binding slots
// with no matching resource stay null.
std::vector<void *> Net::_getBindings()
{
    std::vector<void *> bindings(mEngine->getNbBindings());
    for (auto &entry : mResources.resources)
    {
        auto slot = mEngine->getBindingIndex(entry.first.c_str());
        if (slot >= 0)
            bindings[slot] = entry.second->getBuffer();
    }
    return bindings;
}
// Log the shape and contents of every bound input/output buffer (debug aid).
void Net::_dumpInputOutput()
{
    auto bindings = _getBindings();
    for (auto it = mResources.resources.begin();
         it != mResources.resources.end(); ++it)
    {
        auto idx = mEngine->getBindingIndex(it->first.c_str());
        if (idx < 0)
            continue;
        // The original input/output branches were identical except for the
        // label; merged into one path.
        std::ostringstream sout;
        sout << (mEngine->bindingIsInput(idx) ? "Input" : "Output")
             << " Buffer " << it->first << ": ";
        _dumpBuffer(sout, bindings[idx], idx);
        Logger::instance.info(sout.str().c_str());
    }
}
// Convenience overload: dump a device buffer using the engine's recorded
// dimensions and data type for binding `index`.
bool Net::_dumpBuffer(std::ostream &os, void *deviceBuf, int index)
{
return _dumpBuffer(os, deviceBuf, mEngine->getBindingDimensions(index),
mEngine->getBindingDataType(index));
}
bool Net::_dumpBuffer(std::ostream &os, void *deviceBuf, nv::Dims bufDims, nv::DataType dataType)
{
auto size = std::accumulate(bufDims.d, bufDims.d + bufDims.nbDims, 1,
std::multiplies<int64_t>()) *
getElementSize(dataType);
char *hostBuf = new char[size];
CHECK(cudaMemcpyAsync(hostBuf, deviceBuf, size, cudaMemcpyDeviceToHost));
int mBatchSize = 0;
size_t rowCount = static_cast<size_t>(bufDims.nbDims > 0 ? bufDims.d[bufDims.nbDims - 1] : mBatchSize);
int leadDim = mBatchSize;
int *trailDims = bufDims.d;
int nbDims = bufDims.nbDims;
// Fix explicit Dimension networks
if (!leadDim && nbDims > 0)
{
leadDim = bufDims.d[0];
++trailDims;
--nbDims;
}
os << "[" << leadDim;
for (int i = 0; i < nbDims; i++)
os << ", " << trailDims[i];
os << "]" << std::endl;
switch (dataType)
{
case nv::DataType::kINT32:
dumpHostBuffer<int32_t>(os, (int32_t*)hostBuf, size, rowCount);
break;
case nv::DataType::kFLOAT:
dumpHostBuffer<float>(os, (float*)hostBuf, size, rowCount);
break;
case nv::DataType::kHALF:
dumpHostBuffer<half_float::half>(os, (half_float::half*)hostBuf, size, rowCount);
break;
case nv::DataType::kINT8:
assert(0 && "Int8 network-level input and output is not supported");
break;
case nv::DataType::kBOOL:
assert(0 && "Bool network-level input and output are not supported");
break;
}
return true;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment