sync

1bc644a1 · Nianchen Deng · 6294701e · 1bc644a1 · 1bc644a1 · 1bc644a1
Commit 1bc644a1 authored Sep 28, 2022 by Nianchen Deng
--- a/cpp_old/old/utils/thread_index.h
+++ b/cpp_old/old/utils/thread_index.h
--- a/cpp_old/utils/Formatter.h
+++ b/cpp_old/utils/Formatter.h
--- a/cpp_old/old/utils/Logger.cpp
+++ b/cpp_old/old/utils/Logger.cpp
--- a/cpp_old/utils/Logger.h
+++ b/cpp_old/utils/Logger.h
@@ -65,7 +65,3 @@ public:
 		externalLogFunc((int)severity, msg);
 	}
 };
+\ No newline at end of file
-
-
-#define CHECK(errCode) do { if (!Logger::instance.checkErr((errCode), __FILE__, __LINE__)) return false; } while (0) // on a failed check: `return false` from the enclosing function
-#define CHECK_EX(errCode) do { if (!Logger::instance.checkErr((errCode), __FILE__, __LINE__)) throw std::exception(); } while (0) // on a failed check: throw std::exception
--- a/cpp_old/old/utils/Resource.h
+++ b/cpp_old/old/utils/Resource.h
--- a/cpp_old/utils/common.h
+++ b/cpp_old/utils/common.h
--- a/cpp_old/utils/cuda.h
+++ b/cpp_old/utils/cuda.h
--- a/cpp_old/utils/half.h
+++ b/cpp_old/utils/half.h
--- a/cpp/utils/thread_index.h
+++ b/cpp/utils/thread_index.h
--- a/cpp/fields/FsNeRF.cpp
+++ b/cpp/fields/FsNeRF.cpp
+#include "FsNeRF.h"
+
+namespace fields
+{
+    /**
+     * Creates the underlying Net and loads it from `netPath`.
+     *
+     * @param netPath path handed to Net::load()
+     * @throws std::runtime_error if Net::load() reports failure; the Net is
+     *         released before the exception is thrown
+     */
+    FsNeRF::FsNeRF(const std::string &netPath) : _net(nullptr) {
+        _net = new Net();
+        if (!_net->load(netPath)) {
+            // Qualified call: no virtual dispatch is wanted during construction.
+            FsNeRF::dispose();
+            throw std::runtime_error("Failed to load net: " + netPath);
+        }
+    }
+
+    /**
+     * Binds the three resources to the net under the names "Encoded",
+     * "Depths" and "Colors".
+     *
+     * @throws std::runtime_error if the net has already been disposed
+     */
+    void FsNeRF::bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors)
+    {
+        // dispose() resets _net to nullptr; fail loudly instead of dereferencing it.
+        if (_net == nullptr)
+            throw std::runtime_error("FsNeRF::bindResources() called after dispose()");
+        _net->bindResource("Encoded", resEncoded);
+        _net->bindResource("Depths", resDepths);
+        _net->bindResource("Colors", resColors);
+    }
+
+    /**
+     * Runs Net::infer().
+     *
+     * @return the result of Net::infer(), or false if the net has been disposed
+     */
+    bool FsNeRF::infer() { return _net != nullptr && _net->infer(); }
+
+    /**
+     * Disposes and deletes the net. Safe to call more than once: the pointer
+     * is reset to nullptr after the first call.
+     */
+    void FsNeRF::dispose()
+    {
+        if (_net != nullptr)
+        {
+            _net->dispose();
+            delete _net;
+            _net = nullptr;
+        }
+    }
+}
\ No newline at end of file
--- a/cpp/fields/FsNeRF.h
+++ b/cpp/fields/FsNeRF.h
+#pragma once
+#include "../utils/common.h"
+#include "Net.h"
+
+namespace fields
+{
+    /**
+     * Wrapper around a Net loaded from a file: binds the encoded/depth/colour
+     * resources to it and runs inference.
+     *
+     * The instance owns the Net through a raw pointer, so it is non-copyable
+     * and releases the Net on destruction.
+     */
+    class FsNeRF
+    {
+    public:
+
+        /// Loads the net stored at `netPath`.
+        FsNeRF(const std::string &netPath);
+
+        /// Releases the net if dispose() has not been called yet. Virtual because the
+        /// class has virtual members and may be deleted through a base pointer.
+        /// The qualified call makes explicit that overrides are not invoked here.
+        virtual ~FsNeRF() { FsNeRF::dispose(); }
+
+        // Copying would leave two objects deleting the same Net.
+        FsNeRF(const FsNeRF &) = delete;
+        FsNeRF &operator=(const FsNeRF &) = delete;
+
+        /// Binds the input (encoded) and output (depths, colours) resources to the net.
+        virtual void bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors);
+
+        /// Runs inference on the bound resources; returns whether it succeeded.
+        virtual bool infer();
+        
+        /// Releases the net; calling it again is a no-op.
+        virtual void dispose();
+    
+    private:
+        Net *_net; // owned; nullptr once disposed
+    };
+}
\ No newline at end of file
--- a/cpp/fnr_core/Net.cpp
+++ b/cpp/fnr_core/Net.cpp
--- a/cpp_old/old/msl_infer/Net.h
+++ b/cpp_old/old/msl_infer/Net.h
--- a/cpp/fnr_core/Encoder.cu
+++ b/cpp/fnr_core/Encoder.cu
-#include "Encoder.h"
-#include "../utils/cuda.h"
-
-/// Copies the raw input channels to the front of every output row.
-///
-/// Encoded row layout, one group per frequency index:
-///   group 0:        x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z)
-///   group 1:        sin(2x), sin(2y), sin(2z), cos(2x), cos(2y), cos(2z)
-///   ...
-///   group n_freq-1: sin(2^(n_freq-1)x), sin(2^(n_freq-1)y), sin(2^(n_freq-1)z),
-///                   cos(2^(n_freq-1)x), cos(2^(n_freq-1)y), cos(2^(n_freq-1)z)
-/// Dispatch (n, in_chns, n_freqs); the input channel count is read from blockDim.y.
-/// NOTE(review): IDX3 is defined elsewhere — presumably the global 3D thread index.
-__global__ void cu_encode0(float *output, float *input, uint n, uint nFreqs) {
-    glm::uvec3 tid = IDX3;
-    const uint row = tid.x;
-    if (row >= n)
-        return;
-    const uint col = tid.y;
-    const uint srcStride = blockDim.y;
-    // Each output row holds the input plus a sin and a cos term per frequency.
-    const uint dstStride = srcStride * (nFreqs * 2 + 1);
-    output[row * dstStride + col] = input[row * srcStride + col];
-}
-
-/// Writes sin(f*v) and cos(f*v) for one (element, channel, frequency) triple and,
-/// when `catInput` is set, also copies the raw value to the start of the row.
-///
-/// The channel count is read from blockDim.y and the frequency count from blockDim.z;
-/// threads whose x index is >= n exit immediately.
-/// NOTE(review): __sincosf is the fast-math intrinsic; its accuracy for large
-/// arguments (high frequencies) should be confirmed against the training code.
-__global__ void cu_encode(float *output, float *input, float *freqs, uint n, bool catInput) {
-    glm::uvec3 tid = IDX3;
-    if (tid.x >= n)
-        return;
-    const uint row = tid.x, chn = tid.y, band = tid.z;
-    const uint inChns = blockDim.y;
-    const uint nFreqs = blockDim.z;
-    // One leading slot per channel when the raw input is concatenated.
-    const uint lead = catInput ? 1u : 0u;
-    const uint outChns = inChns * (nFreqs * 2 + lead);
-    const uint base = row * outChns + chn;
-    const float value = input[row * inChns + chn];
-    if (catInput && band == 0)
-        output[base] = value;
-    float sinPart, cosPart;
-    __sincosf(freqs[band] * value, &sinPart, &cosPart);
-    const uint sinIdx = base + inChns * (band * 2 + lead);
-    output[sinIdx] = sinPart;
-    output[sinIdx + inChns] = cosPart;
-}
-
-/// Two-channel variant working on glm::vec2 elements: every row starts with the raw
-/// input followed by a (sin, cos) pair of vec2 per frequency.
-///
-/// The frequency count is read from blockDim.y; threads whose x index is >= n exit.
-__global__ void cu_encode2(glm::vec2 *output, glm::vec2 *input, float *freqs, uint n) {
-    glm::uvec3 tid = IDX3;
-    if (tid.x >= n)
-        return;
-    const uint row = tid.x, band = tid.y;
-    const uint rowStride = blockDim.y * 2 + 1;
-    const uint first = row * rowStride;
-    const glm::vec2 value = input[row];
-    // Only the thread handling the first frequency copies the raw input.
-    if (band == 0)
-        output[first] = value;
-    const glm::vec2 arg = freqs[band] * value;
-    glm::vec2 sinPart, cosPart;
-    __sincosf(arg.x, &sinPart.x, &cosPart.x);
-    __sincosf(arg.y, &sinPart.y, &cosPart.y);
-    const uint slot = first + band * 2;
-    output[slot + 1] = sinPart;
-    output[slot + 2] = cosPart;
-}
-
-/**
- * @brief Launches cu_encode with one thread per (element, channel, frequency).
- *
- * @param output encoded data, n x out_chns
- * @param input coord data, n x in_chns
- *
- * Empty input is a no-op. A channel/frequency combination that does not fit into a
- * 1024-thread block is reported through CHECK_EX as an invalid configuration.
- */
-void Encoder::encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input) {
-    // The kernel reads the channel and frequency counts from blockDim.y/z, so a block
-    // must hold at least one full (chns x multires) slice. Checking up front also
-    // avoids the host-side divisions by zero below.
-    if (_chns == 0 || _multires == 0 || _multires > 1024 / _chns) {
-        CHECK_EX(cudaErrorInvalidConfiguration);
-        return;
-    }
-    uint n = input->n() / _chns;
-    if (n == 0)
-        return; // nothing to encode; a zero-sized grid would be an invalid launch
-    // NOTE: CU_INVOKE presumably picks up `grdSize` / `blkSize` by name — keep them.
-    dim3 blkSize(1024 / _chns / _multires, _chns, _multires);
-    dim3 grdSize(ceilDiv(n, blkSize.x), 1, 1);
-    CU_INVOKE(cu_encode)(*output, *input, *_freqs, n, _catInput);
-    // blkSize = dim3(1024 / _chns, _chns);
-    // grdSize = dim3(ceilDiv(n, blkSize.x), 1, 1);
-    // CU_INVOKE(cu_encode0)(*output, *input, n, _multires);
-    CHECK_EX(cudaGetLastError());
-}
-
-/**
- * Builds the frequency table 1, 2, 4, ... (one entry per frequency) and copies it
- * into the `_freqs` CudaArray.
- *
- * @throws std::exception (via CHECK_EX) if the host-to-device copy fails
- */
-void Encoder::_genFreqArray() {
-    _freqs = sptr<CudaArray<float>>(new CudaArray<float>(_multires));
-    if (_multires == 0)
-        return; // no frequencies: writing arr[0] would overrun a zero-length array
-    float *arr = new float[_multires];
-    arr[0] = 1.0f;
-    for (unsigned int i = 1; i < _multires; ++i)
-        arr[i] = arr[i - 1] * 2.0f;
-    // Free the staging buffer before checking the result so a failure cannot leak it.
-    cudaError_t err =
-        cudaMemcpy(_freqs->getBuffer(), arr, _multires * sizeof(float), cudaMemcpyHostToDevice);
-    delete[] arr;
-    CHECK_EX(err);
-}
--- a/cpp/fnr_core/Encoder.h
+++ b/cpp/fnr_core/Encoder.h
-#pragma once
-#include "../utils/common.h"
-
-// Frequency (sin/cos) encoder for coordinate data held in CudaArray buffers.
-// NOTE(review): InferPipeline.cpp constructs this class with two arguments
-// (encodeDim, coordChns) while this constructor takes three — confirm which is current.
-class Encoder {
-public:
-    // multires: number of frequencies; chns: input channels per element;
-    // catInput: whether the raw input is kept in front of the sin/cos terms.
-    Encoder(unsigned int multires, unsigned int chns, bool catInput)
-        : _multires(multires), _chns(chns), _catInput(catInput) {
-        _genFreqArray();
-    }
-
-    // Output channels per element: a sin and a cos term per frequency for every input
-    // channel, plus the input itself when _catInput is set.
-    unsigned int outDim() const { return _chns * ((int)_catInput + _multires * 2); }
-    // Encodes `input` into `output`; see the definition for the expected layouts.
-    void encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input);
-
-private:
-    unsigned int _multires;
-    unsigned int _chns;
-    bool _catInput;
-    sptr<CudaArray<float>> _freqs; // frequency table filled by _genFreqArray()
-
-    void _genFreqArray();
-}; 
\ No newline at end of file
--- a/cpp/fnr_core/InferPipeline.cpp
+++ b/cpp/fnr_core/InferPipeline.cpp
-#include "InferPipeline.h"
-#include "Nmsl2.h"
-
-/// Creates the sampler, encoder and renderer stages, allocates the intermediate
-/// CudaArray buffers (coords, depths, encoded, layered colours) sized for
-/// nRays * nSamplesPerRay samples, and binds the last three to the net.
-InferPipeline::InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay, glm::vec2 depthRange,
-                             uint encodeDim, uint coordChns)
-    : _nRays(nRays),
-      _nSamplesPerRay(nSamplesPerRay),
-      _coordChns(coordChns),
-      _net(net),
-      _sampler(new Sampler(depthRange, nSamplesPerRay, coordChns == 3)),
-      _encoder(new Encoder(encodeDim, coordChns)),
-      _renderer(new Renderer())
-{
-    const uint total = nRays * nSamplesPerRay;
-
-    _coords.reset(new CudaArray<float>(total * coordChns));
-    _depths.reset(new CudaArray<float>(total));
-    _encoded.reset(new CudaArray<float>(total * _encoder->outDim()));
-    _layeredColors.reset(new CudaArray<glm::vec4>(total));
-
-    // The net reads the encoded samples and writes depths and per-sample colours.
-    _net->bindResources(_encoded.get(), _depths.get(), _layeredColors.get());
-}
-
-// Runs the four stages (sample, encode, infer, render) on the first _nRays entries of
-// `rays`, writing the result to the first _nRays entries of `o_colors`.
-// When `showPerf` is set, the per-stage GPU times measured with CUDA events are logged.
-void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays,
-                        glm::vec3 origin, bool showPerf) {
-    // Restrict both arrays to the number of rays this pipeline was sized for.
-    rays = sptr<CudaArray<glm::vec3>>(rays->subArray(0, _nRays));
-    o_colors = sptr<CudaArray<glm::vec4>>(o_colors->subArray(0, _nRays));
-    CudaEvent eStart, eSampled, eEncoded, eInferred, eRendered;
-
-    cudaEventRecord(eStart);
-
-    _sampler->sampleOnRays(_coords, _depths, rays, origin);
-    // Synchronising after each stage surfaces stage errors here instead of later.
-    CHECK_EX(cudaDeviceSynchronize());
-
-    cudaEventRecord(eSampled);
-
-    _encoder->encode(_encoded, _coords);
-    CHECK_EX(cudaDeviceSynchronize());
-
-    cudaEventRecord(eEncoded);
-
-    _net->infer();
-    CHECK_EX(cudaDeviceSynchronize());
-
-    cudaEventRecord(eInferred);
-
-    _renderer->render(o_colors, _layeredColors);
-
-    cudaEventRecord(eRendered);
-
-    if (showPerf) {
-        // eRendered must have completed before elapsed times can be queried.
-        CHECK_EX(cudaDeviceSynchronize());
-
-        float timeTotal, timeSample, timeEncode, timeInfer, timeRender;
-        cudaEventElapsedTime(&timeTotal, eStart, eRendered);
-        cudaEventElapsedTime(&timeSample, eStart, eSampled);
-        cudaEventElapsedTime(&timeEncode, eSampled, eEncoded);
-        cudaEventElapsedTime(&timeInfer, eEncoded, eInferred);
-        cudaEventElapsedTime(&timeRender, eInferred, eRendered);
-
-        std::ostringstream sout;
-        sout << "Infer pipeline: " << timeTotal << "ms (Sample: " << timeSample
-             << "ms, Encode: " << timeEncode << "ms, Infer: " << timeInfer
-             << "ms, Render: " << timeRender << "ms)";
-        Logger::instance.info(sout.str().c_str());
-    }
-    
-    // Disabled debug dumps of the intermediate buffers.
-    /*
-    {
-        std::ostringstream sout;
-        sout << "Rays:" << std::endl;
-        dumpArray<glm::vec3, float>(sout, *rays, 10);
-        Logger::instance.info(sout.str());
-    }
-    {
-        std::ostringstream sout;
-        sout << "Spherical coords:" << std::endl;
-        dumpArray(sout, *_coords, 10, _coordChns * _nSamplesPerRay);
-        Logger::instance.info(sout.str());
-    }
-    {
-        std::ostringstream sout;
-        sout << "Depths:" << std::endl;
-        dumpArray(sout, *_depths, 10, _nSamplesPerRay);
-        Logger::instance.info(sout.str());
-    }
-    {
-        std::ostringstream sout;
-        sout << "Encoded:" << std::endl;
-        dumpArray(sout, *_encoded, 10, _encoder->outDim() * _nSamplesPerRay);
-        Logger::instance.info(sout.str());
-    }
-    {
-        std::ostringstream sout;
-        sout << "Color:" << std::endl;
-        dumpArray<glm::vec4, float>(sout, *o_colors, 10);
-        Logger::instance.info(sout.str());
-    }
-    */
-}
\ No newline at end of file
--- a/cpp/fnr_core/InferPipeline.h
+++ b/cpp/fnr_core/InferPipeline.h
-#pragma once
-#include "../utils/common.h"
-#include "Sampler.h"
-#include "Encoder.h"
-#include "Renderer.h"
-#include "Msl.h"
-
-// Chains a Sampler, an Encoder, an Msl net and a Renderer for a fixed number of rays
-// and owns the intermediate buffers passed between them.
-class InferPipeline {
-public:
-    InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay,
-                  glm::vec2 depthRange, uint encodeDim, uint coordChns);
-
-    // Processes the first nRays() entries of `rays` and writes their colours to `o_colors`.
-    void run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays, glm::vec3 origin,
-             bool showPerf = false);
-
-    // Number of rays handled per run() call.
-    uint nRays() const { return _nRays; }
-
-private:
-    uint _nRays;
-    uint _nSamplesPerRay;
-    uint _coordChns;
-    sptr<Msl> _net;
-    sptr<Sampler> _sampler;
-    sptr<Encoder> _encoder;
-    sptr<Renderer> _renderer;
-    sptr<CudaArray<float>> _coords;            // sampler output, encoder input
-    sptr<CudaArray<float>> _depths;            // sampler output, bound to the net
-    sptr<CudaArray<float>> _encoded;           // encoder output, bound to the net
-    sptr<CudaArray<glm::vec4>> _layeredColors; // bound to the net, renderer input
-
-};
\ No newline at end of file
--- a/cpp/fnr_core/NeuralSynthesis.cpp
+++ b/cpp/fnr_core/NeuralSynthesis.cpp
+#include "NeuralSynthesis.h"
+#include "InferPipeline.h"
+#include "Enhancement.h"
+#include "ImageGen.h"
+#include <stdexcept>
+
+// Number of layer cameras, each with its own enhancement and image generator.
+constexpr unsigned int NUM_LAYERS = 3u;
+// Index of the extra image slot used for the right-eye fovea in stereo mode.
+constexpr unsigned int STEREO_FOVEA_R = NUM_LAYERS;
+// Number of networks / inference pipelines (fovea and periphery).
+constexpr unsigned int NUM_NETS = 2u;
+
+// Private implementation behind NeuralSynthesis.
+// NOTE(review): the member definitions further down in this file use a different
+// constructor / run() signature and members (_fullCam, _stereo, _nets, _cams) that
+// are not declared here — confirm which side of the refactor is current.
+class NeuralSynthesis_Impl {
+public:
+	NeuralSynthesis_Impl(models::Model& model, Camera& cam);
+
+	void run(View& view);
+
+	// `index` selects an entry of _imageGens.
+	GLuint getGlResultTexture(uint index);
+
+private:
+	models::Model& model;
+	Camera& _cam;
+	uint _nRays;
+	sptr<InferPipeline> _infers[NUM_NETS];
+	sptr<Enhancement> _enhancements[NUM_LAYERS];
+	// One extra slot (index STEREO_FOVEA_R) beyond the NUM_LAYERS layers.
+	sptr<ImageGen> _imageGens[NUM_LAYERS + 1];
+	sptr<CudaArray<glm::vec3>> _rays;
+	sptr<CudaArray<glm::vec4>> _clrs;
+	// Same slot layout as _imageGens.
+	sptr<CudaArray<glm::vec4>> _imageData[NUM_LAYERS + 1];
+
+};
+
+/**
+ * Loads the two networks, stores the layer cameras, creates the inference pipelines,
+ * image generators and enhancement stages, and allocates the ray/colour buffers.
+ *
+ * @param dataDir    directory containing "fovea.trt" and "periph.trt"
+ * @param depthRange forwarded to both InferPipeline instances
+ * @param nSamples   samples per ray: element 0 for the fovea pipeline, element 1 for
+ *                   the periphery pipeline
+ * @param encodeDim  forwarded to both InferPipeline instances
+ * @param coordChns  forwarded to both InferPipeline instances
+ * @param cam        full-view camera, stored as _fullCam
+ * @param layerCams  layer cameras; the first NUM_LAYERS entries are used
+ * @param stereo     when true, buffers and an image generator for a second fovea
+ *                   image (slot STEREO_FOVEA_R) are created as well
+ * @throws std::invalid_argument if `layerCams` holds fewer than NUM_LAYERS cameras
+ */
+NeuralSynthesis_Impl::NeuralSynthesis_Impl(const std::string& dataDir, glm::vec2 depthRange,
+	uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
+	const std::vector<sptr<Camera>>& layerCams, bool stereo) :
+	_fullCam(cam), _stereo(stereo) {
+	// Indexing layerCams below would otherwise read past the end of the vector.
+	if (layerCams.size() < NUM_LAYERS)
+		throw std::invalid_argument("NeuralSynthesis: layerCams must contain at least NUM_LAYERS cameras");
+
+	// Load nets
+	// NOTE(review): the result of load() is not inspected — confirm how Msl::load
+	// reports failure.
+	for (uint i = 0; i < NUM_NETS; ++i)
+		_nets[i].reset(new Msl());
+	_nets[0]->load(dataDir + "/fovea.trt");
+	_nets[1]->load(dataDir + "/periph.trt");
+
+	// Init cams
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_cams[i] = layerCams[i];
+
+	// Count the rays of every layer; stereo renders the fovea layer twice.
+	uint nRays[NUM_LAYERS];
+	uint nTotRays = 0;
+	for (uint i = 0; i < NUM_LAYERS; ++i) {
+		nRays[i] = _cams[i]->nRays();
+		nTotRays += nRays[i];
+	}
+	if (_stereo)
+		nTotRays += nRays[0];
+
+	// Init infers: layer 0 uses the fovea net, layers 1 and 2 share the periphery net
+	_infers[0].reset(new InferPipeline(_nets[0], nRays[0], nSamples[0],
+		depthRange, encodeDim, coordChns));
+	_infers[1].reset(new InferPipeline(_nets[1], nRays[1] + nRays[2], nSamples[1],
+		depthRange, encodeDim, coordChns));
+
+	// Init image gens
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_imageGens[i].reset(new ImageGen(_cams[i]->res()));
+	if (_stereo)
+		_imageGens[STEREO_FOVEA_R].reset(new ImageGen(_cams[0]->res()));
+
+	// Init enhancements
+	glm::vec2 enhancementParams[] = {
+		{3.0f, 0.2f}, {5.0f, 0.2f}, {5.0f, 0.2f}
+	};
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_enhancements[i].reset(new Enhancement(_cams[i]->res(), enhancementParams[i]));
+
+	// Create buffers
+	_rays.reset(new CudaArray<glm::vec3>(nTotRays));
+	_clrs.reset(new CudaArray<glm::vec4>(nTotRays));
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_imageData[i].reset(new CudaArray<glm::vec4>(_cams[i]->nPixels()));
+	if (_stereo)
+		_imageData[STEREO_FOVEA_R].reset(new CudaArray<glm::vec4>(_cams[0]->nPixels()));
+}
+
+
+/**
+ * Renders one frame: generates the rays of every layer, runs the inference pipelines,
+ * restores the per-layer images from the ray colours, enhances them and passes them
+ * to the image generators.
+ *
+ * @param view      viewer pose; the stereo eye views are derived from it
+ * @param foveaPos  fovea position, measured against the full camera's resolution
+ *                  (left eye in stereo mode)
+ * @param showPerf  when true, per-stage GPU timings are logged
+ * @param foveaPosR right-eye fovea position; only used in stereo mode
+ * @throws std::exception (via CHECK_EX) if device synchronisation reports an error
+ */
+void NeuralSynthesis_Impl::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR) {
+	CudaEvent eStart, eGenRays, eInferred, eGenImage, eEnhance;
+	uint offset;
+
+	cudaEventRecord(eStart);
+
+	// Fovea offsets from the image centre, divided by the full camera's f().
+	glm::vec2 foveaOffset(foveaPos - (glm::vec2)_fullCam->res() / 2.0f);
+	foveaOffset /= _fullCam->f();
+	glm::vec3 foveaOffset3(foveaOffset.x, foveaOffset.y, 0.0f);
+
+	glm::vec2 foveaOffsetR(foveaPosR - (glm::vec2)_fullCam->res() / 2.0f);
+	foveaOffsetR /= _fullCam->f();
+	glm::vec3 foveaOffset3R(foveaOffsetR.x, foveaOffsetR.y, 0.0f);
+
+	// NOTE(review): 0.06f is presumably the eye separation — confirm against View.
+	auto viewL = view.getStereoEye(0.06f, Eye_Left);
+	auto viewR = view.getStereoEye(0.06f, Eye_Right);
+
+	// Ray buffer layout: layer 0, layer 1, layer 2 and, in stereo mode, layer 0 again
+	// for the right eye.
+	if (_stereo) {
+		offset = 0;
+		_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL, foveaOffset3);
+		offset += _cams[0]->nRays();
+		_cams[1]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, (foveaOffset3 + foveaOffset3R) / 2.0f);
+		offset += _cams[1]->nRays();
+		_cams[2]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view, {});
+		offset += _cams[2]->nRays();
+		_cams[0]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR, foveaOffset3R);
+	} else {
+		offset = 0;
+		for (uint i = 0; i < NUM_LAYERS; ++i) {
+			// The last layer is not shifted with the fovea.
+			_cams[i]->getRays(sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)),
+				view, i == NUM_LAYERS - 1 ? glm::vec3() : foveaOffset3);
+			offset += _cams[i]->nRays();
+		}
+	}
+
+	cudaEventRecord(eGenRays);
+
+	// The colour buffer uses the same layout as the ray buffer.
+	if (_stereo) {
+		offset = 0;
+		_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
+			sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewL.t(), showPerf);
+		offset += _infers[0]->nRays();
+		_infers[1]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
+			sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
+		offset += _infers[1]->nRays();
+		_infers[0]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
+			sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), viewR.t(), showPerf);
+	} else {
+		offset = 0;
+		for (uint i = 0; i < NUM_NETS; ++i) {
+			_infers[i]->run(sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)),
+				sptr<CudaArray<glm::vec3>>(_rays->subArray(offset)), view.t(), showPerf);
+			offset += _infers[i]->nRays();
+		}
+	}
+
+	cudaEventRecord(eInferred);
+
+	offset = 0;
+	for (uint i = 0; i < NUM_LAYERS; ++i) {
+		_cams[i]->restoreImage(_imageData[i], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
+		offset += _cams[i]->nRays();
+	}
+	if (_stereo)
+		_cams[0]->restoreImage(_imageData[STEREO_FOVEA_R], sptr<CudaArray<glm::vec4>>(_clrs->subArray(offset)));
+
+	cudaEventRecord(eGenImage);
+
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_enhancements[i]->run(_imageData[i]);
+	if (_stereo)
+		_enhancements[0]->run(_imageData[STEREO_FOVEA_R]);
+
+	cudaEventRecord(eEnhance);
+	CHECK_EX(cudaDeviceSynchronize());
+
+	for (uint i = 0; i < NUM_LAYERS; ++i)
+		_imageGens[i]->run(_imageData[i]);
+	if (_stereo)
+		_imageGens[STEREO_FOVEA_R]->run(_imageData[STEREO_FOVEA_R]);
+
+	if (showPerf) {
+		// Every event was recorded before the synchronisation above, so all of them
+		// have completed. The total runs to the last timed stage (enhance) so that it
+		// covers everything listed in the breakdown.
+		float timeTotal, timeGenRays, timeInfer, timeGenImage, timeEnhance;
+		cudaEventElapsedTime(&timeTotal, eStart, eEnhance);
+		cudaEventElapsedTime(&timeGenRays, eStart, eGenRays);
+		cudaEventElapsedTime(&timeInfer, eGenRays, eInferred);
+		cudaEventElapsedTime(&timeGenImage, eInferred, eGenImage);
+		cudaEventElapsedTime(&timeEnhance, eGenImage, eEnhance);
+
+		std::ostringstream sout;
+		sout << "Synthesis => Total: " << timeTotal << "ms (Gen rays: " << timeGenRays
+			<< "ms, Infer: " << timeInfer << "ms, Gen image: " << timeGenImage
+			<< "ms, Enhance: " << timeEnhance << "ms)";
+		Logger::instance.info(sout.str().c_str());
+	}
+}
+
+/**
+ * Returns the GL texture of the image generator in slot `index`
+ * (0 .. NUM_LAYERS - 1 for the layers, STEREO_FOVEA_R for the stereo fovea image).
+ *
+ * @return the texture name, or 0 when `index` is out of range or the slot was never
+ *         created (the stereo slot is only filled in stereo mode)
+ */
+GLuint NeuralSynthesis_Impl::getGlResultTexture(uint index) {
+	// _imageGens has NUM_LAYERS + 1 entries.
+	if (index > NUM_LAYERS || _imageGens[index].get() == nullptr)
+		return 0;
+	return _imageGens[index]->getGlResultTexture();
+}
+
+/// Creates the private implementation, forwarding every argument unchanged.
+NeuralSynthesis::NeuralSynthesis(const std::string& dataDir, glm::vec2 depthRange,
+	uint nSamples[], uint encodeDim, uint coordChns, sptr<Camera> cam,
+	const std::vector<sptr<Camera>>& layerCams, bool stereo)
+	: _impl(new NeuralSynthesis_Impl(dataDir, depthRange, nSamples, encodeDim,
+		coordChns, cam, layerCams, stereo))
+{
+}
+
+/// Forwards one frame's parameters to the implementation's run().
+void NeuralSynthesis::run(View& view, glm::vec2 foveaPos, bool showPerf, glm::vec2 foveaPosR)
+{
+	NeuralSynthesis_Impl& impl = *_impl;
+	impl.run(view, foveaPos, showPerf, foveaPosR);
+}
+
+/// Returns the implementation's GL result texture for slot `index`.
+GLuint NeuralSynthesis::getGlResultTexture(uint index)
+{
+	NeuralSynthesis_Impl& impl = *_impl;
+	return impl.getGlResultTexture(index);
+}
--- a/cpp/fnr_core/NeuralSynthesis.h
+++ b/cpp/fnr_core/NeuralSynthesis.h
+#pragma once
+#include "../utils/common.h"
+#include "View.h"
+#include "../models/Model.h"
+
+class NeuralSynthesis_Impl;
+
+// Public facade that hides the synthesis implementation behind a pointer to
+// NeuralSynthesis_Impl.
+// NOTE(review): NeuralSynthesis.cpp defines a constructor with a different parameter
+// list and a run() member instead of operator() — confirm which side is current.
+class NeuralSynthesis {
+public:
+	NeuralSynthesis(models::Model& model, Camera& cam);
+
+	void operator()(View& view);
+
+	// `index` is forwarded to the implementation.
+	GLuint getGlResultTexture(uint index);
+
+private:
+	sptr<NeuralSynthesis_Impl> _impl;
+
+};
\ No newline at end of file
--- a/cpp/fnr_core/Sampler.cu
+++ b/cpp/fnr_core/Sampler.cu
-#include "Sampler.h"
-#define _USE_MATH_DEFINES
-#include <math.h>
-#include "../utils/cuda.h"
-
-/// Intersects the ray p + t*v with the origin-centred sphere of radius r, taking the
-/// larger root t. The root is written to o_depth and the hit point is returned.
-/// The discriminant is not checked, so a ray that misses the sphere yields NaN.
-__device__ glm::vec3 _raySphereIntersect(glm::vec3 p, glm::vec3 v, float r, float &o_depth) {
-    // Quadratic a*t^2 + 2*b*t + c = 0
-    const float a = glm::dot(v, v);
-    const float b = glm::dot(p, v);
-    const float c = glm::dot(p, p) - r * r;
-    o_depth = (sqrtf(b * b - a * c) - b) / a;
-    return p + o_depth * v;
-}
-
-/// Returns -atan(x / y) + pi/2, with a further pi added when y is negative.
-__device__ float _getAngle(float x, float y) {
-    const float halfTurn = (y < 0) ? (float)M_PI : 0.0f;
-    return -atan(x / y) + halfTurn + 0.5f * (float)M_PI;
-}
-
-/**
- * Computes one sample per (sample index, ray) pair: the ray is intersected with the
- * sphere whose reciprocal radius is range.get(sample index); the depth goes to
- * o_depths and the spherical coordinates of the hit point go to o_coords.
- *
- * Dispatch with block_size=(n_samples, k), grid_size=(1, ceil(nRays / k)).
- * Index with (sample_idx, ray_idx); threads with ray_idx >= nRays exit.
- *
- * o_coords is written as glm::vec3 (1/r, angle, acos term) when outputRadius is set,
- * otherwise as glm::vec2 without the radius term.
- */
-__global__ void cu_sampleOnRays(float *o_coords, float *o_depths, glm::vec3 *rays, uint nRays,
-                                glm::vec3 origin, Range range, bool outputRadius) {
-    glm::uvec3 tid = IDX3;
-    const uint flat = flattenIdx(tid);
-    const uint sample = tid.x;
-    const uint ray = tid.y;
-    if (ray >= nRays)
-        return;
-    const float invR = range.get(sample);
-    const glm::vec3 hit = _raySphereIntersect(origin, rays[ray], 1.0f / invR, o_depths[flat]);
-    const float theta = _getAngle(hit.x, hit.z);
-    const float phi = acos(hit.y * invR);
-    if (outputRadius)
-        ((glm::vec3 *)o_coords)[flat] = glm::vec3(invR, theta, phi);
-    else
-        ((glm::vec2 *)o_coords)[flat] = glm::vec2(theta, phi);
-}
-
-/**
- * Launches cu_sampleOnRays for every (sample, ray) pair.
- *
- * @param o_coords  receives the sample coordinates
- * @param o_depths  receives the sample depths
- * @param rays      ray directions, one per ray
- * @param rayCenter common origin of all rays
- *
- * An empty ray set is a no-op. A step count that does not fit into a 1024-thread
- * block is reported through CHECK_EX as an invalid configuration.
- */
-void Sampler::sampleOnRays(sptr<CudaArray<float>> o_coords, sptr<CudaArray<float>> o_depths,
-                           sptr<CudaArray<glm::vec3>> rays, glm::vec3 rayCenter) {
-    const uint nSteps = (uint)_dispRange.steps();
-    // blockDim.x carries the sample count; zero or more than 1024 steps would make
-    // blkSize.y zero (or divide by zero) below.
-    if (nSteps == 0 || nSteps > 1024) {
-        CHECK_EX(cudaErrorInvalidConfiguration);
-        return;
-    }
-    const uint nRays = (uint)rays->n();
-    if (nRays == 0)
-        return; // nothing to sample; a zero-sized grid would be an invalid launch
-    // NOTE: CU_INVOKE presumably picks up `grdSize` / `blkSize` by name — keep them.
-    dim3 blkSize(nSteps, 1024 / nSteps);
-    // Integer ceiling division: float division is inexact for large ray counts.
-    dim3 grdSize(1, (nRays + blkSize.y - 1) / blkSize.y);
-    CU_INVOKE(cu_sampleOnRays)
-    (*o_coords, *o_depths, *rays, nRays, rayCenter, _dispRange, _outputRadius);
-    CHECK_EX(cudaGetLastError());
-}
\ No newline at end of file