sync

c10f614f · Nianchen Deng · dcba5844 · c10f614f · c10f614f · dcba5844
Commit c10f614f authored 4 years ago by Nianchen Deng
--- a/.clang-format
+++ b/.clang-format
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      true
+  AfterControlStatement: false
+  AfterEnum:       true
+  AfterFunction:   true
+  AfterNamespace:  true
+  AfterObjCDeclaration: true
+  AfterStruct:     true
+  AfterUnion:      true
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:   
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments:  true
+SortIncludes:    false
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        4
+UseTab:          Never
+...
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,5 +11,4 @@
        "__nullptr": "cpp"
    },
    "python.pythonPath": "/home/dengnc/miniconda3/bin/python",
-    "jupyter.jupyterServerType": "local"
 }
\ No newline at end of file
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
-{
-    // See https://go.microsoft.com/fwlink/?LinkId=733558
-    // for the documentation about the tasks.json format
-    "version": "2.0.0",
-    "tasks": [
-        {
-            "label": "echo",
-            "type": "shell",
-            "command": "echo Hello",
-            "problemMatcher": [],
-            "group": {
-                "kind": "build",
-                "isDefault": true
-            }
-        }
-    ]
-}
\ No newline at end of file
--- a/batch_export_net.sh
+++ b/batch_export_net.sh
+#/usr/bin/bash
+datadir='data/__new/classroom_fovea_r360x80_t0.6'
+onnxdir="$datadir/eval_onnx"
+trtdir="$datadir/eval_trt"
+epochs=50
+if [ ! -d "$onnxdir" ]; then
+    echo "make directory for ONNX"
+    mkdir $onnxdir
+fi
+if [ ! -d "$trtdir" ]; then
+    echo "make directory for TensorRT"
+    mkdir $trtdir
+    mkdir $trtdir/time
+fi
+# nets: 1, 2, 4, 8
+# layers: 2, 4, 8
+# channels: 64 128 256 512 1024
+for n_nets in 1 2 4 8; do
+    for n_layers in 2 4 8; do
+        for nf in 64 128 256 512 1024; do
+            configid="eval@snerffast${n_nets}-rgb_e6_fc${nf}x${n_layers}_d1.00-7.00_s64_~p"
+            exportname="eval_${n_nets}x${nf}x${n_layers}"
+            pth_path="$datadir/$configid/model-epoch_$epochs.pth"
+            onnx_path="$onnxdir/$exportname.onnx"
+            trt_path="$trtdir/$exportname.trt"
+            time_perf_path="$trtdir/time/$exportname.json"
+            if [ -f "$pth_path" ]; then
+                if [ ! -f "$onnx_path" ]; then
+                    # Export ONNX model
+                    python tools/export_snerf_fast.py $pth_path -b 65536 -o $onnx_path
+                fi
+                if [ ! -f "$trt_path" ]; then
+                    # Export TensorRT engine
+                    trtexec --onnx=$onnx_path --fp16 --saveEngine=$trt_path --workspace=4096 --exportTimes=$time_perf_path --noDataTransfers
+                fi
+            fi
+        done
+    done
+done
\ No newline at end of file
--- a/batch_infer.sh
+++ b/batch_infer.sh
@@ -8,7 +8,7 @@ epochs=50
 # nets: 1, 2, 4, 8
 # layers: 2, 4, 8
-# channels: 128 256 512
+# channels: 64 128 256 512 1024
 n_nets_arr=(1 2 4 8 1 2 4 8 1 2 4 8)
 n_layers_arr=(2 2 2 2 4 4 4 4 8 8 8 8)
 n_nets=${n_nets_arr[$testcase]}
@@ -16,21 +16,22 @@ n_layers=${n_layers_arr[$testcase]}
 for nf in 64 128 256 512 1024; do
    configid="eval@snerffast${n_nets}-rgb_e6_fc${nf}x${n_layers}_d1.00-7.00_s64_~p"
-    if [ -f "$datadir/$configid/model-epoch_$epochs.pth" ]; then
+    if [ ! -f "$datadir/$configid/model-epoch_$epochs.pth" ]; then
-        continue
+        cont_epoch=0
-    fi
+        for ((i=$epochs-1;i>0;i--)) do
-    cont_epoch=0
+            if [ -f "$datadir/$configid/model-epoch_$i.pth" ]; then
-    for ((i=$epochs-1;i>0;i--)) do
+                cont_epoch=$i
-        if [ -f "$datadir/$configid/model-epoch_$i.pth" ]; then
+                break
-            cont_epoch=$i
+            fi
-            break
+        done
+        if [ ${cont_epoch} -gt 0 ]; then
+            python run_spherical_view_syn.py $trainset -e $epochs -m $configid/model-epoch_${cont_epoch}.pth
+        else
+            python run_spherical_view_syn.py $trainset -i $configid -e $epochs
        fi
-    done
-    if [ ${cont_epoch} -gt 0 ]; then
-        python run_spherical_view_syn.py $trainset -e $epochs -m $configid/model-epoch_${cont_epoch}.pth
-    else
-        python run_spherical_view_syn.py $trainset -i $configid -e $epochs
    fi
-    python run_spherical_view_syn.py $trainset -t -m $configid/model-epoch_$epochs.pth -o perf
+    if ! ls $datadir/$configid/output_$epochs/perf_r120x80* >/dev/null 2>&1; then
-    python run_spherical_view_syn.py $testset -t -m $configid/model-epoch_$epochs.pth -o perf
+        python run_spherical_view_syn.py $trainset -t -m $configid/model-epoch_$epochs.pth -o perf
-done
+        python run_spherical_view_syn.py $testset -t -m $configid/model-epoch_$epochs.pth -o perf
+    fi
+done
\ No newline at end of file
--- a/components/fnr.py
+++ b/components/fnr.py
@@ -14,7 +14,6 @@ class FoveatedNeuralRenderer(object):
                 layers_res: List[Tuple[int, int]],
                 layers_net: nn.ModuleList,
                 output_res: Tuple[int, int], *,
-                 using_mask=True,
                 device: torch.device = None):
        super().__init__()
        self.layers_net = layers_net.to(device=device)
@@ -34,7 +33,6 @@ class FoveatedNeuralRenderer(object):
            'normalized': True
        }, output_res, device=device)
        self.foveation = Foveation(layers_fov, layers_res, output_res, device=device)
-        self.layers_mask = self.foveation.get_layers_mask() if using_mask else None
        self.device = device
    def to(self, device: torch.device):
@@ -43,8 +41,6 @@ class FoveatedNeuralRenderer(object):
        self.cam.to(device)
        for cam in self.layers_cam:
            cam.to(device)
-        if self.layers_mask is not None:
-            self.layers_mask = self.layers_mask.to(device)
        self.device = device
        return self
@@ -52,32 +48,46 @@ class FoveatedNeuralRenderer(object):
        return self.render(*args, **kwds)
    def render(self, view: Trans, gaze, right_gaze=None, *,
-               stereo_disparity=0, ret_raw=False) -> Union[Mapping[str, torch.Tensor], Tuple[Mapping[str, torch.Tensor]]]:
+               stereo_disparity=0,
+               using_mask=True,
+               ret_raw=False) -> Union[Mapping[str, torch.Tensor], Tuple[Mapping[str, torch.Tensor]]]:
        if stereo_disparity > TINY_FLOAT:
            left_view = Trans(
-                view.trans_point(torch.tensor([-stereo_disparity / 2, 0, 0], device=view.device())),
+                view.trans_point(torch.tensor([-stereo_disparity / 2, 0, 0], device=self.device)),
                view.r)
            right_view = Trans(
-                view.trans_point(torch.tensor([stereo_disparity / 2, 0, 0], device=view.device())),
+                view.trans_point(torch.tensor([stereo_disparity / 2, 0, 0], device=self.device)),
                view.r)
            left_gaze = gaze
            right_gaze = gaze if right_gaze is None else right_gaze
+            left_layers_mask = self.foveation.get_layers_mask(left_gaze) \
+                if using_mask else [None] * 3
+            right_layers_mask = self.foveation.get_layers_mask(right_gaze) \
+                if using_mask else [None] * 3
            res_raw_left = [
-                self._render(i, left_view, left_gaze if i < 2 else None)['color']
+                self._render(self.layers_net[i], self.layers_cam[i], left_view,
+                             left_gaze if i < 2 else None,
+                             layer_mask=left_layers_mask[i])['color']
                for i in range(3)
            ]
            res_raw_right = [
-                self._render(i, right_view, right_gaze if i < 2 else None)['color']
+                self._render(self.layers_net[i], self.layers_cam[i], right_view,
+                             right_gaze if i < 2 else None,
+                             layer_mask=right_layers_mask[i])['color']
                for i in range(3)
            ]
            return self._gen_output(res_raw_left, left_gaze, ret_raw), \
                self._gen_output(res_raw_right, right_gaze, ret_raw)
        else:
+            layers_mask = self.foveation.get_layers_mask(gaze) if using_mask else None
            res_raw = [
-                self._render(i, view, gaze if i < 2 else None)['color']
+                self._render(self.layers_net[i], self.layers_cam[i], view, gaze if i < 2 else None,
+                             layer_mask=layers_mask[i] if layers_mask is not None else None)['color']
                for i in range(3)
            ]
            return self._gen_output(res_raw, gaze, ret_raw)
        '''
        if mono_trans != None and shift == 0:  # do warp
            fovea_depth[torch.isnan(fovea_depth)] = 50
@@ -105,25 +115,25 @@ class FoveatedNeuralRenderer(object):
        ], (gaze[0], gaze[1]), [0, shift, shift] if shift != 0 else None)
        '''
-    def _render(self, layer: int, view: Trans, gaze=None, ret_depth=False) -> Mapping[str, torch.Tensor]:
+    def _render(self, net, cam: CameraParam, view: Trans, gaze=None, *,
-        net = self.layers_net[layer]
+                ret_depth=False,
-        cam = self.layers_cam[layer]
+                layer_mask=None) -> Mapping[str, torch.Tensor]:
        if gaze is not None:
            cam = self._adjust_cam(cam, gaze)
-        rays_o, rays_d = cam.get_global_rays(view, True)  # (1, N, 3)
+        rays_o, rays_d = cam.get_global_rays(view, False)  # (1, H, W, 3)
-        if self.layers_mask is not None and layer < len(self.layers_mask):
+        if layer_mask is not None:
-            mask = self.layers_mask[layer] >= 0
+            infer_mask = layer_mask >= 0
-            rays_o = rays_o[:, mask]
+            rays_o = rays_o[:, infer_mask]
-            rays_d = rays_d[:, mask]
+            rays_d = rays_d[:, infer_mask]
            net_output = net(rays_o.view(-1, 3), rays_d.view(-1, 3), ret_depth=ret_depth)
            ret = {
-                'color': torch.zeros(1, cam.res[0], cam.res[1], 3)
+                'color': torch.zeros(1, cam.res[0], cam.res[1], 3, device=self.device)
            }
-            ret['color'][:, mask] = net_output['color']
+            ret['color'][:, infer_mask] = net_output['color']
            ret['color'] = ret['color'].permute(0, 3, 1, 2)
            if ret_depth:
                ret['depth'] = torch.zeros(1, cam.res[0], cam.res[1])
-                ret['depth'][:, mask] = net_output['depth']
+                ret['depth'][:, infer_mask] = net_output['depth']
            return ret
        else:
            net_output = net(rays_o.view(-1, 3), rays_d.view(-1, 3), ret_depth=ret_depth)
@@ -140,7 +150,7 @@ class FoveatedNeuralRenderer(object):
            'blended': blended
        }
        if ret_raw:
-            ret['layers_raw'] = layers_img,
+            ret['layers_raw'] = layers_img
            ret['blended_raw'] = self.foveation.synthesis(layers_img, gaze)
        return ret

--- a/components/foveation.py
+++ b/components/foveation.py
@@ -31,7 +31,7 @@ class Foveation(object):
    def synthesis(self, layers: List[torch.Tensor],
                  fovea_center: Tuple[float, float],
-                  shifts: List[int] = None) -> torch.Tensor:
+                  shifts: List[int] = None, do_blend=True) -> torch.Tensor:
        """
        Generate foveated retinal image by blending fovea layers
        **Note: current implementation only support two fovea layers**
@@ -55,8 +55,12 @@ class Foveation(object):
            if shifts != None:
                grid = img.horizontal_shift(grid, shifts[i], -2)
            # (1, 1, H:out, W:out)
-            blend = nn_f.grid_sample(self.eye_fovea_blend[i][None, None, ...], grid)
+            if do_blend:
-            output.mul_(1 - blend).add_(nn_f.grid_sample(layers[i], grid) * blend)
+                blend = nn_f.grid_sample(self.eye_fovea_blend[i][None, None], grid, align_corners=False)
+                output.mul_(1 - blend).add_(nn_f.grid_sample(layers[i], grid, align_corners=False) * blend)
+            else:
+                blend = nn_f.grid_sample(torch.ones_like(self.eye_fovea_blend[i][None, None]), grid, align_corners=False)
+                output.mul_(1 - blend).add_(nn_f.grid_sample(layers[i], grid, align_corners=False) * blend)
        return output
    def get_layer_size_in_final_image(self, i: int) -> int:
@@ -94,7 +98,7 @@ class Foveation(object):
        r = torch.norm(p - R, dim=2)  # (size, size, 2)
        return misc.smooth_step(R, R * self.blend, r)
-    def get_layers_mask(self) -> List[torch.Tensor]:
+    def get_layers_mask(self, gaze) -> List[torch.Tensor]:
        """
        Generate mask images for layers[:-1]
        the meaning of values in mask images:
@@ -106,15 +110,26 @@ class Foveation(object):
        :return: Mask images for layers except outermost
        """
        layers_mask = []
-        for i in range(self.n_layers - 1):
+        for i in range(self.n_layers):
            layers_mask.append(torch.ones(*self.layers_res[i], device=self.device) * -1)
-            r = torch.norm(misc.meshgrid(*self.layers_res[i], normalize=True).to(device=self.device) * 2 - 1, dim=-1)
+            if i == self.n_layers - 1:
+                c = torch.tensor([
+                    (gaze[0] + 0.5 * self.out_res[1]) / self.out_res[0],
+                    (gaze[1] + 0.5 * self.out_res[0]) / self.out_res[0]
+                ], device=self.device)
+            else:
+                c = torch.tensor([0.5, 0.5], device=self.device)
+            coord = misc.meshgrid(*self.layers_res[i]).to(device=self.device) / self.layers_res[i][0]
+            r = 2 * torch.norm(coord - c, dim=-1)
            inner_radius = self.get_source_layer_cover_size_in_target_layer(
-                self.layers_fov[i - 1], self.layers_fov[i],
+                self.layers_fov[i - 1], self.layers_fov[i], self.layers_res[i][0]) / self.layers_res[i][0] \
-                self.layers_res[i][0]) / self.layers_res[i][0] if i > 0 else 0
+                if i > 0 else 0
-            bounds = [inner_radius * (1 - self.blend), inner_radius, self.blend, 1]
+            if i == self.n_layers - 1:
+                bounds = [inner_radius * (1 - self.blend), inner_radius, 100, 100]
+            else:
+                bounds = [inner_radius * (1 - self.blend), inner_radius, self.blend, 1]
            for bi in range(len(bounds) - 1):
                region = torch.logical_and(r > bounds[bi], r <= bounds[bi + 1])
                layers_mask[i][region] = bi + \
                    (r[region] - bounds[bi]) / (bounds[bi + 1] - bounds[bi])
        return layers_mask
\ No newline at end of file
--- a/cpp/Makefile.config
+++ b/cpp/Makefile.config
@@ -128,7 +128,7 @@ endif
 #########################
 INCPATHS=
 LIBPATHS=
-COMMON_LIBS= -lGLEW -lglfw3 -lGL -lX11 -lpthread -lXrandr -lXinerama -lXcursor -lXi -ldl
+COMMON_LIBS= -lGLEW -lglfw -lGL -lX11 -lpthread -lXrandr #-lXinerama -lXcursor -lXi -ldl
 # Add extra libraries if TRT_STATIC is enabled
 ifeq ($(TRT_STATIC), 1)
@@ -207,7 +207,7 @@ else ifeq ($(TARGET), aarch64)
  endif
 endif
 ifeq ($(ENABLE_MYELIN), 1)
-  COMMON_LIBS += $(MYELIN_LIB) $(NVRTC_LIB)
+  #COMMON_LIBS += $(MYELIN_LIB) $(NVRTC_LIB)
 endif
 .SUFFIXES:

--- a/cpp/msl_infer/Common.h
+++ b/cpp/msl_infer/Common.h
-#pragma once
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#include <string>
-#include <sstream>
-#include <GL/glew.h>
-#include <cuda_gl_interop.h>
-#include "../glm/glm.hpp"
-#include "Logger.h"
-inline unsigned int getElementSize(nv::DataType t)
-{
-	switch (t)
-	{
-	case nv::DataType::kINT32:
-		return 4;
-	case nv::DataType::kFLOAT:
-		return 4;
-	case nv::DataType::kHALF:
-		return 2;
-	case nv::DataType::kBOOL:
-	case nv::DataType::kINT8:
-		return 1;
-	}
-	throw std::runtime_error("Invalid DataType.");
-	return 0;
-}
-template <typename T>
-void dumpRow(std::ostream &os, T* buf, size_t n)
-{
-	os << buf[0];
-	for (size_t i = 1; i < n; ++i) {
-		os << " " << buf[i];
-	}
-	os << std::endl;
-}
-template <typename T>
-void dumpHostBuffer(std::ostream &os, void *buf, size_t bufSize, size_t rowCount, size_t maxDumpRows = 0)
-{
-	T *typedBuf = static_cast<T *>(buf);
-	size_t numItems = bufSize / sizeof(T);
-	size_t nInLastRow = numItems % rowCount;
-	size_t rows;
-	if (nInLastRow == 0) {
-		rows = numItems / rowCount;
-		nInLastRow = rowCount;
-	} else {
-		rows = numItems / rowCount + 1;
-	}
-	if (maxDumpRows == 0) {
-		for (size_t i = 0; i < rows - 1; ++i) {
-			dumpRow(os, typedBuf, rowCount);
-			typedBuf += rowCount;
-		}
-		dumpRow(os, typedBuf, nInLastRow);
-	} else {
-		for (size_t i = 0; i < maxDumpRows / 2; ++i)
-			dumpRow(os, typedBuf + i * rowCount, rowCount);
-		os << "..." << std::endl;
-		for (size_t i = rows - maxDumpRows + maxDumpRows / 2; i < rows - 1; ++i)
-			dumpRow(os, typedBuf + i * rowCount, rowCount);
-		dumpRow(os, typedBuf + (rows - 1) * rowCount, nInLastRow);
-	}
-}
-class CudaStream
-{
-public:
-	CudaStream()
-	{
-		cudaStreamCreate(&stream);
-	}
-	operator cudaStream_t()
-	{
-		return stream;
-	}
-	virtual ~CudaStream()
-	{
-		cudaStreamDestroy(stream);
-	}
-private:
-	cudaStream_t stream;
-};
-class CudaEvent
-{
-public:
-	CudaEvent()
-	{
-		cudaEventCreate(&mEvent);
-	}
-	operator cudaEvent_t()
-	{
-		return mEvent;
-	}
-	virtual ~CudaEvent()
-	{
-		cudaEventDestroy(mEvent);
-	}
-private:
-	cudaEvent_t mEvent;
-};
-struct CudaMapScope
-{
-	std::vector<cudaGraphicsResource_t> resources_;
-	cudaStream_t stream_;
-	CudaMapScope(const std::vector<cudaGraphicsResource_t> &resources,
-				 cudaStream_t stream = nullptr) : resources_(resources), stream_(stream) {}
-	~CudaMapScope()
-	{
-		if (!resources_.empty())
-			cudaGraphicsUnmapResources(resources_.size(),
-									   resources_.data(), stream_);
-	}
-	cudaError_t map()
-	{
-		if (!resources_.empty())
-			return cudaGraphicsMapResources(resources_.size(),
-											resources_.data(), stream_);
-		return cudaSuccess;
-	}
-};
-template <typename T>
-struct Destroy
-{
-	void operator()(T *t)
-	{
-		if (t != nullptr)
-			t->destroy();
-	}
-};
-template <class T>
-using uptr = std::unique_ptr<T, ::Destroy<T>>;
-template <class T>
-using sptr = std::shared_ptr<T>;
-#define INTERVAL(__start__, __end__) (((__end__) - (__start__)) / (float)CLOCKS_PER_SEC * 1000)
-#include "Resource.h"
-#include "Formatter.h"
\ No newline at end of file
--- a/cpp/msl_infer/Encoder.cu
+++ b/cpp/msl_infer/Encoder.cu
 #include "Encoder.h"
-#include "thread_index.h"
+#include "../utils/cuda.h"
 /// idx3.z = 0: x, y, z, sin(x), sin(y), sin(z), cos(x), cos(y), cos(z)
 /// idx3.z = 1: sin(2x), sin(2y), sin(2z), cos(2x), cos(2y), cos(2z)
@@ -7,12 +7,11 @@
 /// idx3.z = n_freq-1: sin(2^(n_freq-1)x), sin(2^(n_freq-1)y), sin(2^(n_freq-1)z),
 ///                    cos(2^(n_freq-1)x), cos(2^(n_freq-1)y), cos(2^(n_freq-1)z)
 /// Dispatch (n_batch, n_chns, n_freqs)
-__global__ void cu_encode(float *output, float *input, float *freqs, uint n)
+__global__ void cu_encode(float *output, float *input, float *freqs, uint n) {
-{
    glm::uvec3 idx3 = IDX3;
    if (idx3.x >= n)
        return;
-    uint n = blockDim.x, inChns = blockDim.y, nFreqs = blockDim.z;
+    uint inChns = blockDim.y, nFreqs = blockDim.z;
    uint i = idx3.x, chn = idx3.y, freq = idx3.z;
    uint elem = i * inChns + chn;
    uint outChns = inChns * (nFreqs * 2 + 1);
@@ -26,16 +25,14 @@ __global__ void cu_encode(float *output, float *input, float *freqs, uint n)
    output[base + inChns * (freq * 2 + 2)] = c;
 }
-void Encoder::encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input)
+void Encoder::encode(sptr<CudaArray<float>> output, sptr<CudaArray<float>> input) {
-{
    dim3 blkSize(1024 / _chns / _multires, _chns, _multires);
    dim3 grdSize((uint)ceil(input->n() / (float)blkSize.x), 1, 1);
-    cu_encode<<<grdSize, blkSize>>>(output->getBuffer(), *input, *_freqs, input->n());
+    CU_INVOKE(cu_encode)(output->getBuffer<float>(), *input, *_freqs, input->n());
    CHECK_EX(cudaGetLastError());
 }
-void Encoder::_genFreqArray()
+void Encoder::_genFreqArray() {
-{
    float *arr = new float[_multires];
    arr[0] = 1.0f;
    for (auto i = 1; i < _multires; ++i)

--- a/cpp/msl_infer/Encoder.h
+++ b/cpp/msl_infer/Encoder.h
 #pragma once
-#include "Common.h"
+#include "../utils/common.h"
 class Encoder {
 public:
@@ -14,5 +14,4 @@ private:
    sptr<CudaArray<float>> _freqs;
    void _genFreqArray();
 };
\ No newline at end of file
--- a/cpp/msl_infer/Enhancement.cu
+++ b/cpp/msl_infer/Enhancement.cu
 #include "Enhancement.h"
-#include "thread_index.h"
+#include "../utils/cuda.h"
 #define max(__a__, __b__) (__a__ > __b__ ? __a__ : __b__)
 #define min(__a__, __b__) (__a__ < __b__ ? __a__ : __b__)

--- a/cpp/msl_infer/Enhancement.h
+++ b/cpp/msl_infer/Enhancement.h
 #pragma once
-#include "Common.h"
+#include "../utils/common.h"
 class Enhancement
 {

--- a/cpp/msl_infer/InferPipeline.cpp
+++ b/cpp/msl_infer/InferPipeline.cpp
 #include "InferPipeline.h"
 #include "Nmsl2.h"
-InferPipeline::InferPipeline(
+InferPipeline::InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay, glm::vec2 depthRange,
-    const std::string &netDir, bool isNmsl, uint batchSize,
+                             int encodeDim, int coordChns)
-    uint samples) : _batchSize(batchSize),
+    : _nRays(nRays),
-                    _samples(samples),
+      _nSamplesPerRay(nSamplesPerRay),
-                    _sampler(new Sampler({1.0f, 50.0f}, samples)),
+      _net(net),
-                    _encoder(new Encoder(10, 3)),
+      _sampler(new Sampler(depthRange, nSamplesPerRay, coordChns == 3)),
-                    _renderer(new Renderer()),
+      _encoder(new Encoder(encodeDim, coordChns)),
-                    _net(isNmsl ? new Nmsl2(batchSize, samples) : new Msl(batchSize, samples))
+      _renderer(new Renderer()) {
-{
+    uint nSamples = _nRays * _nSamplesPerRay;
-    uint batchSizeForNet = _batchSize * _samples;
+    _coords = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * coordChns));
-    _sphericalCoords = sptr<CudaArray<glm::vec3>>(new CudaArray<glm::vec3>(batchSizeForNet));
+    _depths = sptr<CudaArray<float>>(new CudaArray<float>(nSamples));
-    _depths = sptr<CudaArray<float>>(new CudaArray<float>(batchSizeForNet));
+    _encoded = sptr<CudaArray<float>>(new CudaArray<float>(nSamples * _encoder->outDim()));
-    _encoded = sptr<CudaArray<float>>(new CudaArray<float>(batchSizeForNet * _encoder->outDim()));
+    _layeredColors = sptr<CudaArray<glm::vec4>>(new CudaArray<glm::vec4>(nSamples));
-    _layeredColors = sptr<CudaArray<glm::vec4>>(new CudaArray<glm::vec4>(batchSizeForNet));
-    _net->load(netDir);
    _net->bindResources(_encoded.get(), _depths.get(), _layeredColors.get());
 }
-void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors,
+void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays,
-                        sptr<CudaArray<glm::vec3>> rays,
+                        glm::vec3 origin, bool showPerf) {
-                        glm::vec3 rayOrigin, bool showPerf)
-{
    CudaEvent eStart, eSampled, eEncoded, eInferred, eRendered;
    cudaEventRecord(eStart);
-    _sampler->sampleOnRays(_sphericalCoords, _depths, rays, rayOrigin);
+    _sampler->sampleOnRays(_coords, _depths, rays, origin);
    cudaEventRecord(eSampled);
-    sptr<CudaArray<float>> coords(new CudaArray<float>((float *)_sphericalCoords->getBuffer(),
+    _encoder->encode(_encoded, _coords);
-                                                       _sphericalCoords->n() * 3));
-    _encoder->encode(_encoded, coords);
    cudaEventRecord(eEncoded);
@@ -46,8 +40,7 @@ void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors,
    cudaEventRecord(eRendered);
-    if (showPerf)
+    if (showPerf) {
-    {
        CHECK_EX(cudaDeviceSynchronize());
        float timeTotal, timeSample, timeEncode, timeInfer, timeRender;
@@ -59,34 +52,34 @@ void InferPipeline::run(sptr<CudaArray<glm::vec4>> o_colors,
        std::ostringstream sout;
        sout << "Infer pipeline: " << timeTotal << "ms (Sample: " << timeSample
-             << "ms, Encode: " << timeEncode << "ms, Infer: "
+             << "ms, Encode: " << timeEncode << "ms, Infer: " << timeInfer
-             << timeInfer << "ms, Render: " << timeRender << "ms)";
+             << "ms, Render: " << timeRender << "ms)";
        Logger::instance.info(sout.str());
    }
    /*
-	{
+    {
-		std::ostringstream sout;
+        std::ostringstream sout;
-		sout << "Rays:" << std::endl;
+        sout << "Rays:" << std::endl;
-		dumpFloatArray(sout, *rays, 10);
+        dumpFloatArray(sout, *rays, 10);
-		Logger::instance.info(sout.str());
+        Logger::instance.info(sout.str());
-	}
+    }
-	{
+    {
-		std::ostringstream sout;
+        std::ostringstream sout;
-		sout << "Spherical coords:" << std::endl;
+        sout << "Spherical coords:" << std::endl;
-		dumpFloatArray(sout, *sphericalCoords, 10);
+        dumpFloatArray(sout, *sphericalCoords, 10);
-		Logger::instance.info(sout.str());
+        Logger::instance.info(sout.str());
-	}
+    }
-	{
+    {
-		std::ostringstream sout;
+        std::ostringstream sout;
-		sout << "Depths:" << std::endl;
+        sout << "Depths:" << std::endl;
-		dumpFloatArray(sout, *depths, 10);
+        dumpFloatArray(sout, *depths, 10);
-		Logger::instance.info(sout.str());
+        Logger::instance.info(sout.str());
-	}
+    }
-	{
+    {
-		std::ostringstream sout;
+        std::ostringstream sout;
-		sout << "Encoded:" << std::endl;
+        sout << "Encoded:" << std::endl;
-		dumpFloatArray(sout, *encoded, 10, encoder.outDim());
+        dumpFloatArray(sout, *encoded, 10, encoder.outDim());
-		Logger::instance.info(sout.str());
+        Logger::instance.info(sout.str());
-	}
+    }
-	*/
+    */
 }
\ No newline at end of file
--- a/cpp/msl_infer/InferPipeline.h
+++ b/cpp/msl_infer/InferPipeline.h
 #pragma once
-#include "Common.h"
+#include "../utils/common.h"
 #include "../msl_infer/Sampler.h"
 #include "../msl_infer/Encoder.h"
 #include "../msl_infer/Renderer.h"
 #include "../msl_infer/Msl.h"
-class InferPipeline
+class InferPipeline {
-{
+  public:
-public:
+    InferPipeline(sptr<Msl> net, uint nRays, uint nSamplesPerRay,
-    InferPipeline(const std::string &netDir, bool isNmsl, uint batchSize, uint samples);
+                  glm::vec2 depthRange, int encodeDim, int coordChns);
-    void run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays,
+    void run(sptr<CudaArray<glm::vec4>> o_colors, sptr<CudaArray<glm::vec3>> rays, glm::vec3 origin,
-             glm::vec3 rayOrigin, bool showPerf = false);
+             bool showPerf = false);
-private:
+  private:
-    uint _batchSize;
+    uint _nRays;
-    uint _samples;
+    uint _nSamplesPerRay;
+    sptr<Msl> _net;
    sptr<Sampler> _sampler;
    sptr<Encoder> _encoder;
    sptr<Renderer> _renderer;
-    sptr<Msl> _net;
+    sptr<CudaArray<float>> _coords;
-    sptr<CudaArray<glm::vec3>> _sphericalCoords;
    sptr<CudaArray<float>> _depths;
    sptr<CudaArray<float>> _encoded;
    sptr<CudaArray<glm::vec4>> _layeredColors;

--- a/cpp/msl_infer/Msl.cpp
+++ b/cpp/msl_infer/Msl.cpp
 #include "Msl.h"
 #include <time.h>
-Msl::Msl(int batchSize, int samples) : batchSize(batchSize), samples(samples), net(nullptr) {}
+Msl::Msl() : net(nullptr) {}
-bool Msl::load(const std::string &netDir)
+bool Msl::load(const std::string &netPath) {
-{
    net = new Net();
-    if (!net->load(netDir + "msl.trt"))
+    if (net->load(netPath))
-        return false;
+        return true;
-    return true;
+    dispose();
+    return false;
 }
-void Msl::bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors)
+void Msl::bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors) {
-{
    net->bindResource("Encoded", resEncoded);
    net->bindResource("Depths", resDepths);
    net->bindResource("Colors", resColors);
 }
-bool Msl::infer()
+bool Msl::infer() { return net->infer(); }
-{
-    if (!net->infer())
-        return false;
-    return true;
-}
-bool Msl::dispose()
+void Msl::dispose() {
-{
+    if (net != nullptr) {
-    if (net != nullptr)
-    {
        net->dispose();
        delete net;
        net = nullptr;
    }
-    return true;
 }
--- a/cpp/msl_infer/Msl.h
+++ b/cpp/msl_infer/Msl.h
 #pragma once
-#include "Common.h"
+#include "../utils/common.h"
 #include "Net.h"
-class Msl
+class Msl {
-{
 public:
-	int batchSize;
-	int samples;
    Net *net;
-	Msl(int batchSize, int samples);
+    Msl();
-	virtual bool load(const std::string &netDir);
+    virtual bool load(const std::string &netDir);
+    virtual void bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors);
-	virtual void bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors);
+    virtual bool infer();
+    virtual void dispose();
-	virtual bool infer();
-	virtual bool dispose();
 };
--- a/cpp/msl_infer/Net.cpp
+++ b/cpp/msl_infer/Net.cpp
-#include "half.h"
+#include "../utils/half.h"
 #include "Net.h"
 #include <fstream>
 #include <numeric>

--- a/cpp/msl_infer/Net.h
+++ b/cpp/msl_infer/Net.h
 #pragma once
-#include "Common.h"
+#include "../utils/common.h"
 class Net {

--- a/cpp/msl_infer/Nmsl2.cpp
+++ b/cpp/msl_infer/Nmsl2.cpp
 #include "Nmsl2.h"
 #include <time.h>
-Nmsl2::Nmsl2(int batchSize, int samples) : Msl(batchSize, samples),
+Nmsl2::Nmsl2(int batchSize, int samples)
-										   resRaw1(nullptr), resRaw2(nullptr),
+    : batchSize(batchSize),
-										   fcNet1(nullptr), fcNet2(nullptr), catNet(nullptr) {}
+      samples(samples),
+      resRaw1(nullptr),
+      resRaw2(nullptr),
+      fcNet1(nullptr),
+      fcNet2(nullptr),
+      catNet(nullptr) {}
-bool Nmsl2::load(const std::string &netDir)
+bool Nmsl2::load(const std::string &netDir) {
-{
+    fcNet1 = new Net();
-	fcNet1 = new Net();
+    fcNet2 = new Net();
-	fcNet2 = new Net();
+    catNet = new Net();
-	catNet = new Net();
+    if (!fcNet1->load(netDir + "fc1.trt") || !fcNet2->load(netDir + "fc2.trt") ||
-	if (!fcNet1->load(netDir + "fc1.trt") || !fcNet2->load(netDir + "fc2.trt") ||
+        !catNet->load(netDir + "cat.trt"))
-		!catNet->load(netDir + "cat.trt"))
+        return false;
-		return false;
+    resRaw1 = sptr<Resource>(new CudaBuffer(batchSize * samples / 2 * sizeof(float4)));
-	resRaw1 = sptr<Resource>(new CudaBuffer(batchSize * samples / 2 * sizeof(float4)));
+    resRaw2 = sptr<Resource>(new CudaBuffer(batchSize * samples / 2 * sizeof(float4)));
-	resRaw2 = sptr<Resource>(new CudaBuffer(batchSize * samples / 2 * sizeof(float4)));
+    return true;
-	return true;
 }
-void Nmsl2::bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors)
+void Nmsl2::bindResources(Resource *resEncoded, Resource *resDepths, Resource *resColors) {
-{
+    fcNet1->bindResource("Encoded", resEncoded);
-	fcNet1->bindResource("Encoded", resEncoded);
+    fcNet1->bindResource("Raw", resRaw1.get());
-	fcNet1->bindResource("Raw", resRaw1.get());
+    fcNet2->bindResource("Encoded", resEncoded);
-	fcNet2->bindResource("Encoded", resEncoded);
+    fcNet2->bindResource("Raw", resRaw2.get());
-	fcNet2->bindResource("Raw", resRaw2.get());
+    catNet->bindResource("Raw1", resRaw1.get());
-	catNet->bindResource("Raw1", resRaw1.get());
+    catNet->bindResource("Raw2", resRaw2.get());
-	catNet->bindResource("Raw2", resRaw2.get());
+    catNet->bindResource("Depths", resDepths);
-	catNet->bindResource("Depths", resDepths);
+    catNet->bindResource("Colors", resColors);
-	catNet->bindResource("Colors", resColors);
 }
-bool Nmsl2::infer()
+bool Nmsl2::infer() {
-{
+    // CudaStream stream1, stream2;
-	//CudaStream stream1, stream2;
+    if (!fcNet1->infer())
-	if (!fcNet1->infer())
+        return false;
-		return false;
+    if (!fcNet2->infer())
-	if (!fcNet2->infer())
+        return false;
-		return false;
+    if (!catNet->infer())
-	if (!catNet->infer())
+        return false;
-		return false;
+    return true;
-	return true;
 }
-bool Nmsl2::dispose()
+void Nmsl2::dispose() {
-{
+    if (fcNet1 != nullptr) {
-	if (fcNet1 != nullptr)
+        fcNet1->dispose();
-	{
+        delete fcNet1;
-		fcNet1->dispose();
+        fcNet1 = nullptr;
-		delete fcNet1;
+    }
-		fcNet1 = nullptr;
+    if (fcNet2 != nullptr) {
-	}
+        fcNet2->dispose();
-	if (fcNet2 != nullptr)
+        delete fcNet2;
-	{
+        fcNet2 = nullptr;
-		fcNet2->dispose();
+    }
-		delete fcNet2;
+    if (catNet != nullptr) {
-		fcNet2 = nullptr;
+        catNet->dispose();
-	}
+        delete catNet;
-	if (catNet != nullptr)
+        catNet = nullptr;
-	{
+    }
-		catNet->dispose();
+    resRaw1 = nullptr;
-		delete catNet;
+    resRaw2 = nullptr;
-		catNet = nullptr;
-	}
-	resRaw1 = nullptr;
-	resRaw2 = nullptr;
-	return true;
 }