# Performance of Randperm on CPU/GPU

In [1]:
from common import *
from utils.profile import debug_profile
from utils.mem_profiler import MemProfiler

with debug_profile("Random perm on host"):
 torch.randperm(1024 * 1024 * 100)

with debug_profile("Random perm on device"),\
 MemProfiler("Random perm on host", device="cuda:3"):
 torch.randperm(1024 * 1024 * 100, device="cuda:3")


Node Random perm on host: host duration 6384.1ms, device duration 6384.4ms
Node Random perm on device: host duration 2525.0ms, device duration 2525.0ms


# \_\_getattribute\_\_ Method

In [3]:
class A(object):


 def __init__(self, a, **extra) -> None:
 super().__init__()
 self.a = a
 self.extra = extra

 def __getattribute__(self, __name: str):
 try:
 return super().__getattribute__(__name)
 except AttributeError as e:
 try:
 return self.extra[__name]
 except KeyError:
 pass
 err = e
 raise err


a = A(a=1, b=2)


print("a.a:", a.a)
print("a.b:", a.b)
print("a.c:", a.c)

a.a: 1
a.b: 2


AttributeError: 'A' object has no attribute 'c'

# Performance of Various Select/Scatter Methods

In [4]:
from common import *
from time import time

a = torch.zeros(200000, device="cuda")
i = torch.randint(0, a.shape[0], [500000], device="cuda")
start = time()
for _ in range(100):
 a[i] += 1
end = time()
print(end - start, a.max())
start = time()
for _ in range(100):
 a.index_add_(0, i, torch.ones_like(i, dtype=torch.float))
end = time()
print(end - start, a.max())

start = time()
for _ in range(100):
 ui, n = i.unique(return_counts=True)
 a[ui] += n
end = time()
print(end - start, a.max())


a = torch.rand(2000, 2000, device="cuda") - .5
m = a > 0

start = time()
for _ in range(100):
 b = a[m]
end = time()
print(end - start, b.min())

start = time()
for _1 in range(20):
 m1 = m.nonzero(as_tuple=True)
 for _ in range(5):
 b = a[m1]
end = time()
print(end - start, b.min())


c = torch.rand_like(b)

start = time()
for _ in range(100):
 a[m] = c
end = time()
print("Mask set", end - start)

start = time()
for _ in range(100):
 a.masked_scatter_(m, c)
end = time()
print("Inplace mask scatter", end - start)


start = time()
for _ in range(100):
 a = a.masked_scatter(m, c)
end = time()
print("Mask scatter", end - start)

start = time()
for _1 in range(20):
 m1 = m.nonzero(as_tuple=True)
 for _ in range(5):
 a[m1] = b
end = time()
print("Index set", end - start)


start = time()
for _1 in range(20):
 m1 = m.nonzero(as_tuple=True)
 for _ in range(5):
 a.index_put_(m1, b)
end = time()
print("Index put", end - start)

0.010159730911254883 tensor(100., device='cuda:0')
0.011237859725952148 tensor(1400., device='cuda:0')
0.032263755798339844 tensor(2700., device='cuda:0')
0.02148723602294922 tensor(4.1723e-07, device='cuda:0')
0.009927511215209961 tensor(4.1723e-07, device='cuda:0')
Mask set 0.02173590660095215
Inplace mask scatter 0.0041882991790771484
Mask scatter 0.00580906867980957
Index set 0.03358888626098633
Index put 0.01044917106628418
