Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"initialization error" on every operation #10

Open
hoiwanchang opened this issue Jun 19, 2024 · 3 comments
Open

"initialization error" on every operation #10

hoiwanchang opened this issue Jun 19, 2024 · 3 comments

Comments

@hoiwanchang
Copy link

I'm using Debian 12 with two GeForce GTX 1080 Ti GPUs.

  • NVIDIA driver information
nvidia-smi --version                              
NVIDIA-SMI version  : 555.42.02
NVML version        : 555.42
DRIVER version      : 555.42.02
CUDA Version        : 12.5
  • Kernel information
Linux 1080x2-ws 6.1.0-17-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.1.69-1 (2023-12-30) x86_64 GNU/Linux

When I try to toggle or checkpoint a running Mathematica or Microsoft Edge process, cuda-checkpoint fails with: 'Could not checkpoint on process ID 17146: "initialization error"'

strace output of the failing cuda-checkpoint invocation:
execve("/usr/local/bin/cuda-checkpoint", ["cuda-checkpoint", "--action", "checkpoint", "--pid", "17146"], 0x7fffdab86680 /* 29 vars */) = 0
brk(NULL)                               = 0x25a9000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fe2d8000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=154626, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 154626, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fb5fe2b2000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libcuda.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0@\376\n\0\0\0\0\0"..., 832) = 832
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=28094872, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 28517280, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fc600000
mprotect(0x7fb5fc6af000, 26615808, PROT_NONE) = 0
mmap(0x7fb5fc6af000, 4759552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xaf000) = 0x7fb5fc6af000
mmap(0x7fb5fcb39000, 21852160, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x539000) = 0x7fb5fcb39000
mmap(0x7fb5fe011000, 765952, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1a10000) = 0x7fb5fe011000
mmap(0x7fb5fe0cc000, 418720, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fb5fe0cc000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20t\2\0\0\0\0\0"..., 832) = 832
pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
newfstatat(3, "", {st_mode=S_IFREG|0755, st_size=1922136, ...}, AT_EMPTY_PATH) = 0
pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
mmap(NULL, 1970000, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fc41f000
mmap(0x7fb5fc445000, 1396736, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x26000) = 0x7fb5fc445000
mmap(0x7fb5fc59a000, 339968, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17b000) = 0x7fb5fc59a000
mmap(0x7fb5fc5ed000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1ce000) = 0x7fb5fc5ed000
mmap(0x7fb5fc5f3000, 53072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fb5fc5f3000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libm.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=907784, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 909560, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fe1d3000
mmap(0x7fb5fe1e3000, 471040, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x10000) = 0x7fb5fe1e3000
mmap(0x7fb5fe256000, 368640, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x83000) = 0x7fb5fe256000
mmap(0x7fb5fe2b0000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xdc000) = 0x7fb5fe2b0000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=14480, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 16400, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fe1ce000
mmap(0x7fb5fe1cf000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0x7fb5fe1cf000
mmap(0x7fb5fe1d0000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1d0000
mmap(0x7fb5fe1d1000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1d1000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=14480, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 16400, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fe1c9000
mmap(0x7fb5fe1ca000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0x7fb5fe1ca000
mmap(0x7fb5fe1cb000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1cb000
mmap(0x7fb5fe1cc000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1cc000
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/librt.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=14640, ...}, AT_EMPTY_PATH) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fe1c7000
mmap(NULL, 16416, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fb5fe1c2000
mmap(0x7fb5fe1c3000, 4096, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0x7fb5fe1c3000
mmap(0x7fb5fe1c4000, 4096, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1c4000
mmap(0x7fb5fe1c5000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x7fb5fe1c5000
close(3)                                = 0
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fe1bf000
arch_prctl(ARCH_SET_FS, 0x7fb5fe1bf740) = 0
set_tid_address(0x7fb5fe1bfa10)         = 22406
set_robust_list(0x7fb5fe1bfa20, 24)     = 0
rseq(0x7fb5fe1c0060, 0x20, 0, 0x53053053) = 0
mprotect(0x7fb5fc5ed000, 16384, PROT_READ) = 0
mprotect(0x7fb5fe1c5000, 4096, PROT_READ) = 0
mprotect(0x7fb5fe1cc000, 4096, PROT_READ) = 0
mprotect(0x7fb5fe1d1000, 4096, PROT_READ) = 0
mprotect(0x7fb5fe2b0000, 4096, PROT_READ) = 0
mprotect(0x7fb5fe011000, 94208, PROT_READ) = 0
mprotect(0x600000, 4096, PROT_READ)     = 0
mprotect(0x7fb5fe30a000, 8192, PROT_READ) = 0
prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
munmap(0x7fb5fe2b2000, 154626)          = 0
sched_get_priority_max(SCHED_RR)        = 99
sched_get_priority_min(SCHED_RR)        = 1
getrandom("\x51\x55\x0f\xc4\xa3\x09\x66\x10", 8, GRND_NONBLOCK) = 8
brk(NULL)                               = 0x25a9000
brk(0x25ca000)                          = 0x25ca000
getpid()                                = 22406
openat(AT_FDCWD, "/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 3
read(3, "0-15\n", 1024)                 = 5
close(3)                                = 0
openat(AT_FDCWD, "/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 3
read(3, "0-15\n", 1024)                 = 5
close(3)                                = 0
sched_getaffinity(22406, 8, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]) = 8
openat(AT_FDCWD, "/proc/sys/vm/mmap_min_addr", O_RDONLY) = 3
newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(3, "65536\n", 1024)                = 6
close(3)                                = 0
openat(AT_FDCWD, "/proc/cpuinfo", O_RDONLY) = 3
newfstatat(3, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(3, "processor\t: 0\nvendor_id\t: Genuin"..., 1024) = 1024
read(3, "p clflushopt clwb intel_pt avx51"..., 1024) = 1024
close(3)                                = 0
openat(AT_FDCWD, "/proc/self/maps", O_RDONLY) = 3
newfstatat(3, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(3, "00400000-00401000 r-xp 00000000 "..., 1024) = 1024
read(3, "0 fe:00 5376284                 "..., 1024) = 1024
read(3, "0 4742499                    /us"..., 1024) = 1024
read(3, "fe1ce000-7fb5fe1cf000 r--p 00000"..., 1024) = 1024
read(3, ":00 4729436                    /"..., 1024) = 957
read(3, "", 1024)                       = 0
close(3)                                = 0
openat(AT_FDCWD, "/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 3
read(3, "0-15\n", 1024)                 = 5
close(3)                                = 0
getpid()                                = 22406
eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK)   = 3
fcntl(3, F_SETFL, O_RDONLY|O_NONBLOCK)  = 0
rt_sigaction(SIGRT_1, {sa_handler=0x7fb5fc4a56a0, sa_mask=[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x7fb5fc45b050}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fb5fbc1e000
mprotect(0x7fb5fbc1f000, 8388608, PROT_READ|PROT_WRITE) = 0
rt_sigprocmask(SIG_BLOCK, ~[], [], 8)   = 0
clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fb5fc41e990, parent_tid=0x7fb5fc41e990, exit_signal=0, stack=0x7fb5fbc1e000, stack_size=0x7fff80, tls=0x7fb5fc41e6c0} => {parent_tid=[22407]}, 88) = 22407
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
pipe2([4, 5], O_CLOEXEC)                = 0
pipe2([6, 7], O_CLOEXEC)                = 0
openat(AT_FDCWD, "/proc/self/task/22407/comm", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 8
newfstatat(8, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
write(8, "cuda00001400006", 15)         = 15
close(8)                                = 0
write(3, "\1\0\0\0\0\0\0\0", 8)         = 8
futex(0x7fb5fe0d43f4, FUTEX_WAKE_PRIVATE, 2147483647) = 0
openat(AT_FDCWD, "/dev/shm/cuda_injection_path_shm", O_RDWR|O_NOFOLLOW|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/root/.nv/nvidia-application-profile-globals-rc", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/root/.nv/nvidia-application-profiles-rc", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/root/.nv/nvidia-application-profiles-rc.d", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/nvidia/nvidia-application-profiles-rc", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/nvidia/nvidia-application-profiles-rc.d/", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/nvidia/nvidia-application-profiles-555.42.02-rc", O_RDONLY) = 8
fstat(8, {st_mode=S_IFREG|0644, st_size=18852, ...}) = 0
newfstatat(8, "", {st_mode=S_IFREG|0644, st_size=18852, ...}, AT_EMPTY_PATH) = 0
read(8, "# Application profiles for the N"..., 16384) = 16384
read(8, "    { \"pattern\": { \"feature\":\"cm"..., 4096) = 2468
brk(0x25eb000)                          = 0x25eb000
brk(0x260c000)                          = 0x260c000
close(8)                                = 0
openat(AT_FDCWD, "/usr/share/nvidia/nvidia-application-profiles-rc", O_RDONLY) = -1 ENOENT (No such file or directory)
getpid()                                = 22406
readlink("/proc/22406/exe", "/usr/local/bin/cuda-checkpoint", 4095) = 30
openat(AT_FDCWD, "/proc/self/cmdline", O_RDONLY) = 8
newfstatat(8, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(8, "cuda-checkpoint\0--action\0checkpo"..., 4096) = 48
read(8, "", 3072)                       = 0
close(8)                                = 0
getpid()                                = 22406
geteuid()                               = 0
socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0) = 8
setsockopt(8, SOL_SOCKET, SO_PASSCRED, [1], 4) = 0
connect(8, {sa_family=AF_UNIX, sun_path="/tmp/nvidia-mps/control"}, 26) = -1 ENOENT (No such file or directory)
close(8)                                = 0
readlink("/proc", 0x7fffdbcca4c0, 1023) = -1 EINVAL (Invalid argument)
readlink("/proc/self", "22406", 1023)   = 5
readlink("/proc/22406", 0x7fffdbcca4c0, 1023) = -1 EINVAL (Invalid argument)
readlink("/proc/22406/exe", "/usr/local/bin/cuda-checkpoint", 1023) = 30
readlink("/usr", 0x7fffdbcca4c0, 1023)  = -1 EINVAL (Invalid argument)
readlink("/usr/local", 0x7fffdbcca4c0, 1023) = -1 EINVAL (Invalid argument)
readlink("/usr/local/bin", 0x7fffdbcca4c0, 1023) = -1 EINVAL (Invalid argument)
readlink("/usr/local/bin/cuda-checkpoint", 0x7fffdbcca4c0, 1023) = -1 EINVAL (Invalid argument)
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fe2b7000
openat(AT_FDCWD, "/proc/modules", O_RDONLY) = 8
newfstatat(8, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(8, "raw_diag 16384 0 - Live 0xffffff"..., 1024) = 1024
read(8, "lg 36864 6 algif_hash,algif_skci"..., 1024) = 1024
read(8, "xffffffffc1299000\nipt_REJECT 163"..., 1024) = 1024
read(8, "c1124000\nsunrpc 692224 1 - Live "..., 1024) = 1024
read(8, "84 1 kvm, Live 0xffffffffc12da00"..., 1024) = 1024
read(8, "i 20480 0 - Live 0xffffffffc120e"..., 1024) = 1024
read(8, "d_pcm, Live 0xffffffffc4f4e000\nd"..., 1024) = 1024
close(8)                                = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 8
newfstatat(8, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(8, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(8)                                = 0
stat("/dev/nvidiactl", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0xff), ...}) = 0
stat("/dev/nvidiactl", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0xff), ...}) = 0
unlink("/dev/char/195:255")             = 0
symlink("../nvidiactl", "/dev/char/195:255") = 0
openat(AT_FDCWD, "/dev/nvidiactl", O_RDWR) = 8
fcntl(8, F_SETFD, FD_CLOEXEC)           = 0
openat(AT_FDCWD, "/sys/devices/system/memory/block_size_bytes", O_RDONLY) = 9
read(9, "80000000\n", 99)               = 9
close(9)                                = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xd6, 0x8), 0x7fffdbccaa60) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc8, 0x900), 0x7fb5fe11aae0) = 0
stat("/proc/driver/nvidia/gpus/0000:17:00.0/numa_status", 0x7fffdbcca9d0) = -1 ENOENT (No such file or directory)
stat("/proc/driver/nvidia/gpus/0000:73:00.0/numa_status", 0x7fffdbcca9d0) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 9
newfstatat(9, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(9, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(9)                                = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x20), 0x7fffdbccabb0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc9ab0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x29, 0x10), 0x7fffdbccab90) = 0
close(8)                                = 0
openat(AT_FDCWD, "/proc/modules", O_RDONLY) = 8
newfstatat(8, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(8, "raw_diag 16384 0 - Live 0xffffff"..., 1024) = 1024
read(8, "lg 36864 6 algif_hash,algif_skci"..., 1024) = 1024
read(8, "xffffffffc1299000\nipt_REJECT 163"..., 1024) = 1024
read(8, "c1124000\nsunrpc 692224 1 - Live "..., 1024) = 1024
read(8, "84 1 kvm, Live 0xffffffffc12da00"..., 1024) = 1024
read(8, "i 20480 0 - Live 0xffffffffc120e"..., 1024) = 1024
read(8, "d_pcm, Live 0xffffffffc4f4e000\nd"..., 1024) = 1024
close(8)                                = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 8
newfstatat(8, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(8, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(8)                                = 0
stat("/dev/nvidiactl", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0xff), ...}) = 0
stat("/dev/nvidiactl", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0xff), ...}) = 0
unlink("/dev/char/195:255")             = 0
symlink("../nvidiactl", "/dev/char/195:255") = 0
openat(AT_FDCWD, "/dev/nvidiactl", O_RDWR) = 8
fcntl(8, F_SETFD, FD_CLOEXEC)           = 0
openat(AT_FDCWD, "/sys/devices/system/memory/block_size_bytes", O_RDONLY) = 9
read(9, "80000000\n", 99)               = 9
close(9)                                = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xd6, 0x8), 0x7fffdbccb020) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc8, 0x900), 0x7fb5fe11aae0) = 0
stat("/proc/driver/nvidia/gpus/0000:17:00.0/numa_status", 0x7fffdbccaf90) = -1 ENOENT (No such file or directory)
stat("/proc/driver/nvidia/gpus/0000:73:00.0/numa_status", 0x7fffdbccaf90) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 9
newfstatat(9, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(9, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(9)                                = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x20), 0x7fffdbccb170) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcca030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcca030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbccb160) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcca040) = 0
openat(AT_FDCWD, "/sys/devices/system/cpu/online", O_RDONLY|O_CLOEXEC) = 9
read(9, "0-15\n", 1024)                 = 5
close(9)                                = 0
openat(AT_FDCWD, "/proc/self/status", O_RDONLY) = 9
newfstatat(9, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(9, "Name:\tcuda-checkpoint\nUmask:\t002"..., 1024) = 1024
read(9, "ms_allowed:\t00000000,00000000,00"..., 1024) = 382
close(9)                                = 0
openat(AT_FDCWD, "/sys/devices/system/node", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 9
newfstatat(9, "", {st_mode=S_IFDIR|0755, st_size=0, ...}, AT_EMPTY_PATH) = 0
getdents64(9, 0x25bc1a0 /* 12 entries */, 32768) = 392
openat(AT_FDCWD, "/sys/devices/system/node/node0/cpumap", O_RDONLY) = 10
newfstatat(10, "", {st_mode=S_IFREG|0444, st_size=4096, ...}, AT_EMPTY_PATH) = 0
read(10, "0f0f\n", 4096)                = 5
close(10)                               = 0
openat(AT_FDCWD, "/sys/devices/system/node/node1/cpumap", O_RDONLY) = 10
newfstatat(10, "", {st_mode=S_IFREG|0444, st_size=4096, ...}, AT_EMPTY_PATH) = 0
read(10, "f0f0\n", 4096)                = 5
close(10)                               = 0
getdents64(9, 0x25bc1a0 /* 0 entries */, 32768) = 0
close(9)                                = 0
futex(0x7fb5fe0d4ea0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
get_mempolicy([MPOL_DEFAULT], [0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000, 0000000000000000], 1024, NULL, 0) = 0
openat(AT_FDCWD, "/proc/modules", O_RDONLY) = 9
newfstatat(9, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(9, "raw_diag 16384 0 - Live 0xffffff"..., 1024) = 1024
close(9)                                = 0
openat(AT_FDCWD, "/proc/devices", O_RDONLY) = 9
newfstatat(9, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(9, "Character devices:\n  1 mem\n  4 /"..., 1024) = 750
close(9)                                = 0
stat("/dev/nvidia-uvm", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xea, 0), ...}) = 0
stat("/dev/nvidia-uvm", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xea, 0), ...}) = 0
unlink("/dev/char/234:0")               = 0
symlink("../nvidia-uvm", "/dev/char/234:0") = 0
stat("/dev/nvidia-uvm-tools", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xea, 0x1), ...}) = 0
stat("/dev/nvidia-uvm-tools", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xea, 0x1), ...}) = 0
unlink("/dev/char/234:1")               = 0
symlink("../nvidia-uvm-tools", "/dev/char/234:1") = 0
openat(AT_FDCWD, "/dev/nvidia-uvm", O_RDWR|O_CLOEXEC) = 9
fcntl(9, F_GETFD)                       = 0x1 (flags FD_CLOEXEC)
openat(AT_FDCWD, "/dev/nvidia-uvm", O_RDWR|O_CLOEXEC) = 10
fcntl(10, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(9, _IOC(_IOC_NONE, 0, 0x1, 0x3000), 0x7fffdbccb170) = 0
ioctl(10, _IOC(_IOC_NONE, 0, 0x4b, 0), 0x7fffdbccb170) = 0
close(10)                               = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x27, 0), 0x7fffdbccb170) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc81e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7a10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8120) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 10
newfstatat(10, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(10, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(10)                               = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
unlink("/dev/char/195:0")               = 0
symlink("../nvidia0", "/dev/char/195:0") = 0
openat(AT_FDCWD, "/dev/nvidia0", O_RDWR|O_CLOEXEC) = 10
fcntl(10, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8120) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 11
newfstatat(11, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(11, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(11)                               = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
unlink("/dev/char/195:1")               = 0
symlink("../nvidia1", "/dev/char/195:1") = 0
openat(AT_FDCWD, "/dev/nvidia1", O_RDWR|O_CLOEXEC) = 11
fcntl(11, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x29, 0x10), 0x7fffdbcc9370) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8160) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8160) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 12
newfstatat(12, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(12, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(12)                               = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
unlink("/dev/char/195:0")               = 0
symlink("../nvidia0", "/dev/char/195:0") = 0
openat(AT_FDCWD, "/dev/nvidia0", O_RDWR|O_CLOEXEC) = 12
fcntl(12, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(12, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc9, 0x4), 0x7fffdbcc92e0) = 0
ioctl(12, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xd7, 0x230), 0x7fffdbcc9060) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc93b0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8140) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8250) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8150) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8150) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 13
newfstatat(13, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(13, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(13)                               = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
stat("/dev/nvidia0", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0), ...}) = 0
unlink("/dev/char/195:0")               = 0
symlink("../nvidia0", "/dev/char/195:0") = 0
openat(AT_FDCWD, "/dev/nvidia0", O_RDWR|O_CLOEXEC) = 13
fcntl(13, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(13, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc9, 0x4), 0x7fffdbcc92d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc93e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc9400) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8130) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7c30) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc81e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7850) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7040) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7690) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f00) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7c10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f20) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7fc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7e20) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7ff0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f70) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc5a50) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc6b00) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8100) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f50) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80a0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc91f0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc91f0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8130) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7ff0) = 0
openat(AT_FDCWD, "/sys/bus/pci/devices/0000:17:00.0/numa_node", O_RDONLY) = 14
newfstatat(14, "", {st_mode=S_IFREG|0644, st_size=4096, ...}, AT_EMPTY_PATH) = 0
read(14, "0\n", 4096)                   = 2
close(14)                               = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x29, 0x10), 0x7fffdbcc9370) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8160) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8160) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8160) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 14
newfstatat(14, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(14, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(14)                               = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
unlink("/dev/char/195:1")               = 0
symlink("../nvidia1", "/dev/char/195:1") = 0
openat(AT_FDCWD, "/dev/nvidia1", O_RDWR|O_CLOEXEC) = 14
fcntl(14, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(14, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc9, 0x4), 0x7fffdbcc92e0) = 0
ioctl(14, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xd7, 0x230), 0x7fffdbcc9060) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc93b0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8140) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8250) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8150) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8150) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8150) = 0
openat(AT_FDCWD, "/proc/driver/nvidia/params", O_RDONLY) = 15
newfstatat(15, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(15, "ResmanDebugLevel: 4294967295\nRmL"..., 1024) = 945
close(15)                               = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
stat("/dev/nvidia1", {st_mode=S_IFCHR|0666, st_rdev=makedev(0xc3, 0x1), ...}) = 0
unlink("/dev/char/195:1")               = 0
symlink("../nvidia1", "/dev/char/195:1") = 0
openat(AT_FDCWD, "/dev/nvidia1", O_RDWR|O_CLOEXEC) = 15
fcntl(15, F_GETFD)                      = 0x1 (flags FD_CLOEXEC)
ioctl(15, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0xc9, 0x4), 0x7fffdbcc92d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc93e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc9400) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8130) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7c30) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc81e0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7850) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7040) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7690) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f00) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7dc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7c10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f20) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7fc0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7e20) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8000) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7ff0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f70) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc5a50) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc6b00) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8100) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7f50) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80a0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc91f0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbcc91f0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc80d0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8030) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8130) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7ff0) = 0
openat(AT_FDCWD, "/sys/bus/pci/devices/0000:73:00.0/numa_node", O_RDONLY) = 16
newfstatat(16, "", {st_mode=S_IFREG|0644, st_size=4096, ...}, AT_EMPTY_PATH) = 0
read(16, "0\n", 4096)                   = 2
close(16)                               = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc82e0) = 0
mmap(NULL, 100663296, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5ee000000
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc9440) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc9440) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc9440) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc9440) = 0
mmap(NULL, 2465792, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fb9c4000
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8590) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8590) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8590) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8e10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7610) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc8e10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fffdbcc7610) = 0
mmap(NULL, 659456, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fb923000
mkdir("/root", 0700)                    = -1 EEXIST (File exists)
mkdir("/root/.nv", 0700)                = -1 EEXIST (File exists)
mkdir("/root/.nv/ComputeCache", 0700)   = -1 EEXIST (File exists)
sysinfo({uptime=1672, loads=[34912, 45312, 37088], totalram=268889485312, freeram=252441673728, sharedram=199385088, bufferram=825442304, totalswap=63999832064, freeswap=63999832064, procs=2738, totalhigh=0, freehigh=0, mem_unit=1}) = 0
uname({sysname="Linux", nodename="1080x2-ws", ...}) = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x25, 0), 0x7fffdbccace0) = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x37, 0), 0x7fffdbccad00) = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x25, 0), 0x7fffdbccace0) = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x37, 0), 0x7fffdbccad00) = 0
ioctl(9, _IOC(_IOC_NONE, 0, 0x17, 0), 0x7fffdbccb1b0) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbccaf10) = 0
ioctl(8, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2b, 0x30), 0x7fffdbccaf10) = 0
prlimit64(0, RLIMIT_AS, NULL, {rlim_cur=RLIM64_INFINITY, rlim_max=RLIM64_INFINITY}) = 0
mmap(0x200000000, 8592031744, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x200000000
mmap(NULL, 536866816, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5ce001000
munmap(0x7fb5ce001000, 33550336)        = 0
munmap(0x7fb5e0000000, 234881024)       = 0
mmap(NULL, 16912384, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5fa902000
mmap(NULL, 16912384, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb5f98e1000
getpid()                                = 22406
stat("/proc/22406/ns/pid", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
stat("/proc/22406/ns/pid", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0) = 16
unlink("")                              = -1 ENOENT (No such file or directory)
bind(16, {sa_family=AF_UNIX, sun_path=@"cuda-uvmfd-4026531836-22406\0"}, 31) = 0
listen(16, 128)                         = 0
write(3, "\1\0\0\0\0\0\0\0", 8)         = 8
getpid()                                = 22406
openat(AT_FDCWD, "/proc/17146/task", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 17
newfstatat(17, "", {st_mode=S_IFDIR|0555, st_size=0, ...}, AT_EMPTY_PATH) = 0
getdents64(17, 0x25a9d40 /* 12 entries */, 32768) = 368
openat(AT_FDCWD, "/proc/17146/task/17146/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "msedge\n", 1024)              = 7
read(18, "", 1024)                      = 0
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17178/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "GpuWatchdog\n", 1024)         = 12
read(18, "", 1024)                      = 0
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17179/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "ThreadPoolServi\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17180/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "ThreadPoolForeg\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17181/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "Chrome_ChildIOT\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17182/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "ThreadPoolForeg\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17183/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "VizCompositorTh\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/17264/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "ThreadPoolSingl\n", 1024)     = 16
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/19336/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "msedge\n", 1024)              = 7
read(18, "", 1024)                      = 0
close(18)                               = 0
openat(AT_FDCWD, "/proc/17146/task/19337/comm", O_RDONLY) = 18
newfstatat(18, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(18, "msedge\n", 1024)              = 7
read(18, "", 1024)                      = 0
close(18)                               = 0
getdents64(17, 0x25a9d40 /* 0 entries */, 32768) = 0
close(17)                               = 0
write(2, "Could not checkpoint on process "..., 65Could not checkpoint on process ID 17146: "initialization error"
) = 65
exit_group(1)                           = ?
+++ exited with 1 +++
@jesus-ramos
Copy link
Collaborator

Based on the strace output, it looks like CUDA is more than likely running as a subprocess of what you're trying to checkpoint. If you use the --get-restore-tid flag, it should return a thread ID if the target PID is a CUDA process, or an error otherwise. You can also do a similar check with --get-state: it should return running|locked|checkpointed if CUDA is running, or complain similarly otherwise.

Full process tree support is currently being worked on, but it is available as a PR on the main CRIU GitHub repository if you'd like to try it: checkpoint-restore/criu#2416

@xinranwang17
Copy link

I also encountered this error during restoration. Is there any BKM to debug it?

$nvidia-smi --version
NVIDIA-SMI version  : 550.90.07
NVML version        : 550.90
DRIVER version      : 550.90.07
CUDA Version        : 12.4

@jesus-ramos
Copy link
Collaborator

I also encountered this error during restoration. Is there any BKM to debug it?

$nvidia-smi --version
NVIDIA-SMI version  : 550.90.07
NVML version        : 550.90
DRIVER version      : 550.90.07
CUDA Version        : 12.4

Can you provide a sample app and the calls to cuda-checkpoint for me to take a look at? During restore, was cuda-checkpoint also run as root?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants