Motherboard: X10DRG-O / Product Name: SYS-4028GR-TR
BIOS: 7/27/2016
IPMI: 3.44
CPU: E5 2689 3.1GHz V4 x 2
Memory: Samsung 16GB x 24
GPU: NVIDIA Titan X Pascal x 10
OS: Red Hat Enterprise Linux 7.2 x64
Driver: CUDA 8.0.44
Software: NCCL https://github.com/NVIDIA/nccl/archive/v1.2.3-1+cuda8.0.tar.gz
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_reduce_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)
# out-of-place in-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 2.978 3.36 6.04 0e+00 2.990 3.34 6.02 0e+00
10000000 10000000 char prod 2.972 3.36 6.06 0e+00 3.600 2.78 5.00 0e+00
10000000 10000000 char max 2.971 3.37 6.06 0e+00 3.003 3.33 5.99 0e+00
10000000 10000000 char min 2.975 3.36 6.05 0e+00 2.999 3.33 6.00 0e+00
10000000 2500000 int sum 3.009 3.32 5.98 0e+00 3.038 3.29 5.93 0e+00
10000000 2500000 int prod 3.015 3.32 5.97 0e+00 3.045 3.28 5.91 0e+00
10000000 2500000 int max 3.015 3.32 5.97 0e+00 3.018 3.31 5.96 0e+00
10000000 2500000 int min 3.025 3.31 5.95 0e+00 3.028 3.30 5.95 0e+00
10000000 5000000 half sum 2.974 3.36 6.05 2e-02 2.980 3.36 6.04 2e-02
10000000 5000000 half prod 2.965 3.37 6.07 6e-04 2.988 3.35 6.02 6e-04
10000000 5000000 half max 2.965 3.37 6.07 0e+00 2.986 3.35 6.03 0e+00
10000000 5000000 half min 2.968 3.37 6.06 0e+00 2.980 3.36 6.04 0e+00
10000000 2500000 float sum 3.012 3.32 5.98 2e-06 3.021 3.31 5.96 2e-06
10000000 2500000 float prod 2.999 3.33 6.00 6e-08 3.031 3.30 5.94 6e-08
10000000 2500000 float max 3.012 3.32 5.98 0e+00 3.026 3.30 5.95 0e+00
10000000 2500000 float min 3.002 3.33 6.00 0e+00 3.013 3.32 5.97 0e+00
10000000 1250000 double sum 3.029 3.30 5.94 0e+00 3.040 3.29 5.92 0e+00
10000000 1250000 double prod 3.019 3.31 5.96 6e-17 3.059 3.27 5.88 6e-17
10000000 1250000 double max 3.016 3.32 5.97 0e+00 3.038 3.29 5.93 0e+00
10000000 1250000 double min 3.004 3.33 5.99 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 sum 3.023 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 prod 3.020 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 max 3.020 3.31 5.96 0e+00 3.054 3.27 5.89 0e+00
10000000 1250000 int64 min 3.015 3.32 5.97 0e+00 3.033 3.30 5.93 0e+00
10000000 1250000 uint64 sum 3.019 3.31 5.96 0e+00 3.036 3.29 5.93 0e+00
10000000 1250000 uint64 prod 3.017 3.31 5.97 0e+00 3.039 3.29 5.92 0e+00
10000000 1250000 uint64 max 3.019 3.31 5.96 0e+00 3.037 3.29 5.93 0e+00
10000000 1250000 uint64 min 3.016 3.32 5.97 0e+00 3.070 3.26 5.86 0e+00
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_gather_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)
# bytes N type time algbw busbw delta
10000000 10000000 char 14.514 6.20 6.20 0e+00
10000000 2500000 int 14.555 6.18 6.18 0e+00
10000000 5000000 half 14.540 6.19 6.19 0e+00
10000000 2500000 float 14.553 6.18 6.18 0e+00
10000000 1250000 double 14.553 6.18 6.18 0e+00
10000000 1250000 int64 14.571 6.18 6.18 0e+00
10000000 1250000 uint64 14.566 6.18 6.18 0e+00
BIOS: 7/27/2016
IPMI: 3.44
CPU: E5 2689 3.1GHz V4 x 2
Memory: Samsung 16GB x 24
GPU: NVIDIA Titan X Pascal x 10
OS: Red Hat Enterprise Linux 7.2 x64
Driver: CUDA 8.0.44
Software: NCCL https://github.com/NVIDIA/nccl/archive/v1.2.3-1+cuda8.0.tar.gz
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_reduce_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)
# out-of-place in-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 2.978 3.36 6.04 0e+00 2.990 3.34 6.02 0e+00
10000000 10000000 char prod 2.972 3.36 6.06 0e+00 3.600 2.78 5.00 0e+00
10000000 10000000 char max 2.971 3.37 6.06 0e+00 3.003 3.33 5.99 0e+00
10000000 10000000 char min 2.975 3.36 6.05 0e+00 2.999 3.33 6.00 0e+00
10000000 2500000 int sum 3.009 3.32 5.98 0e+00 3.038 3.29 5.93 0e+00
10000000 2500000 int prod 3.015 3.32 5.97 0e+00 3.045 3.28 5.91 0e+00
10000000 2500000 int max 3.015 3.32 5.97 0e+00 3.018 3.31 5.96 0e+00
10000000 2500000 int min 3.025 3.31 5.95 0e+00 3.028 3.30 5.95 0e+00
10000000 5000000 half sum 2.974 3.36 6.05 2e-02 2.980 3.36 6.04 2e-02
10000000 5000000 half prod 2.965 3.37 6.07 6e-04 2.988 3.35 6.02 6e-04
10000000 5000000 half max 2.965 3.37 6.07 0e+00 2.986 3.35 6.03 0e+00
10000000 5000000 half min 2.968 3.37 6.06 0e+00 2.980 3.36 6.04 0e+00
10000000 2500000 float sum 3.012 3.32 5.98 2e-06 3.021 3.31 5.96 2e-06
10000000 2500000 float prod 2.999 3.33 6.00 6e-08 3.031 3.30 5.94 6e-08
10000000 2500000 float max 3.012 3.32 5.98 0e+00 3.026 3.30 5.95 0e+00
10000000 2500000 float min 3.002 3.33 6.00 0e+00 3.013 3.32 5.97 0e+00
10000000 1250000 double sum 3.029 3.30 5.94 0e+00 3.040 3.29 5.92 0e+00
10000000 1250000 double prod 3.019 3.31 5.96 6e-17 3.059 3.27 5.88 6e-17
10000000 1250000 double max 3.016 3.32 5.97 0e+00 3.038 3.29 5.93 0e+00
10000000 1250000 double min 3.004 3.33 5.99 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 sum 3.023 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 prod 3.020 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 max 3.020 3.31 5.96 0e+00 3.054 3.27 5.89 0e+00
10000000 1250000 int64 min 3.015 3.32 5.97 0e+00 3.033 3.30 5.93 0e+00
10000000 1250000 uint64 sum 3.019 3.31 5.96 0e+00 3.036 3.29 5.93 0e+00
10000000 1250000 uint64 prod 3.017 3.31 5.97 0e+00 3.039 3.29 5.92 0e+00
10000000 1250000 uint64 max 3.019 3.31 5.96 0e+00 3.037 3.29 5.93 0e+00
10000000 1250000 uint64 min 3.016 3.32 5.97 0e+00 3.070 3.26 5.86 0e+00
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_gather_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)
# bytes N type time algbw busbw delta
10000000 10000000 char 14.514 6.20 6.20 0e+00
10000000 2500000 int 14.555 6.18 6.18 0e+00
10000000 5000000 half 14.540 6.19 6.19 0e+00
10000000 2500000 float 14.553 6.18 6.18 0e+00
10000000 1250000 double 14.553 6.18 6.18 0e+00
10000000 1250000 int64 14.571 6.18 6.18 0e+00
10000000 1250000 uint64 14.566 6.18 6.18 0e+00