Titan X Pascal NCCL testing on SYS-4028GR-TR

Notice: Page may contain affiliate links for which we may earn a small commission through services like Amazon Affiliates or Skimlinks.

dhenzjhen

Member
Sep 14, 2016
38
55
18
San Jose, California
Motherboard: X10DRG-O / Product Name: SYS-4028GR-TR
BIOS: 7/27/2016
IPMI: 3.44
CPU: E5-2689 v4 3.1GHz x 2
Memory: Samsung 16GB x 24
GPU: Nvidia Titan X Pascal x 10
OS: Red Hat Linux 7.2 x64
Driver: CUDA 8.0.44
Software: NCCL https://github.com/NVIDIA/nccl/archive/v1.2.3-1+cuda8.0.tar.gz


[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_reduce_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)

# out-of-place in-place
# bytes N type op time algbw busbw res time algbw busbw res
10000000 10000000 char sum 2.978 3.36 6.04 0e+00 2.990 3.34 6.02 0e+00
10000000 10000000 char prod 2.972 3.36 6.06 0e+00 3.600 2.78 5.00 0e+00
10000000 10000000 char max 2.971 3.37 6.06 0e+00 3.003 3.33 5.99 0e+00
10000000 10000000 char min 2.975 3.36 6.05 0e+00 2.999 3.33 6.00 0e+00
10000000 2500000 int sum 3.009 3.32 5.98 0e+00 3.038 3.29 5.93 0e+00
10000000 2500000 int prod 3.015 3.32 5.97 0e+00 3.045 3.28 5.91 0e+00
10000000 2500000 int max 3.015 3.32 5.97 0e+00 3.018 3.31 5.96 0e+00
10000000 2500000 int min 3.025 3.31 5.95 0e+00 3.028 3.30 5.95 0e+00
10000000 5000000 half sum 2.974 3.36 6.05 2e-02 2.980 3.36 6.04 2e-02
10000000 5000000 half prod 2.965 3.37 6.07 6e-04 2.988 3.35 6.02 6e-04
10000000 5000000 half max 2.965 3.37 6.07 0e+00 2.986 3.35 6.03 0e+00
10000000 5000000 half min 2.968 3.37 6.06 0e+00 2.980 3.36 6.04 0e+00
10000000 2500000 float sum 3.012 3.32 5.98 2e-06 3.021 3.31 5.96 2e-06
10000000 2500000 float prod 2.999 3.33 6.00 6e-08 3.031 3.30 5.94 6e-08
10000000 2500000 float max 3.012 3.32 5.98 0e+00 3.026 3.30 5.95 0e+00
10000000 2500000 float min 3.002 3.33 6.00 0e+00 3.013 3.32 5.97 0e+00
10000000 1250000 double sum 3.029 3.30 5.94 0e+00 3.040 3.29 5.92 0e+00
10000000 1250000 double prod 3.019 3.31 5.96 6e-17 3.059 3.27 5.88 6e-17
10000000 1250000 double max 3.016 3.32 5.97 0e+00 3.038 3.29 5.93 0e+00
10000000 1250000 double min 3.004 3.33 5.99 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 sum 3.023 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 prod 3.020 3.31 5.96 0e+00 3.034 3.30 5.93 0e+00
10000000 1250000 int64 max 3.020 3.31 5.96 0e+00 3.054 3.27 5.89 0e+00
10000000 1250000 int64 min 3.015 3.32 5.97 0e+00 3.033 3.30 5.93 0e+00
10000000 1250000 uint64 sum 3.019 3.31 5.96 0e+00 3.036 3.29 5.93 0e+00
10000000 1250000 uint64 prod 3.017 3.31 5.97 0e+00 3.039 3.29 5.92 0e+00
10000000 1250000 uint64 max 3.019 3.31 5.96 0e+00 3.037 3.29 5.93 0e+00
10000000 1250000 uint64 min 3.016 3.32 5.97 0e+00 3.070 3.26 5.86 0e+00



[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/all_gather_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)

# bytes N type time algbw busbw delta
10000000 10000000 char 14.514 6.20 6.20 0e+00
10000000 2500000 int 14.555 6.18 6.18 0e+00
10000000 5000000 half 14.540 6.19 6.19 0e+00
10000000 2500000 float 14.553 6.18 6.18 0e+00
10000000 1250000 double 14.553 6.18 6.18 0e+00
10000000 1250000 int64 14.571 6.18 6.18 0e+00
10000000 1250000 uint64 14.566 6.18 6.18 0e+00
 

dhenzjhen

Member
Sep 14, 2016
38
55
18
San Jose, California
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/broadcast_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)

# bytes N type root time algbw busbw delta
10000000 10000000 char 0 1.678 5.96 5.96 0e+00
10000000 10000000 char 1 1.709 5.85 5.85 0e+00
10000000 10000000 char 2 1.726 5.79 5.79 0e+00
10000000 10000000 char 3 1.733 5.77 5.77 0e+00
10000000 10000000 char 4 1.737 5.76 5.76 0e+00
10000000 10000000 char 5 1.730 5.78 5.78 0e+00
10000000 10000000 char 6 1.745 5.73 5.73 0e+00
10000000 10000000 char 7 1.754 5.70 5.70 0e+00
10000000 10000000 char 8 1.735 5.76 5.76 0e+00
10000000 10000000 char 9 1.693 5.91 5.91 0e+00
10000000 2500000 int 0 1.675 5.97 5.97 0e+00
10000000 2500000 int 1 1.703 5.87 5.87 0e+00
10000000 2500000 int 2 1.730 5.78 5.78 0e+00
10000000 2500000 int 3 1.729 5.78 5.78 0e+00
10000000 2500000 int 4 1.731 5.78 5.78 0e+00
10000000 2500000 int 5 1.721 5.81 5.81 0e+00
10000000 2500000 int 6 1.738 5.75 5.75 0e+00
10000000 2500000 int 7 1.744 5.73 5.73 0e+00
10000000 2500000 int 8 1.726 5.79 5.79 0e+00
10000000 2500000 int 9 1.687 5.93 5.93 0e+00
10000000 5000000 half 0 1.675 5.97 5.97 0e+00
10000000 5000000 half 1 2.304 4.34 4.34 0e+00
10000000 5000000 half 2 1.725 5.80 5.80 0e+00
10000000 5000000 half 3 1.724 5.80 5.80 0e+00
10000000 5000000 half 4 1.731 5.78 5.78 0e+00
10000000 5000000 half 5 1.726 5.79 5.79 0e+00
10000000 5000000 half 6 1.738 5.75 5.75 0e+00
10000000 5000000 half 7 1.736 5.76 5.76 0e+00
10000000 5000000 half 8 1.727 5.79 5.79 0e+00
10000000 5000000 half 9 1.692 5.91 5.91 0e+00
10000000 2500000 float 0 1.680 5.95 5.95 0e+00
10000000 2500000 float 1 1.713 5.84 5.84 0e+00
10000000 2500000 float 2 1.714 5.84 5.84 0e+00
10000000 2500000 float 3 1.727 5.79 5.79 0e+00
10000000 2500000 float 4 1.733 5.77 5.77 0e+00
10000000 2500000 float 5 1.734 5.77 5.77 0e+00
10000000 2500000 float 6 1.740 5.75 5.75 0e+00
10000000 2500000 float 7 1.748 5.72 5.72 0e+00
10000000 2500000 float 8 1.727 5.79 5.79 0e+00
10000000 2500000 float 9 1.686 5.93 5.93 0e+00
10000000 1250000 double 0 1.679 5.95 5.95 0e+00
10000000 1250000 double 1 1.703 5.87 5.87 0e+00
10000000 1250000 double 2 1.728 5.79 5.79 0e+00
10000000 1250000 double 3 1.721 5.81 5.81 0e+00
10000000 1250000 double 4 1.729 5.79 5.79 0e+00
10000000 1250000 double 5 1.726 5.79 5.79 0e+00
10000000 1250000 double 6 1.737 5.76 5.76 0e+00
10000000 1250000 double 7 1.741 5.74 5.74 0e+00
10000000 1250000 double 8 1.725 5.80 5.80 0e+00
10000000 1250000 double 9 1.688 5.93 5.93 0e+00
10000000 1250000 int64 0 1.674 5.97 5.97 0e+00
10000000 1250000 int64 1 1.698 5.89 5.89 0e+00
10000000 1250000 int64 2 1.727 5.79 5.79 0e+00
10000000 1250000 int64 3 1.730 5.78 5.78 0e+00
10000000 1250000 int64 4 1.735 5.76 5.76 0e+00
10000000 1250000 int64 5 1.727 5.79 5.79 0e+00
10000000 1250000 int64 6 1.743 5.74 5.74 0e+00
10000000 1250000 int64 7 1.758 5.69 5.69 0e+00
10000000 1250000 int64 8 1.732 5.77 5.77 0e+00
10000000 1250000 int64 9 1.692 5.91 5.91 0e+00
10000000 1250000 uint64 0 1.676 5.97 5.97 0e+00
10000000 1250000 uint64 1 1.698 5.89 5.89 0e+00
10000000 1250000 uint64 2 1.730 5.78 5.78 0e+00
10000000 1250000 uint64 3 1.726 5.80 5.80 0e+00
10000000 1250000 uint64 4 1.731 5.78 5.78 0e+00
10000000 1250000 uint64 5 1.724 5.80 5.80 0e+00
10000000 1250000 uint64 6 1.739 5.75 5.75 0e+00
10000000 1250000 uint64 7 1.746 5.73 5.73 0e+00
10000000 1250000 uint64 8 1.726 5.79 5.79 0e+00
10000000 1250000 uint64 9 1.688 5.93 5.93 0e+00
 

dhenzjhen

Member
Sep 14, 2016
38
55
18
San Jose, California
[root@103-124 nccl-1.2.3-1-cuda8.0]# ./build/test/single/reduce_scatter_test 10000000
# Using devices
# Rank 0 uses device 0 [0x04] TITAN X (Pascal)
# Rank 1 uses device 1 [0x05] TITAN X (Pascal)
# Rank 2 uses device 2 [0x06] TITAN X (Pascal)
# Rank 3 uses device 3 [0x07] TITAN X (Pascal)
# Rank 4 uses device 4 [0x08] TITAN X (Pascal)
# Rank 5 uses device 5 [0x0b] TITAN X (Pascal)
# Rank 6 uses device 6 [0x0c] TITAN X (Pascal)
# Rank 7 uses device 7 [0x0d] TITAN X (Pascal)
# Rank 8 uses device 8 [0x0e] TITAN X (Pascal)
# Rank 9 uses device 9 [0x0f] TITAN X (Pascal)

# out-of-place in-place
# bytes N type op time algbw busbw delta time algbw busbw delta
10000000 10000000 char sum 15.041 0.66 5.98 0e+00 15.115 0.66 5.95 0e+00
10000000 10000000 char prod 15.002 0.67 6.00 0e+00 15.008 0.67 6.00 0e+00
10000000 10000000 char max 15.028 0.67 5.99 0e+00 15.045 0.66 5.98 0e+00
10000000 10000000 char min 15.016 0.67 5.99 0e+00 15.073 0.66 5.97 1e+02
10000000 2500000 int sum 14.750 0.68 6.10 2e+09 14.759 0.68 6.10 2e+09
10000000 2500000 int prod 14.923 0.67 6.03 2e+09 14.942 0.67 6.02 2e+09
10000000 2500000 int max 14.742 0.68 6.11 2e+09 14.762 0.68 6.10 2e+09
10000000 2500000 int min 14.768 0.68 6.09 2e+09 14.790 0.68 6.09 2e+09
10000000 5000000 half sum 14.981 0.67 6.01 2e-02 15.059 0.66 5.98 2e-02
10000000 5000000 half prod 15.000 0.67 6.00 7e-04 15.025 0.67 5.99 7e-04
10000000 5000000 half max 15.087 0.66 5.97 0e+00 15.105 0.66 5.96 0e+00
10000000 5000000 half min 15.077 0.66 5.97 0e+00 15.098 0.66 5.96 0e+00
10000000 2500000 float sum 14.769 0.68 6.09 3e+00 14.800 0.68 6.08 2e+00
10000000 2500000 float prod 14.770 0.68 6.09 6e-02 14.773 0.68 6.09 3e-02
10000000 2500000 float max 14.953 0.67 6.02 5e-01 14.983 0.67 6.01 5e-01
10000000 2500000 float min 14.994 0.67 6.00 4e-01 15.025 0.67 5.99 5e-01
10000000 1250000 double sum 14.836 0.67 6.07 3e+00 14.860 0.67 6.06 3e+00
10000000 1250000 double prod 14.840 0.67 6.06 5e-02 14.868 0.67 6.05 8e-02
10000000 1250000 double max 14.981 0.67 6.01 4e-01 15.034 0.67 5.99 3e-01
10000000 1250000 double min 14.930 0.67 6.03 5e-01 14.947 0.67 6.02 5e-01
10000000 1250000 int64 sum 14.892 0.67 6.04 1e+16 14.911 0.67 6.04 8e+15
10000000 1250000 int64 prod 14.940 0.67 6.02 0e+00 14.960 0.67 6.02 0e+00
10000000 1250000 int64 max 14.906 0.67 6.04 3e+15 14.962 0.67 6.02 3e+15
10000000 1250000 int64 min 14.901 0.67 6.04 4e+15 14.957 0.67 6.02 3e+15
10000000 1250000 uint64 sum 14.909 0.67 6.04 2e+19 14.927 0.67 6.03 2e+19
10000000 1250000 uint64 prod 14.961 0.67 6.02 0e+00 14.956 0.67 6.02 0e+00
10000000 1250000 uint64 max 14.911 0.67 6.04 2e+19 14.933 0.67 6.03 2e+19
10000000 1250000 uint64 min 14.952 0.67 6.02 1e+18 15.014 0.67 5.99 4e+15