This is a weird one. I boot up a blade with a ConnectX-2 in it. Everything seems to be working just fine for about five minutes. (Maybe 10; I haven't really timed it.) Then, poof. The card is "gone". What gives? This node is supposed to be my subnet manager. Not having a card makes it pretty difficult to do that! It's CentOS 7.4 running OpenSM.
Code:
[root@central tim]# ibhosts
Ca : 0x0008f104039a5b70 ports 2 "esxi-08 HCA-1"
Ca : 0x0008f104039a6d5c ports 2 "esxi-06 HCA-1"
Ca : 0x0008f104039a65f8 ports 2 "esxi-05 HCA-1"
Ca : 0x0008f104039a5f70 ports 2 "central mlx4_0"
[root@central tim]# ibdiagnet
Loading IBDIAGNET from: /usr/lib64/ibdiagnet1.5.7
-W- Topology file is not specified.
Reports regarding cluster links will use direct routes.
Loading IBDM from: /usr/lib64/ibdm1.5.7
-I- Using port 1 as the local port.
-I- Discovering ... 5 nodes (1 Switches & 4 CA-s) discovered.
-I---------------------------------------------------
-I- Bad Guids/LIDs Info
-I---------------------------------------------------
-I- No bad Guids were found
-I---------------------------------------------------
-I- Links With Logical State = INIT
-I---------------------------------------------------
-I- No bad Links (with logical state = INIT) were found
-I---------------------------------------------------
-I- General Device Info
-I---------------------------------------------------
-I---------------------------------------------------
-I- PM Counters Info
-I---------------------------------------------------
-I- No illegal PM counters values were found
-I---------------------------------------------------
-I- Fabric Partitions Report (see ibdiagnet.pkey for a full hosts list)
-I---------------------------------------------------
-I- PKey:0x7fff Hosts:7 full:7 limited:0
-I---------------------------------------------------
-I- IPoIB Subnets Check
-I---------------------------------------------------
-I- Subnet: IPv4 PKey:0x7fff QKey:0x00000b1b MTU:2048Byte rate:10Gbps SL:0x00
-W- Suboptimal rate for group. Lowest member rate:40Gbps > group-rate:10Gbps
-I---------------------------------------------------
-I- Bad Links Info
-I- No bad link were found
-I---------------------------------------------------
----------------------------------------------------------------
-I- Stages Status Report:
STAGE Errors Warnings
Bad GUIDs/LIDs Check 0 0
Link State Active Check 0 0
General Devices Info Report 0 0
Performance Counters Report 0 0
Partitions Check 0 0
IPoIB Subnets Check 0 1
Please see /var/cache/ibutils/ibdiagnet.log for complete log
----------------------------------------------------------------
-I- Done. Run time was 0 seconds.
[root@central tim]# ibnodes
Ca : 0x0008f104039a5b70 ports 2 "esxi-08 HCA-1"
Ca : 0x0008f104039a6d5c ports 2 "esxi-06 HCA-1"
Ca : 0x0008f104039a65f8 ports 2 "esxi-05 HCA-1"
Ca : 0x0008f104039a5f70 ports 2 "central mlx4_0"
Switch : 0x0002c9020042ead0 ports 36 "Infiniscale-IV Mellanox Technologies" base port 0 lid 2 lmc 0
[root@central tim]# ibv_devinfo
hca_id: mlx4_0
transport: InfiniBand (0)
fw_ver: 2.10.720
node_guid: 0008:f104:039a:5f70
sys_image_guid: 0008:f104:039a:5f73
vendor_id: 0x02c9
vendor_part_id: 26428
hw_ver: 0xB0
board_id: MT_0D81120009
phys_port_cnt: 2
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 4
port_lid: 4
port_lmc: 0x00
link_layer: InfiniBand
port: 2
state: PORT_DOWN (1)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: InfiniBand
[root@central tim]# ibaddr
GID fe80::8:f104:39a:5f71 LID start 0x4 end 0x4
[root@central tim]# iblinkinfo
CA: esxi-08 HCA-1:
0x0008f104039a5b71 20 1[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 15[ ] "Infiniscale-IV Mellanox Technologies" ( )
0x0008f104039a5b72 21 2[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 16[ ] "Infiniscale-IV Mellanox Technologies" ( )
CA: esxi-06 HCA-1:
0x0008f104039a6d5d 12 1[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 11[ ] "Infiniscale-IV Mellanox Technologies" ( )
0x0008f104039a6d5e 13 2[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 12[ ] "Infiniscale-IV Mellanox Technologies" ( )
CA: esxi-05 HCA-1:
0x0008f104039a65f9 8 1[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 9[ ] "Infiniscale-IV Mellanox Technologies" ( )
0x0008f104039a65fa 9 2[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 10[ ] "Infiniscale-IV Mellanox Technologies" ( )
Switch: 0x0002c9020042ead0 Infiniscale-IV Mellanox Technologies:
2 1[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 4 1[ ] "central mlx4_0" ( )
2 2[ ] ==( Down/ Polling)==> [ ] "" ( )
2 3[ ] ==( Down/ Polling)==> [ ] "" ( )
2 4[ ] ==( Down/ Polling)==> [ ] "" ( )
2 5[ ] ==( Down/ Polling)==> [ ] "" ( )
2 6[ ] ==( Down/ Polling)==> [ ] "" ( )
2 7[ ] ==( Down/ Polling)==> [ ] "" ( )
2 8[ ] ==( Down/ Polling)==> [ ] "" ( )
2 9[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 8 1[ ] "esxi-05 HCA-1" ( )
2 10[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 9 2[ ] "esxi-05 HCA-1" ( )
2 11[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 12 1[ ] "esxi-06 HCA-1" ( )
2 12[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 13 2[ ] "esxi-06 HCA-1" ( )
2 13[ ] ==( Down/ Polling)==> [ ] "" ( )
2 14[ ] ==( Down/ Polling)==> [ ] "" ( )
2 15[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 20 1[ ] "esxi-08 HCA-1" ( )
2 16[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 21 2[ ] "esxi-08 HCA-1" ( )
2 17[ ] ==( Down/ Polling)==> [ ] "" ( )
2 18[ ] ==( Down/ Polling)==> [ ] "" ( )
2 19[ ] ==( Down/ Polling)==> [ ] "" ( )
2 20[ ] ==( Down/ Polling)==> [ ] "" ( )
2 21[ ] ==( Down/ Polling)==> [ ] "" ( )
2 22[ ] ==( Down/ Polling)==> [ ] "" ( )
2 23[ ] ==( Down/ Polling)==> [ ] "" ( )
2 24[ ] ==( Down/ Polling)==> [ ] "" ( )
2 25[ ] ==( Down/ Polling)==> [ ] "" ( )
2 26[ ] ==( Down/ Polling)==> [ ] "" ( )
2 27[ ] ==( Down/ Polling)==> [ ] "" ( )
2 28[ ] ==( Down/ Polling)==> [ ] "" ( )
2 29[ ] ==( Down/ Polling)==> [ ] "" ( )
2 30[ ] ==( Down/ Polling)==> [ ] "" ( )
2 31[ ] ==( Down/ Polling)==> [ ] "" ( )
2 32[ ] ==( Down/ Polling)==> [ ] "" ( )
2 33[ ] ==( Down/ Polling)==> [ ] "" ( )
2 34[ ] ==( Down/ Polling)==> [ ] "" ( )
2 35[ ] ==( Down/ Polling)==> [ ] "" ( )
2 36[ ] ==( Down/ Polling)==> [ ] "" ( )
CA: central mlx4_0:
0x0008f104039a5f71 4 1[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==> 2 1[ ] "Infiniscale-IV Mellanox Technologies" ( )
[root@central tim]#
---WAIT 5-10 MINUTES---
[root@central tim]# iblinkinfo
ibwarn: [1832] mad_rpc_open_port: can't open UMAD port ((null):0)
Failed to open (null) port 0
[root@central tim]# ibv_devinfo
No IB devices found