Commit fda14d8

InfUdDriver: obtain network bandwidth properly by querying the port
1 parent 37f0d65 commit fda14d8

3 files changed, 70 insertions(+), 18 deletions(-)

src/InfUdDriver.cc

Lines changed: 22 additions & 16 deletions

@@ -98,22 +98,7 @@ InfUdDriver::InfUdDriver(Context* context, const ServiceLocator *sl,
     , qpn(0)
     , localMac()
     , locatorString("infud:")
-    // As of 11/2018, the network bandwidth of our RC machines at Stanford is
-    // actually limited by the effective bandwidth of PCIe 2.0x4, which should
-    // be ~29Gbps when taking into account the overhead of PCIe headers.
-    // For example, suppose the HCA's MTU is 256B and the PCIe headers are 24B
-    // in total; the effective bandwidth of PCIe 2.0x4 is then
-    //     32Gbps * 256 / (256 + 24) = 29.25Gbps
-    // Unfortunately, it appears that our ConnectX-2 HCA somehow cannot fully
-    // utilize the 29Gbps PCIe bandwidth when sending UD packets. This can be
-    // verified by running one or more ib_send_bw programs on two machines.
-    // The maximum outgoing bandwidth we can achieve in practice is ~3020MB/s,
-    // or 23.6Gbps. Note that we need to set the outgoing bandwidth slightly
-    // higher than 24Gbps in order to saturate the 23.6Gbps outgoing bandwidth.
-    // This is because the throughput of the HCA has non-negligible variation:
-    // when it's running faster than 24Gbps, we don't want the transport to
-    // throttle the throughput and leave the HCA idle.
-    , bandwidthGbps(26) // Default outgoing bandwidth in Gbps
+    , bandwidthGbps(~0u)
     , zeroCopyStart(NULL)
     , zeroCopyEnd(NULL)
     , zeroCopyRegion(NULL)
@@ -146,6 +131,27 @@ InfUdDriver::InfUdDriver(Context* context, const ServiceLocator *sl,
         } catch (ServiceLocator::NoSuchKeyException& e) {}
     }
     infiniband = realInfiniband.construct(ibDeviceName);
+#if 1
+    bandwidthGbps = std::min(bandwidthGbps,
+            static_cast<int>(infiniband->getBandwidthGbps(ibPhysicalPort)));
+#else
+    // As of 11/2018, the network bandwidth of our RC machines at Stanford is
+    // actually limited by the effective bandwidth of PCIe 2.0x4, which should
+    // be ~29Gbps when taking into account the overhead of PCIe headers.
+    // For example, suppose the HCA's MTU is 256B and the PCIe headers are 24B
+    // in total; the effective bandwidth of PCIe 2.0x4 is then
+    //     32Gbps * 256 / (256 + 24) = 29.25Gbps
+    // Unfortunately, it appears that our ConnectX-2 HCA somehow cannot fully
+    // utilize the 29Gbps PCIe bandwidth when sending UD packets. This can be
+    // verified by running one or more ib_send_bw programs on two machines.
+    // The maximum outgoing bandwidth we can achieve in practice is ~3020MB/s,
+    // or 23.6Gbps. Note that we need to set the outgoing bandwidth slightly
+    // higher than 24Gbps in order to saturate the 23.6Gbps outgoing bandwidth.
+    // This is because the throughput of the HCA has non-negligible variation:
+    // when it's running faster than 24Gbps, we don't want the transport to
+    // throttle the throughput and leave the HCA idle.
+    bandwidthGbps = std::min(bandwidthGbps, 26);
+#endif
     mtu = infiniband->getMtu(ibPhysicalPort);
     queueEstimator.setBandwidth(1000*bandwidthGbps);
     maxTransmitQueueSize = (uint32_t) (static_cast<double>(bandwidthGbps)
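
The PCIe arithmetic in the (now disabled) comment is easy to check. Below is a minimal standalone sketch, not part of the commit, using only the figures the comment states: a 32 Gbps raw PCIe 2.0x4 rate, a 256 B HCA MTU, and 24 B of PCIe header overhead per transaction.

    #include <cstdio>

    int main()
    {
        // Effective bandwidth = raw bandwidth * payload / (payload + headers);
        // all figures are taken from the comment in InfUdDriver's constructor.
        const double rawGbps = 32.0;        // PCIe 2.0x4 raw rate
        const double payloadBytes = 256.0;  // HCA MTU
        const double headerBytes = 24.0;    // total PCIe header overhead
        double effectiveGbps = rawGbps * payloadBytes / (payloadBytes + headerBytes);
        // Prints ~29.26 Gbps, matching the ~29 Gbps figure in the comment.
        printf("effective PCIe 2.0x4 bandwidth: %.2f Gbps\n", effectiveGbps);
        return 0;
    }

With this change the hard-coded 26 Gbps default is gone: the driver takes the smaller of the configured bandwidth and the value actually reported by the port, via the new Infiniband::getBandwidthGbps() below.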

src/Infiniband.cc

Lines changed: 46 additions & 0 deletions

@@ -140,6 +140,52 @@ Infiniband::getLid(int port)
     return ipa.lid;
 }
 
+/**
+ * Obtain the link speed on this port to transmit and receive.
+ *
+ * \param[in] port
+ *      Port on the device we're looking up. This value is typically 1, except
+ *      on adapters with multiple physical ports.
+ * \return
+ *      The bandwidth, in Gbps.
+ * \throw
+ *      TransportException if the port cannot be queried.
+ */
+uint32_t
+Infiniband::getBandwidthGbps(int port)
+{
+    ibv_port_attr ipa;
+    int ret = ibv_query_port(device.ctxt, downCast<uint8_t>(port), &ipa);
+    if (ret) {
+        RAMCLOUD_LOG(ERROR, "ibv_query_port failed on port %d", port);
+        throw TransportException(HERE, ret);
+    }
+    // The meaning of fields active_width and active_speed can be found at:
+    // https://www.rdmamojo.com/2012/07/21/ibv_query_port/
+    uint32_t gbps = 1;
+    switch (ipa.active_width) {
+        case 1: gbps = 1; break;
+        case 2: gbps = 4; break;
+        case 4: gbps = 8; break;
+        case 8: gbps = 12; break;
+        default:
+            LOG(ERROR, "unexpected active width %u on port %d",
+                    ipa.active_width, port);
+    }
+    switch (ipa.active_speed) {
+        case 1: gbps = gbps*5/2; break;
+        case 2: gbps *= 5; break;
+        case 4:
+        case 8: gbps *= 10; break;
+        case 16: gbps *= 14; break;
+        case 32: gbps *= 25; break;
+        default:
+            LOG(ERROR, "unexpected active speed %u on port %d",
+                    ipa.active_speed, port);
+    }
+    return gbps;
+}
+
 /**
  * Obtain the MTU enabled on this port to transmit and receive.
  *
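
To make the decoding concrete: active_width selects the lane count (1X/4X/8X/12X), active_speed selects the per-lane signaling rate, and the link bandwidth is their product. Below is a hypothetical standalone restatement, not part of the commit; the decodeGbps name and the SDR/DDR/QDR/FDR/EDR labels are annotations based on the rdmamojo page cited in the code.

    #include <cstdint>
    #include <cstdio>

    // Decode ibv_port_attr's active_width/active_speed into Gbps,
    // mirroring the logic of Infiniband::getBandwidthGbps() above.
    static uint32_t decodeGbps(uint8_t activeWidth, uint8_t activeSpeed)
    {
        uint32_t lanes = 1;
        switch (activeWidth) {              // lane count
            case 1: lanes = 1; break;       // 1X
            case 2: lanes = 4; break;       // 4X
            case 4: lanes = 8; break;       // 8X
            case 8: lanes = 12; break;      // 12X
        }
        uint32_t gbps = lanes;
        switch (activeSpeed) {              // per-lane data rate
            case 1: gbps = gbps*5/2; break; // SDR, 2.5 Gbps
            case 2: gbps *= 5; break;       // DDR, 5 Gbps
            case 4:                         // QDR, 10 Gbps
            case 8: gbps *= 10; break;      // FDR10, 10 Gbps
            case 16: gbps *= 14; break;     // FDR, 14 Gbps
            case 32: gbps *= 25; break;     // EDR, 25 Gbps
        }
        return gbps;
    }

    int main()
    {
        // A 4X QDR link reports active_width == 2 and active_speed == 4,
        // so the result is 4 lanes * 10 Gbps = 40 Gbps.
        printf("4X QDR: %u Gbps\n", decodeGbps(2, 4));
        return 0;
    }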

src/Infiniband.h

Lines changed: 2 additions & 2 deletions

@@ -456,8 +456,8 @@ class Infiniband {
 
     int
     getLid(int port);
-    uint32_t
-    getMtu(int port);
+    uint32_t getBandwidthGbps(int port);
+    uint32_t getMtu(int port);
 
     BufferDescriptor*
     tryReceive(QueuePair* qp, Tub<Address>* sourceAddress = NULL);
