apache
diff --git a/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/EnhancedHeadroom.java‎
Lines changed: 5 additions & 0 deletions b/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/EnhancedHeadroom.java‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java‎
Lines changed: 39 additions & 0 deletions b/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml‎
Lines changed: 80 additions & 0 deletions b/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/AMRMClientRelayer.java‎
Lines changed: 49 additions & 6 deletions b/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/AMRMClientRelayer.java‎
Lines changed: 49 additions & 6 deletions
diff --git a/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/ContainerAllocationHistory.java‎
Lines changed: 69 additions & 0 deletions b/‎hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/ContainerAllocationHistory.java‎
Lines changed: 69 additions & 0 deletions
@@ -69,4 +69,9 @@ public String toString() {
  sb.append(">");
  return sb.toString();
  }
+
+ /**
+  * Scales the total pending count by the given multiplier.
+  *
+  * <p>Only the multiplication is done here; nothing in this method divides
+  * by a core count.
+  *
+  * @param multiplier factor applied to the total pending count
+  * @return the total pending count times {@code multiplier}, as a double
+  */
+ public double getNormalizedPendingCount(long multiplier) {
+   // Widen to double first so the product is computed in floating point.
+   final double pending = getTotalPendingCount();
+   return pending * multiplier;
+ }
 }
@@ -4058,6 +4058,45 @@ public static boolean isAclEnabled(Configuration conf) {
  public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
  60000; // one minute
 
+ // Prefix for configs related to selecting a subCluster (SC) based on load
+ public static final String LOAD_BASED_SC_SELECTOR_PREFIX =
+ NM_PREFIX + "least-load-policy-selector.";
+
+ // Config to enable rerouting of node requests based on SC load
+ public static final String LOAD_BASED_SC_SELECTOR_ENABLED =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "enabled";
+ public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_ENABLED = false;
+
+ // Pending container threshold used when selecting an SC
+ public static final String LOAD_BASED_SC_SELECTOR_THRESHOLD =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "pending-container.threshold";
+ public static final int DEFAULT_LOAD_BASED_SC_SELECTOR_THRESHOLD = 10000;
+
+ // Whether to consider total number of active cores in the subcluster for load
+ public static final String LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "use-active-core";
+ public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_USE_ACTIVE_CORE = false;
+
+ // Multiplier to normalize the pending container count against active cores
+ public static final String LOAD_BASED_SC_SELECTOR_MULTIPLIER =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "multiplier";
+ public static final int DEFAULT_LOAD_BASED_SC_SELECTOR_MULTIPLIER = 50000;
+
+ // Max number of entries to keep in the container allocation history
+ public static final String FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY =
+ FEDERATION_PREFIX + "amrmproxy.allocation.history.max.entry";
+ public static final int DEFAULT_FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY = 100;
+
+ // Whether to fail directly if activeSubCluster is less than 1.
+ public static final String LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "fail-on-error";
+ public static final boolean DEFAULT_LOAD_BASED_SC_SELECTOR_FAIL_ON_ERROR = true;
+
+ // Blacklisted subClusters (the key is under the load-based selector prefix).
+ public static final String FEDERATION_BLACKLIST_SUBCLUSTERS =
+ LOAD_BASED_SC_SELECTOR_PREFIX + "blacklist-subclusters";
+ public static final String DEFAULT_FEDERATION_BLACKLIST_SUBCLUSTERS = "";
+
  // AMRMProxy Register UAM Retry-Num
  public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
  FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
 
@@ -5558,4 +5558,84 @@
  <value>0.0.0.0:8070</value>
  </property>
 
+ <property>
+ <description>
+ This configuration will enable request rerouting according to the load of the subCluster.
+ If it is true, it will reroute the request according to the load of the subCluster.
+ The default configuration is false.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.enabled</name>
+ <value>false</value>
+ </property>
+
+ <property>
+ <description>
+ SubCluster pending container threshold. The default value is 10000.
+ This configuration will determine the load weight of a subCluster.
+ For a subCluster whose pending container count is greater than threshold / 2,
+ threshold / pending is used as the weight.
+ For a subCluster whose pending container count is less than threshold / 2, the weight is capped at 2.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.pending-container.threshold</name>
+ <value>10000</value>
+ </property>
+
+ <property>
+ <description>
+ Whether to consider the configured vcores when calculating the subCluster load.
+ The default value is false, in which case only the number of pending containers is considered.
+ If this configuration item is set to true, it needs to be used together
+ with yarn.nodemanager.least-load-policy-selector.multiplier. The following formula is used
+ when calculating the pending containers of a subCluster:
+ pendingContainersCountNormalize = (totalPendingContainersCount * multiplier) / totalActiveCores.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.use-active-core</name>
+ <value>false</value>
+ </property>
+
+ <property>
+ <description>
+ Max count to maintain for container allocation history.
+ </description>
+ <name>yarn.federation.amrmproxy.allocation.history.max.entry</name>
+ <value>100</value>
+ </property>
+
+ <property>
+ <description>
+ Whether to fail directly if the number of active subClusters is less than 1.
+ The default value is true.
+ If set to false, an attempt is made to re-fetch the list of active subClusters.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.fail-on-error</name>
+ <value>true</value>
+ </property>
+
+ <property>
+ <description>
+ Requests will not be forwarded to the subClusters configured in the blacklist.
+ The default value is empty.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.blacklist-subclusters</name>
+ <value></value>
+ </property>
+
+ <property>
+ <description>
+ Multiplier used to help normalize the pendingContainersCount.
+ This configuration is used
+ when yarn.nodemanager.least-load-policy-selector.use-active-core is set to true.
+ </description>
+ <name>yarn.nodemanager.least-load-policy-selector.multiplier</name>
+ <value>50000</value>
+ </property>
+
 </configuration>
@@ -28,6 +28,7 @@
 import java.util.TreeSet;
 
 import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ipc.RPC;
 import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
 import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
@@ -132,16 +133,23 @@ public class AMRMClientRelayer implements ApplicationMasterProtocol {
 
  private AMRMClientRelayerMetrics metrics;
 
+ private ContainerAllocationHistory allocationHistory;
+
  public AMRMClientRelayer(ApplicationMasterProtocol rmClient,
  ApplicationId appId, String rmId) {
  this.resetResponseId = -1;
  this.metrics = AMRMClientRelayerMetrics.getInstance();
- this.rmId = "";
  this.rmClient = rmClient;
  this.appId = appId;
  this.rmId = rmId;
  }
 
+ /**
+  * Creates a relayer that also keeps a container allocation history.
+  *
+  * @param rmClient passed through to the three-argument constructor
+  * @param appId passed through to the three-argument constructor
+  * @param rmId passed through to the three-argument constructor
+  * @param conf configuration handed to the {@link ContainerAllocationHistory}
+  */
+ public AMRMClientRelayer(
+     ApplicationMasterProtocol rmClient,
+     ApplicationId appId,
+     String rmId,
+     Configuration conf) {
+   this(rmClient, appId, rmId);
+   allocationHistory = new ContainerAllocationHistory(conf);
+ }
+
  public void setAMRegistrationRequest(
  RegisterApplicationMasterRequest registerRequest) {
  this.amRegistrationRequest = registerRequest;
@@ -444,6 +452,8 @@ private void updateMetrics(AllocateResponse allocateResponse,
  if (this.knownContainers.add(container.getId())) {
  this.metrics.addFulfilledQPS(this.rmId, AMRMClientRelayerMetrics
  .getRequestType(container.getExecutionType()), 1);
+ long currentTime = System.currentTimeMillis();
+ long fulfillLatency = -1;
  if (container.getAllocationRequestId() != 0) {
  Integer count = this.pendingCountForMetrics
  .get(container.getAllocationRequestId());
@@ -453,13 +463,14 @@ private void updateMetrics(AllocateResponse allocateResponse,
  this.metrics.decrClientPending(this.rmId,
  AMRMClientRelayerMetrics
  .getRequestType(container.getExecutionType()), 1);
- this.metrics.addFulfillLatency(this.rmId,
- AMRMClientRelayerMetrics
-  .getRequestType(container.getExecutionType()),
- System.currentTimeMillis() - this.askTimeStamp
-  .get(container.getAllocationRequestId()));
+ fulfillLatency = currentTime - this.askTimeStamp.get(
+ container.getAllocationRequestId());
+ AMRMClientRelayerMetrics.RequestType requestType = AMRMClientRelayerMetrics
+ .getRequestType(container.getExecutionType());
+ this.metrics.addFulfillLatency(this.rmId, requestType, fulfillLatency);
  }
  }
+ addAllocationHistoryEntry(container, currentTime, fulfillLatency);
  }
  }
  }
@@ -576,6 +587,38 @@ private void addResourceRequestToAsk(ResourceRequest remoteRequest) {
  this.ask.add(remoteRequest);
  }
 
+ /**
+  * Returns the container allocation history held by this relayer.
+  *
+  * @return the history object; the field is only assigned by the constructor
+  *         that takes a {@code Configuration}
+  */
+ public ContainerAllocationHistory getAllocationHistory() {
+   return allocationHistory;
+ }
+
+ /**
+  * Records a newly seen container in the allocation history.
+  *
+  * <p>Nothing is recorded when this relayer has no allocation history, or when
+  * no key in the remote pending asks matches the container.
+  *
+  * @param container the container to record
+  * @param fulfillTimeStamp time at which the container was observed
+  * @param fulfillLatency time elapsed since the ask, as computed by the caller
+  */
+ private void addAllocationHistoryEntry(Container container, long fulfillTimeStamp,
+     long fulfillLatency) {
+   if (this.allocationHistory == null) {
+     // The history is only created by the constructor that takes a
+     // Configuration; without it, skip tracking rather than throw an NPE.
+     return;
+   }
+   ResourceRequestSetKey key = ResourceRequestSetKey.extractMatchingKey(container,
+       this.remotePendingAsks.keySet());
+   if (key == null) {
+     LOG.info("allocation history ignoring {}, no matching request key found.", container);
+     return;
+   }
+   this.allocationHistory.addAllocationEntry(container, this.remotePendingAsks.get(key),
+       fulfillTimeStamp, fulfillLatency);
+ }
+
+ /**
+  * Copies the remote pending asks, and how long each has been pending, into
+  * the maps supplied by the caller.
+  *
+  * <p>Both maps are cleared first. A key whose allocation request id has no
+  * recorded ask timestamp gets no entry in {@code pendingTime}.
+  *
+  * @param pendingAsks output map, filled with the remote pending asks
+  * @param pendingTime output map, filled with elapsed milliseconds per key
+  */
+ public void gatherReadOnlyPendingAsksInfo(Map<ResourceRequestSetKey,
+     ResourceRequestSet> pendingAsks, Map<ResourceRequestSetKey, Long> pendingTime) {
+   pendingAsks.clear();
+   pendingTime.clear();
+   synchronized (this) {
+     pendingAsks.putAll(this.remotePendingAsks);
+     for (ResourceRequestSetKey askKey : pendingAsks.keySet()) {
+       Long askedAt = this.askTimeStamp.get(askKey.getAllocationRequestId());
+       if (askedAt == null) {
+         continue;
+       }
+       pendingTime.put(askKey, System.currentTimeMillis() - askedAt);
+     }
+   }
+ }
+
  @VisibleForTesting
  protected Map<ResourceRequestSetKey, ResourceRequestSet>
  getRemotePendingAsks() {
 
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server;
+
+import java.util.AbstractMap;
+import java.util.LinkedList;
+import java.util.Map.Entry;
+import java.util.Queue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Records the allocation history from YarnRM and provides aggregated insights.
+ */
+public class ContainerAllocationHistory {
+  // Log under this class's own name so messages are attributed correctly.
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ContainerAllocationHistory.class);
+
+  // Maximum number of entries kept in the history queue.
+  private final int maxEntryCount;
+
+  // Allocate timing history <AllocateTimeStamp, AllocateLatency>
+  private final Queue<Entry<Long, Long>> relaxableG = new LinkedList<>();
+
+  /**
+   * Creates a history whose capacity is read from the configuration.
+   *
+   * @param conf configuration supplying
+   *     {@link YarnConfiguration#FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY}
+   */
+  public ContainerAllocationHistory(Configuration conf) {
+    this.maxEntryCount = conf.getInt(
+        YarnConfiguration.FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY,
+        YarnConfiguration.DEFAULT_FEDERATION_ALLOCATION_HISTORY_MAX_ENTRY);
+  }
+
+  /**
+   * Record the allocation history for the container.
+   *
+   * <p>Entries are only kept when the request set reports ANY as relaxable.
+   * Once the queue holds more than the configured maximum, the oldest entry
+   * is dropped.
+   *
+   * @param container to add record for
+   * @param requestSet resource request ask set
+   * @param fulfillTimeStamp time at which allocation happened
+   * @param fulfillLatency time elapsed in allocating since asked
+   */
+  public synchronized void addAllocationEntry(Container container,
+      ResourceRequestSet requestSet, long fulfillTimeStamp, long fulfillLatency) {
+    if (!requestSet.isANYRelaxable()) {
+      LOG.info("allocation history ignoring {}, relax locality is false", container);
+      return;
+    }
+    this.relaxableG.add(new AbstractMap.SimpleEntry<>(
+        fulfillTimeStamp, fulfillLatency));
+    if (this.relaxableG.size() > this.maxEntryCount) {
+      // One entry is added per call, so removing the head keeps the bound.
+      this.relaxableG.remove();
+    }
+  }
+}
Original file line number	Diff line number	Diff line change
`@@ -69,4 +69,9 @@ public String toString() {`
`69`	`69`	`sb.append(">");`
`70`	`70`	`return sb.toString();`
`71`	`71`	`}`
	`72`	`+`
	`73`	`+ public double getNormalizedPendingCount(long multiplier) {`
	`74`	`+ int totalPendingCount = getTotalPendingCount();`
	`75`	`+ return (double) totalPendingCount * multiplier;`
	`76`	`+ }`
`72`	`77`	`}`