ad02184da723198123c198ba4dc9a3e96b1505dd
java/com.sap.sailing.landscape/src/com/sap/sailing/landscape/impl/ArchiveCandidateMonitoringBackgroundTask.java
| ... | ... | @@ -101,6 +101,7 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 101 | 101 | private final static Duration LONG_TIMEOUT = Duration.ONE_DAY.times(3); |
| 102 | 102 | private final static double MAXIMUM_ONE_MINUTE_SYSTEM_LOAD_AVERAGE = 2.0; |
| 103 | 103 | private final static int MAXIMUM_THREAD_POOL_QUEUE_SIZE = 10; |
| 104 | + private final static Optional<Duration> TIMEOUT_FIRST_CONTACT = Optional.of(Landscape.WAIT_FOR_PROCESS_TIMEOUT.get().plus(Landscape.WAIT_FOR_HOST_TIMEOUT.get())); |
|
| 104 | 105 | private final static Duration SERVER_COMPARISON_TIMEOUT = Duration.ONE_MINUTE.times(10); // good for two or three attempts, usually |
| 105 | 106 | private final static Duration DELAY_BETWEEN_COMPARISON_CHECKS = Duration.ONE_MINUTE; |
| 106 | 107 | |
| ... | ... | @@ -183,7 +184,9 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 183 | 184 | logger.severe("Check "+currentCheck+" failed and has timed out; giving up on candidate "+replicaSet.getMaster().getHost().getHostname()); |
| 184 | 185 | notifyProcessOwnerCandidateFailedToBecomeReadyForProduction(); // this ends the re-scheduling loop |
| 185 | 186 | } else { |
| 186 | - logger.info("Check "+currentCheck+" failed but has not yet timed out; re-scheduling to check again after "+currentCheck.getDelayAfterFailure()); |
|
| 187 | + logger.info("Check " + currentCheck + " failed with message \"" + currentCheck.getLastFailureMessage() |
|
| 188 | + + "\" but has not yet timed out; re-scheduling to check again after " |
|
| 189 | + + currentCheck.getDelayAfterFailure()); |
|
| 187 | 190 | executor.schedule(this, currentCheck.getDelayAfterFailure().asMillis(), TimeUnit.MILLISECONDS); |
| 188 | 191 | } |
| 189 | 192 | } |
| ... | ... | @@ -192,14 +195,14 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 192 | 195 | private static final long serialVersionUID = -4265303532881568290L; |
| 193 | 196 | |
| 194 | 197 | private IsReady() { |
| 195 | - super("is ready", LONG_TIMEOUT, DELAY_BETWEEN_CHECKS); |
|
| 198 | + super("is ready", TIMEOUT_FIRST_CONTACT.get(), DELAY_BETWEEN_CHECKS); |
|
| 196 | 199 | } |
| 197 | 200 | |
| 198 | 201 | @Override |
| 199 | 202 | public boolean runCheck() throws Exception { |
| 200 | 203 | final boolean result = replicaSet.getMaster().isReady(Landscape.WAIT_FOR_PROCESS_TIMEOUT); |
| 201 | 204 | if (!result) { |
| 202 | - setLastFailureMessage("Candidate is not ready yet"); |
|
| 205 | + setLastFailureMessage("Candidate at "+replicaSet.getMaster().getHost().getPrivateAddress()+" not ready yet"); |
|
| 203 | 206 | } |
| 204 | 207 | return result; |
| 205 | 208 | } |
| ... | ... | @@ -217,8 +220,9 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 217 | 220 | final double lastMinuteSystemLoadAverage = replicaSet.getMaster().getLastMinuteSystemLoadAverage(Landscape.WAIT_FOR_PROCESS_TIMEOUT); |
| 218 | 221 | final boolean result = lastMinuteSystemLoadAverage < MAXIMUM_ONE_MINUTE_SYSTEM_LOAD_AVERAGE; |
| 219 | 222 | if (!result) { |
| 220 | - setLastFailureMessage("Candidate has too high system load average of "+lastMinuteSystemLoadAverage+ |
|
| 221 | - " which is still above the maximum of "+MAXIMUM_ONE_MINUTE_SYSTEM_LOAD_AVERAGE); |
|
| 223 | + setLastFailureMessage("Candidate at " + replicaSet.getMaster().getHost().getPrivateAddress() |
|
| 224 | + + " has too high system load average of " + lastMinuteSystemLoadAverage |
|
| 225 | + + " which is still above the maximum of " + MAXIMUM_ONE_MINUTE_SYSTEM_LOAD_AVERAGE); |
|
| 222 | 226 | } |
| 223 | 227 | return result; |
| 224 | 228 | } |
| ... | ... | @@ -236,7 +240,8 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 236 | 240 | final int defaultBackgroundThreadPoolExecutorQueueSize = replicaSet.getMaster().getDefaultBackgroundThreadPoolExecutorQueueSize(Landscape.WAIT_FOR_PROCESS_TIMEOUT); |
| 237 | 241 | final boolean result = defaultBackgroundThreadPoolExecutorQueueSize < MAXIMUM_THREAD_POOL_QUEUE_SIZE; |
| 238 | 242 | if (!result) { |
| 239 | - setLastFailureMessage("Candidate has too many tasks in default background thread pool executor queue: "+defaultBackgroundThreadPoolExecutorQueueSize+ |
|
| 243 | + setLastFailureMessage("Candidate at " + replicaSet.getMaster().getHost().getPrivateAddress() |
|
| 244 | + + " has too many tasks in default background thread pool executor queue: "+defaultBackgroundThreadPoolExecutorQueueSize+ |
|
| 240 | 245 | " which is still above the maximum of "+MAXIMUM_THREAD_POOL_QUEUE_SIZE); |
| 241 | 246 | } |
| 242 | 247 | return result; |
| ... | ... | @@ -255,7 +260,8 @@ public class ArchiveCandidateMonitoringBackgroundTask implements Runnable { |
| 255 | 260 | final int defaultForegroundThreadPoolExecutorQueueSize = replicaSet.getMaster().getDefaultForegroundThreadPoolExecutorQueueSize(Landscape.WAIT_FOR_PROCESS_TIMEOUT); |
| 256 | 261 | final boolean result = defaultForegroundThreadPoolExecutorQueueSize < MAXIMUM_THREAD_POOL_QUEUE_SIZE; |
| 257 | 262 | if (!result) { |
| 258 | - setLastFailureMessage("Candidate has too many tasks in default foreground thread pool executor queue: "+defaultForegroundThreadPoolExecutorQueueSize+ |
|
| 263 | + setLastFailureMessage("Candidate at "+replicaSet.getMaster().getHost().getPrivateAddress() |
|
| 264 | + + " has too many tasks in default foreground thread pool executor queue: "+defaultForegroundThreadPoolExecutorQueueSize+ |
|
| 259 | 265 | " which is still above the maximum of "+MAXIMUM_THREAD_POOL_QUEUE_SIZE); |
| 260 | 266 | } |
| 261 | 267 | return result; |