docker/Dockerfile_windestimation
... ...
@@ -13,4 +13,4 @@ RUN wget -O /tmp/rds.pem https://s3.amazonaws.com/rds-downloads/rds-combined-ca-
13 13
&& /opt/sapjvm_8/bin/keytool -importcert -alias AWSRDS -file /tmp/rds.pem -keystore /opt/sapjvm_8/jre/lib/security/cacerts -noprompt -storepass changeit \
14 14
&& rm /tmp/rds.pem
15 15
RUN wget -O /home/sailing/WindEstimationModelsTraining.jar https://static.sapsailing.com/WindEstimationModelsTraining.jar
16
-CMD exec java "${MEMORY}" -Dmongo.uri="${MONGODB_URI}" -XX:UseParallelGC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=dump/ -Xlog:gc*=info,gc+region*=info,gc+ergo*=info,gc+humongous*=info,gc+liveness=trace:file=logs/gc.log:time,level,tags:filecount=10,filesize=100000000 -jar WindEstimationModelsTraining.jar "${BEARER_TOKEN}" ${TRAINING_DATA_PERCENT}
16
+CMD exec java "${MEMORY}" -Dmongo.uri="${MONGODB_URI}" -XX:+UseParallelGC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=dump/ -Xlog:gc*=info,gc+region*=info,gc+ergo*=info,gc+humongous*=info,gc+liveness=trace:file=logs/gc.log:time,level,tags:filecount=10,filesize=100000000 -jar WindEstimationModelsTraining.jar "${BEARER_TOKEN}" ${TRAINING_DATA_PERCENT}
java/com.sap.sailing.polars/src/com/sap/sailing/polars/regression/impl/IncrementalAnyOrderLeastSquaresImpl.java
... ...
@@ -95,10 +95,10 @@ public class IncrementalAnyOrderLeastSquaresImpl implements IncrementalLeastSqua
95 95
}
96 96
97 97
public IncrementalAnyOrderLeastSquaresImpl(int polynomialOrder, boolean hasIntercept,
98
- boolean useSymbollicInversionIfPossible) {
98
+ boolean useSymbolicInversionIfPossible) {
99 99
this.hasIntercept = hasIntercept;
100 100
this.polynomialOrder = polynomialOrder;
101
- this.useSymbollicInversionIfPossible = useSymbollicInversionIfPossible;
101
+ this.useSymbollicInversionIfPossible = useSymbolicInversionIfPossible;
102 102
if (hasIntercept) {
103 103
matrixOfXSums = new double[polynomialOrder + 1][polynomialOrder + 1];
104 104
vectorOfXYMultSums = new double[polynomialOrder + 1];
java/com.sap.sailing.windestimation.lab/SimpleModelsTrainingPart2.launch
... ...
@@ -1,13 +1,15 @@
1 1
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 2
<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
3
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
4
-<listEntry value="/com.sap.sailing.windestimation.lab/src/com/sap/sailing/windestimation/model/SimpleModelsTrainingPart2.java"/>
5
-</listAttribute>
6
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
7
-<listEntry value="1"/>
8
-</listAttribute>
9
-<booleanAttribute key="org.eclipse.jdt.launching.ATTR_EXCLUDE_TEST_CODE" value="true"/>
10
-<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="com.sap.sailing.windestimation.model.SimpleModelsTrainingPart2"/>
11
-<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="com.sap.sailing.windestimation.lab"/>
12
-<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea -Dmongo.port=10202 -Dmongo.dbName=windestimation"/>
3
+ <booleanAttribute key="org.eclipse.debug.core.ATTR_FORCE_SYSTEM_CONSOLE_ENCODING" value="false"/>
4
+ <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
5
+ <listEntry value="/com.sap.sailing.windestimation.lab/src/com/sap/sailing/windestimation/model/SimpleModelsTrainingPart2.java"/>
6
+ </listAttribute>
7
+ <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
8
+ <listEntry value="1"/>
9
+ </listAttribute>
10
+ <booleanAttribute key="org.eclipse.jdt.launching.ATTR_EXCLUDE_TEST_CODE" value="true"/>
11
+ <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="com.sap.sailing.windestimation.model.SimpleModelsTrainingPart2"/>
12
+ <stringAttribute key="org.eclipse.jdt.launching.MODULE_NAME" value="com.sap.sailing.windestimation.lab"/>
13
+ <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="com.sap.sailing.windestimation.lab"/>
14
+ <stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea -Dmongo.port=10200 -Dmongo.dbName=windestimation"/>
13 15
</launchConfiguration>
java/com.sap.sailing.windestimation.test/resources/trained_wind_estimation_models/SERIALIZATION.modelForDurationBasedTwdDeltaStdRegressor.IncrementalSingleDimensionPolynomialRegressor.DurationBasedTwdTransitionRegressorFrom0.0To1.0.clf
wiki/info/landscape/olympic-plan-for-paris-marseilles-2024.md
... ...
@@ -0,0 +1,49 @@
1
+# Thoughts on Landscape Configuration for Paris 2024 / Marseille
2
+
3
+As a baseline we'll use the [Olympic Setup](/wiki/info/landscape/olympic-setup). The major change, though, would be that instead of running a local on-site master and a local on-site replica we would run two master instances locally on site where one is the "shadow" and the other one is the "production" master.
4
+
5
+## Master and Shadow Master
6
+
7
+We will use one laptop as production master, the other as "shadow master." The reason for not using a master and a local replica is that if the local master fails, re-starting later in the event can cause significant delays until all races have loaded and replicated again.
8
+
9
+Both laptops shall run their local RabbitMQ instance. Each of the two master processes can optionally write into its local RabbitMQ through an SSH tunnel which may instead redirect to the cloud-based RabbitMQ for an active Internet/Cloud connection.
10
+
11
+This will require setting up two MongoDB databases (not separate processes, just different DB names).
12
+
13
+Note: The shadow master must have at least one registered replica because otherwise it would not send any operations into the RabbitMQ replication channel. This can be a challenge for a shadow master that has never seen any replica. We could, for example, simulate a replica registration when the shadow master is still basically empty, using, e.g., a CURL request and then ignoring and later deleting the initial load queue on the local RabbitMQ.
14
+
15
+Furthermore, the shadow master must not send into the production RabbitMQ replication channel that is used by the production master instance while it is not in production itself, because it would duplicate the operations sent. Instead, the shadow master shall use a local RabbitMQ instance to which an SSH tunnel forwards.
16
+
17
+## Switching
18
+
19
+### Production Master Failure
20
+
21
+Situation: production master fails, e.g., because of a Java VM crash or a deadlock or user issues such as killing the wrong process...
22
+
23
+Approach: Switch to previous shadow master, re-configuring all SSH tunnels accordingly; this includes the 8888 reverse forward from the cloud to the local on-site master, as well as the RabbitMQ forward which needs to switch from the local RabbitMQ running on the shadow master's host to the cloud-based RabbitMQ. Clients such as SwissTiming clients need to switch to the shadow master. To remedy gaps in replication due to the SSH tunnel switch we may want to circulate the replica instances, rolling over to a new set of replicas that fetch a new initial load.
24
+
25
+### Internet Failure
26
+
27
+As in the Tokyo 2020 scenario; in particular, the local security service must be started which will work off a regularly updated local MongoDB copy of the cloud-based security-service.sapsailing.com; this also requires adjusting /etc/hosts and the tunnels accordingly.
28
+
29
+## SSH Tunnels
30
+
31
+TBD; baseline is again the Tokyo 2020 set-up.
32
+
33
+## Test Plan for Test Event Marseille July 2023
34
+
35
+### Test Internet Failure
36
+
37
+We shall emulate the lack of a working Internet connection and practice and test the procedures for switching to a local security-service.sapsailing.com installation as well as a local RabbitMQ standing in for the RabbitMQ deployed in the cloud.
38
+
39
+### Test Primary Master Hardware Failure
40
+
41
+This will require switching entirely to the shadow master. Depending on the state of the reverse port forward of the 8888 HTTP port from the cloud we may or may not have to try to terminate a hanging connection in order to be able to establish a new reverse port forward pointing from the cloud to the shadow master. The shadow master also then needs to use the cloud-based RabbitMQ instead of its local one. As a fine-tuning, we can practice the rolling re-sync of all cloud replicas which will likely have missed operations in the meantime.
42
+
43
+### Test Primary Master Java VM Failure
44
+
45
+This can be caused by a deadlock, VM crash, Full GC phase, massive performance degradation or other faulty behavior. We then need to actively close the reverse SSH port forward from the cloud to the production master's 8888 HTTP port and, as a precaution, switch the RabbitMQ tunnel from the cloud-based to the local RabbitMQ instance so that in case the production master "wakes up" again, e.g., after a Full GC, it does not start to interfere with the now active shadow master on the RabbitMQ fan-out exchange. On the shadow master we need to re-configure the SSH tunnels, particularly to target the cloud-based RabbitMQ and have the reverse port forward on port 8888 target the shadow master on site now.
46
+
47
+### Test Primary Master Failures with no Internet Connection
48
+
49
+Combine the above scenarios: a failing production master (hardware or VM-only) will require different tunnel re-configurations, especially regarding the then local security-service.sapsailing.com environment which may need to move to the shadow laptop.