629b5f9ae2bd450b14a4d3b3c14f926c151db274
java/com.sap.sailing.windestimation.lab/src/com/sap/sailing/windestimation/evaluation/WindEstimatorManeuverNumberDependentEvaluationRunner.java
| ... | ... | @@ -110,6 +110,7 @@ public class WindEstimatorManeuverNumberDependentEvaluationRunner { |
| 110 | 110 | out.write(line); |
| 111 | 111 | } |
| 112 | 112 | } |
| 113 | + LoggingUtil.logInfo("CSV with evaluation results have been stored in: " + csvFile.getAbsolutePath()); |
|
| 113 | 114 | } |
| 114 | 115 | |
| 115 | 116 | } |
java/com.sap.sailing.windestimation.lab/src/com/sap/sailing/windestimation/model/SimpleModelsTraining.java
| ... | ... | @@ -20,7 +20,11 @@ import com.sap.sailing.windestimation.datavisualization.AggregatedDistanceDimens |
| 20 | 20 | import com.sap.sailing.windestimation.datavisualization.AggregatedDurationDimensionPlot; |
| 21 | 21 | import com.sap.sailing.windestimation.model.classifier.maneuver.ManeuverClassifierTrainer; |
| 22 | 22 | import com.sap.sailing.windestimation.model.classifier.maneuver.PersistedManeuverClassifiersScorePrinter; |
| 23 | +import com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionRegressorModelContext; |
|
| 24 | +import com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionRegressorModelContext.DistanceValueRange; |
|
| 23 | 25 | import com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionStdRegressorTrainer; |
| 26 | +import com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionRegressorModelContext; |
|
| 27 | +import com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionRegressorModelContext.DurationValueRange; |
|
| 24 | 28 | import com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionStdRegressorTrainer; |
| 25 | 29 | import com.sap.sailing.windestimation.util.LoggingUtil; |
| 26 | 30 | |
| ... | ... | @@ -32,9 +36,7 @@ import com.sap.sailing.windestimation.util.LoggingUtil; |
| 32 | 36 | public class SimpleModelsTraining { |
| 33 | 37 | |
| 34 | 38 | private static final int NUMBER_OF_THREADS = 3; |
| 35 | - private static final ExecutorService executorService = new ThreadPoolExecutor(NUMBER_OF_THREADS, NUMBER_OF_THREADS, |
|
| 36 | - 0, TimeUnit.MILLISECONDS, new ArrayBlockingQueue<>(NUMBER_OF_THREADS), |
|
| 37 | - new ThreadPoolExecutor.CallerRunsPolicy()); |
|
| 39 | + private static ExecutorService executorService; |
|
| 38 | 40 | |
| 39 | 41 | public static void main(String[] args) throws Exception { |
| 40 | 42 | new ManeuverForEstimationPersistenceManager().dropCollection(); |
| ... | ... | @@ -63,11 +65,15 @@ public class SimpleModelsTraining { |
| 63 | 65 | showInfoAboutDataCleaning(AggregatedSingleDimensionType.DURATION); |
| 64 | 66 | AggregatedDurationDimensionPlot.awaitWindowClosed(); |
| 65 | 67 | } while (JOptionPane.YES_OPTION != askDataCleaningFinished(AggregatedSingleDimensionType.DURATION)); |
| 68 | + showInfoAboutIntervalAdjustments(DurationBasedTwdTransitionRegressorModelContext.class, |
|
| 69 | + DurationValueRange.class); |
|
| 66 | 70 | do { |
| 67 | 71 | AggregatedDistanceDimensionPlot.main(args); |
| 68 | 72 | showInfoAboutDataCleaning(AggregatedSingleDimensionType.DISTANCE); |
| 69 | 73 | AggregatedDistanceDimensionPlot.awaitWindowClosed(); |
| 70 | 74 | } while (JOptionPane.YES_OPTION != askDataCleaningFinished(AggregatedSingleDimensionType.DISTANCE)); |
| 75 | + showInfoAboutIntervalAdjustments(DistanceBasedTwdTransitionRegressorModelContext.class, |
|
| 76 | + DistanceValueRange.class); |
|
| 71 | 77 | DurationBasedTwdTransitionStdRegressorTrainer.main(args); |
| 72 | 78 | DistanceBasedTwdTransitionStdRegressorTrainer.main(args); |
| 73 | 79 | Thread.sleep(1000); |
| ... | ... | @@ -92,15 +98,25 @@ public class SimpleModelsTraining { |
| 92 | 98 | private static void showInfoAboutDataCleaning(AggregatedSingleDimensionType dimension) { |
| 93 | 99 | JOptionPane.showMessageDialog(null, "Now, clean the data for " + dimension |
| 94 | 100 | + " dimension. Remove instances from MongoDB collection \"" + dimension.getCollectioName() |
| 95 | - + "\" which do not make sense. E.g. values which are represented by a small number of supporting instances (see histogram), values which cause implausible zig zag sections within zero-mean standard deviation curve and etc. Close the graphical tool, when you are done to resume model training."); |
|
| 101 | + + "\" which do not make sense. E.g. values which are represented by a small number of supporting instances (see histogram), values which cause implausible zig zag sections within zero-mean standard deviation curve and etc. Close the graphical tool, when you are done to resume the model training."); |
|
| 102 | + } |
|
| 103 | + |
|
| 104 | + private static void showInfoAboutIntervalAdjustments(Class<?> classToAdjust, Class<?> valueRangeEnum) { |
|
| 105 | + JOptionPane.showMessageDialog(null, "Now, open the source code of the class \"" + classToAdjust.getName() |
|
| 106 | + + "\". Scroll down to the definition of the inner enum \"" + valueRangeEnum.getSimpleName() |
|
| 107 | + + "\", read its JavaDoc and adjust its interval definitions so that each interval can be learned by the adjusted regressor model configuration with minimal error. Press OK ONLY after you are done."); |
|
| 96 | 108 | } |
| 97 | 109 | |
| 98 | 110 | private static void awaitThreadPoolCompletion() throws InterruptedException { |
| 111 | + executorService.shutdown(); |
|
| 99 | 112 | executorService.awaitTermination(24, TimeUnit.HOURS); |
| 100 | 113 | Thread.sleep(1000L); |
| 101 | 114 | } |
| 102 | 115 | |
| 103 | 116 | private static void executeInThreadPool(RunnableWithExceptionsCatch runnable) { |
| 117 | + if (executorService == null || executorService.isShutdown()) { |
|
| 118 | + createNewThreadPool(); |
|
| 119 | + } |
|
| 104 | 120 | executorService.execute(() -> { |
| 105 | 121 | try { |
| 106 | 122 | runnable.run(); |
| ... | ... | @@ -112,6 +128,11 @@ public class SimpleModelsTraining { |
| 112 | 128 | }); |
| 113 | 129 | } |
| 114 | 130 | |
| 131 | + private static void createNewThreadPool() { |
|
| 132 | + executorService = new ThreadPoolExecutor(NUMBER_OF_THREADS, NUMBER_OF_THREADS, 0, TimeUnit.MILLISECONDS, |
|
| 133 | + new ArrayBlockingQueue<>(NUMBER_OF_THREADS), new ThreadPoolExecutor.CallerRunsPolicy()); |
|
| 134 | + } |
|
| 135 | + |
|
| 115 | 136 | /** |
| 116 | 137 | * |
| 117 | 138 | * @author Vladislav Chumak (D069712) |
wiki/howto/windestimation.md
| ... | ... | @@ -2,66 +2,28 @@ |
| 2 | 2 | |
| 3 | 3 | This document describes the generation process of Machine Learning (ML) models which are used internally by wind estimation. It is highly recommended to proceed this howto step by step considering the order of sections. |
| 4 | 4 | |
| 5 | -## Overview |
|
| 6 | -In total, there are the following three categories of ML models used by wind estimation: |
|
| 7 | -1. **Maneuver Classifiers** |
|
| 8 | -2. **Regressors** of TWD delta standard deviation for the dimension **duration** |
|
| 9 | -3. **Regressors** of TWD delta standard deviation for the dimension **distance** |
|
| 10 | - |
|
| 11 | -Each of the model categories are composed of multiple models where each model targets a specific context. A context for a maneuver classifier is determined by the following attributes: |
|
| 12 | -* Maneuver features |
|
| 13 | - * Polar features enabled: yes/no |
|
| 14 | - * Mark features enabled: yes/no |
|
| 15 | - * Scaled speed features enabled: yes/no |
|
| 16 | -* Boat class filtering for the data on which the classifier is trained, such as a specific boat class, or with all boat classes included |
|
| 17 | - |
|
| 18 | -The context of regressor models is represented by its assigned input interval responsibility, e.g. [0 seconds; 62 seconds) for duration, or [80 meters; 1368 meters) for distance. |
|
| 19 | - |
|
| 20 | -Each of the ML model categories must be trained individually. The common workflow looks as follows: |
|
| 21 | -1. Get the training data from REST API of sapsailing.com |
|
| 22 | -2. Preprocess data |
|
| 23 | -3. Train the model category |
|
| 24 | - |
|
| 25 | -For each of the steps, appropriate Java classes must be executed per *Run with...->Java Application*. All referenced classes are located in *com.sap.sailing.windestimation.lab* Java project. Each class execution must finish without uncaught exceptions before proceeding to next instructions. After model training, all trained models can be collected in *./trained_wind_estimation_models*, which is normally */path/to/workspace/com.sap.sailing.windestimation/trained_wind_estimation_models* if you start the training classes in Eclipse per *Run with...->Java Application*. |
|
| 26 | - |
|
| 27 | -The details of the training process for each model category are described in the following sections. |
|
| 28 | - |
|
| 29 | 5 | ## Prerequisites |
| 30 | 6 | To complete the training process successfully, you need to make sure that you have the following stuff: |
| 31 | 7 | * A complete onboarding setup for SAP Sailing Analytics development |
| 32 | 8 | * MongoDB (**3.4 or higher!**) is up and running (same MongoDB instance as required in onboarding howto) |
| 33 | 9 | * At least 100 GB free space on the partition, where MongoDB is operating |
| 34 | 10 | * Installed graphical MongoDB client such as MongoDB Compass (Community version) |
| 35 | - |
|
| 36 | -## Get the training data from sapsailing.com |
|
| 37 | -The following steps import all the data required from sapsailing.com into the local MongoDB. These steps constitute a preprequisite for training of all ML model categories: |
|
| 38 | -1. Run *com.sap.sailing.windestimation.data.importer.ManeuverAndWindImporter* |
|
| 39 | -2. Run *com.sap.sailing.windestimation.data.importer.PolarDataImporter* |
|
| 40 | - |
|
| 41 | -## Maneuver classifiers training |
|
| 42 | -1. Run *com.sap.sailing.windestimation.model.classifier.maneuver.ManeuverClassifierTrainer*. Within the this step, the maneuver data is preprocessed and all maneuver classifiers are trained for each supported context. |
|
| 43 | -2. Optionally run *com.sap.sailing.windestimation.model.classifier.maneuver.ManeuverClassifierScoring* to print the performance of the trained classifiers. After this step, a list with macro-averaged F2-score of each trained classifier will be stored in *./maneuverClassifierScores.csv* |
|
| 44 | - |
|
| 45 | -## Duration-based TWD delta standard deviation regressor |
|
| 46 | - |
|
| 47 | -1. Run *com.sap.sailing.windestimation.data.importer.DurationBasedTwdTransitionImporter* |
|
| 48 | -2. Run *com.sap.sailing.windestimation.data.importer.AggregatedDurationBasedTwdTransitionImporter* |
|
| 49 | -3. Run *com.sap.sailing.windestimation.datavisualization.AggregatedDurationDimensionPlot* to visualize the wind data. A Swing-based GUI-Window must open with two charts, one XY-chart where the x-axis represents **seconds**, and the y-axis represents TWD delta-based series measures (e.g. standard deviation or mean). Below the chart, a histogram for data points of the XY-Chart is provided. You can zoom-in and zoom-out in each of the chart by mouse dragging. Be aware that currently the zoom level of both charts is not synchronized |
|
| 50 | -4. Open your graphical MongoDB client and connect to *windEstimation* database hosted by your local MongoDB. Open the collection with name *aggregatedDurationTwdTransition*. Within the collection you will see all the instances/data points visualized in the previous step. The total number of the points must not exceed 100. |
|
| 51 | -5. Delete all the instances within the collection which do not make sense. For this, use the data visualization tool from step 3 to identify such instances. Pay a special attention to the instances in the beginnning and end. Some of the instances are not representative due to small number of supporting instances which is visualized in the histogram. Restart the data visualization tool as often as need to visualize the changed data. |
|
| 52 | -6. Open the source code of the class *com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionRegressorModelContext*. Scroll down to the definition of the inner class/enum *DurationValueRange*. The enum defines the intervals for which a separate regressor model will be trained. Adjust the intervals accordingly in order to allow the regressor model to learn the data curve with minimal error. Make sure that there are at least 2 data points available within each interval. Datapoint with x = 0, y = 0 will be created automatically. |
|
| 53 | -7. Run *com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionStdRegressorTrainer* |
|
| 54 | -8. Verify the trained regressor functions. They are printed in the console output of the previous step. For instance, you can visualize the polynoms by means of https://www.wolframalpha.com/ |
|
| 55 | - |
|
| 56 | -## Distance-based TWD delta standard deviation regressor |
|
| 57 | - |
|
| 58 | -The steps of this sections are similar to the steps of the previous section. It is recommended to traverse through the previous section before starting with this one, because due to similarity of the steps, the similar steps in this section are described with less details and hints. |
|
| 59 | - |
|
| 60 | -1. Run *com.sap.sailing.windestimation.data.importer.DistanceBasedTwdTransitionImporter* |
|
| 61 | -2. Run *com.sap.sailing.windestimation.data.importer.AggregatedDistanceBasedTwdTransitionImporter* with at least 10 GB JVM memory. |
|
| 62 | -3. Run *com.sap.sailing.windestimation.datavisualization.AggregatedDistanceDimensionPlot* to visualize the wind data. Here, the x-axis of the XY-chart represents **meters** |
|
| 63 | -4. Open your graphical MongoDB client and connect to *windEstimation* database hosted by your local MongoDB. Open collection *aggregatedDistanceTwdTransition* collection. Within the collection you will see all the instances/data points visualized in the previous step. The total number of the points must not exceed 100. |
|
| 64 | -5. Delete all the instances within the collection which do not make sense. |
|
| 65 | -6. Open the source code of the class *com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionRegressorModelContext*. Scroll down to the definition of the inner class/enum *DistanceValueRange*. The enum defines the intervals for which a separate regressor model will be trained. Adjust the intervals accordingly in order to allow the regressor model to learn the data curve with minimal error. |
|
| 66 | -7. Run *com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionStdRegressorTrainer* |
|
| 67 | -8. Verify the trained regressor functions. They are printed in the console output of the previous step. |
|
| ... | ... | \ No newline at end of file |
| 0 | +* 16 GB RAM |
|
| 1 | +* ~24 operation hours of your computer |
|
| 2 | + |
|
| 3 | +## Model training process |
|
| 4 | +1. Run ``com.sap.sailing.windestimation.model.SimpleModelsTraining`` as normal Java Application. This program downloads all the necessary maneuver and wind data, pre-processes them and initiates training of maneuver classifiers. |
|
| 5 | +2. Make sure that the launched program does not get terminated by an uncaught exception. Wait until a graphical info dialog shows up which requests you to perform data cleansing for the duration dimension, and press OK. |
|
| 6 | +  |
|
| 7 | + A Swing-based GUI-Window must open with two charts, one XY-chart where the x-axis represents **seconds**, and the y-axis represents TWD delta-based series measures (e.g. standard deviation or mean). Below the chart, a histogram for the data points of the XY-Chart is provided. You can zoom in and zoom out in each of the charts by mouse dragging. Be aware that currently, the zoom levels of the two charts are not synchronized. |
|
| 8 | +  |
|
| 9 | +3. Open your graphical MongoDB client and connect to ``windEstimation`` database hosted by your local MongoDB. Open the collection with name ``aggregatedDurationTwdTransition``. Within the collection you will see all the instances/data points visualized in the previous step. The attribute used for the x-axis is represented by ``value``. |
|
| 10 | +  |
|
| 11 | +4. Delete all the instances within the collection which do not make sense. For this, use the data visualization tool from step 2 to identify such instances. Some of the instances are not representative due to the small number of supporting instances which is visualized in the histogram. Such instances can produce unreasonable bumps in the XY-chart. The desired output of this step is that the series curve ``Zero mean sigma`` looks smooth and is always growing, as depicted below: |
|
| 12 | +  |
|
| 13 | + Use the ``Refresh charts`` button as often as needed to update the charts with the modified data in MongoDB. Close the graphical visualization tool window after you are done with data cleansing to resume the training process. Confirm the confirmation dialog after you have finished the data cleansing of duration dimension: |
|
| 14 | +  |
|
| 15 | +5. A new information dialog shows up (do not press OK yet!) requesting you to open the source code of the class ``com.sap.sailing.windestimation.model.regressor.twdtransition.DurationBasedTwdTransitionRegressorModelContext``. Open it and scroll down to the definition of the inner enum ``DurationValueRange``. The enum defines the intervals for which a separate regressor model will be trained. Read the Javadoc of ``DurationValueRange`` and adjust the intervals accordingly in order to allow the regressor model to learn the ``Zero mean sigma`` curve with minimal error. You can also configure the polynomial which will be used for regressor training. Make sure that there are at least 2 data points available within each interval. The datapoint with x = 0, y = 0 will be created automatically. Press OK in the information dialog after you are done. |
|
| 16 | +6. A graphical info dialog shows up which requests you to perform data cleansing for the *distance* dimension. Press OK. All steps for data cleansing for the distance dimension are very similar to steps 2 through 5 for the duration dimension. Thus, consult these steps to complete data cleansing and model configuration for the distance dimension. The unit used for distance representation is **meters**. The collection name required in step 3 is ``aggregatedDistanceTwdTransition``. The class required in step 5 is ``com.sap.sailing.windestimation.model.regressor.twdtransition.DistanceBasedTwdTransitionRegressorModelContext`` and its inner enum is ``DistanceValueRange``. |
|
| 17 | +7. Wait until model training finishes and the program terminates normally. A new file with serialized representation of internal wind estimation models should be located in ``./windEstimationModels.dat``. The absolute path of the file must be printed in the console output of the program. You can upload the file via HTTP POST to http://sapsailing.com/windestimation/api/windestimation_data (see ``com.sap.sailing.windestimation.jaxrs.api.WindEstimationDataResource``) to update the wind estimation of a server instance. |
|
| 18 | +8. Optionally, run ``com.sap.sailing.windestimation.evaluation.WindEstimatorManeuverNumberDependentEvaluationRunner`` as a normal Java Application to evaluate the wind estimation with the newly trained models. The evaluation score will be stored as CSV in ``./maneuverNumberDependentEvaluation.csv``. |
|
| ... | ... | \ No newline at end of file |