From 6355f33ab9c1d6e58befc5a05659a660a533e8dd Mon Sep 17 00:00:00 2001
From: Maria Fernanda Morales <65073126+mfmo45@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:41:47 +0200
Subject: [PATCH] Added ToDos to SequentialDesign

---
 .../surrogate_models/sequential_design.py     | 82 ++++++++++++++-----
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/src/bayesvalidrox/surrogate_models/sequential_design.py b/src/bayesvalidrox/surrogate_models/sequential_design.py
index 322ebbbce..abe3f1a13 100644
--- a/src/bayesvalidrox/surrogate_models/sequential_design.py
+++ b/src/bayesvalidrox/surrogate_models/sequential_design.py
@@ -209,12 +209,13 @@ class SequentialDesign:
 
             return Xnew, None
 
-        # Generate needed Exploration class
-        explore = Exploration(self.ExpDesign, n_candidates)
-        explore.w = 100  # * ndim #500  # TODO: where does this value come from?
-
-        # Select criterion (mc-intersite-proj-th, mc-intersite-proj)
-        explore.mc_criterion = 'mc-intersite-proj'
+        # ------- Calculate Exploration weight -------
+        # Compute exploration weight based on trade off scheme
+        explore_w, exploit_w = self.tradeoff_weights(tradeoff_scheme,
+                                                     old_EDX,
+                                                     old_EDY)
+        print(f"\n Exploration weight={explore_w:0.3f} "
+              f"Exploitation weight={exploit_w:0.3f}\n")
 
         # Generate the candidate samples
         # TODO: here use the sampling method provided by the expdesign?
@@ -231,6 +232,9 @@ class SequentialDesign:
         # -----------------------------------------
         # ---------- EXPLORATION METHODS ----------
         # -----------------------------------------
+        # ToDo: Move this if/else into its own function called "do_exploration", which should select the
+        #       exploration samples, and assign exploration scores. We should send it explore_score, for if/else stmts
+        # ToDo: Check if explore_scores can be nan, and remove them from any score normalization
         if explore_method == 'LOOCV':
             # -----------------------------------------------------------------
             # TODO: LOOCV model construnction based on Feng et al. (2020)
@@ -256,14 +260,16 @@ class SequentialDesign:
 
         else:
             # ------- EXPLORATION: SPACE-FILLING DESIGN -------
+            # ToDo: Remove Exploration class and merge the functions into SequentialDesign class
             # Generate candidate samples from Exploration class
             explore = Exploration(self.ExpDesign, n_candidates)
-            explore.w = 100  # * ndim #500
+            explore.w = 100  # * ndim #500   # TODO: where does this value come from?
             # Select criterion (mc-intersite-proj-th, mc-intersite-proj)
             explore.mc_criterion = 'mc-intersite-proj'
             allCandidates, scoreExploration = explore.get_exploration_samples()
 
             # Temp: ---- Plot all candidates -----
+            # ToDo: Make its own function, called inside of the select_exploration_samples function.
             if ndim == 2:
                 def plotter(points, allCandidates, Method,
                             scoreExploration=None):
@@ -313,23 +319,19 @@ class SequentialDesign:
         if exploit_method.lower() == 'bayesoptdesign' or \
                 exploit_method.lower() == 'bayesactdesign':
 
-            # ------- Calculate Exoploration weight -------
-            # Compute exploration weight based on trade off scheme
-            explore_w, exploit_w = self.tradeoff_weights(tradeoff_scheme,
-                                                         old_EDX,
-                                                         old_EDY)
-            print(f"\n Exploration weight={explore_w:0.3f} "
-                  f"Exploitation weight={exploit_w:0.3f}\n")
-
             # ------- EXPLOITATION: BayesOptDesign & ActiveLearning -------
             if explore_w != 1.0:
                 # Check if all needed properties are set
                 if not hasattr(self.ExpDesign, 'max_func_itr'):
                     raise AttributeError('max_func_itr not given to the experimental design')
 
+
                 # Create a sample pool for rejection sampling
+                # ToDo: remove from here, add only to BayesOptDesign option
                 MCsize = 15000
                 X_MC = self.ExpDesign.generate_samples(MCsize, 'random')
+
+                # ToDo: Get samples from the "do_exploration"
                 candidates = self.ExpDesign.generate_samples(
                     n_candidates, 'latin_hypercube')
 
@@ -350,6 +352,7 @@ class SequentialDesign:
                         results.append(self.run_util_func(exploit_method, split_cand[i], i, sigma2, var, X_MC))
 
                 # Retrieve the results and append them
+                # ToDo: Rename U_J_D (here and everywhere) to something more representative
                 U_J_d = np.concatenate([results[NofE][1] for NofE in
                                         range(n_cand_groups)])
 
@@ -363,21 +366,28 @@ class SequentialDesign:
                     U_J_d = np.mean(U_J_d.reshape(-1, n_candidates), axis=1)
 
                 # Normalize U_J_d
+                # ToDo: Check if this is working for the case where the util_func should be minimized (e.g. IE)
+                # norm_U_J_D = U_J_d / np.nansum(np.abs(U_J_d))  # Possible solution
                 norm_U_J_d = U_J_d / np.sum(U_J_d)
+
             else:
                 norm_U_J_d = np.zeros((len(scoreExploration)))
 
             # ------- Calculate Total score -------
+            # ToDo: This should be outside of the exploration/exploitation if/else part
             # ------- Trade off between EXPLORATION & EXPLOITATION -------
             # Accumulate the samples
-            finalCandidates = np.concatenate((allCandidates, candidates), axis=0)
-            finalCandidates = np.unique(finalCandidates, axis=0)
+            # ToDo: Stop assuming 2 sets of samples (should only be 1)
+            finalCandidates = np.concatenate((allCandidates, candidates), axis=0)  # ToDo: Remove
+            finalCandidates = np.unique(finalCandidates, axis=0)                   # ToDo: Remove
 
             # Calculations take into account both exploration and exploitation
             # samples without duplicates
             totalScore = np.zeros(finalCandidates.shape[0])
             # self.totalScore = totalScore
 
+            # ToDo: Simplify (remove loop) for only one set of samples
+            # final_weights = explore_score*explore_weights + exploit_score*exploit_weight
             for cand_idx in range(finalCandidates.shape[0]):
                 # find candidate indices
                 idx1 = np.where(allCandidates == finalCandidates[cand_idx])[0]
@@ -406,7 +416,7 @@ class SequentialDesign:
             # find an optimal point subset to add to the initial design by
             # maximization of the utility score and taking care of NaN values
             temp = totalScore.copy()
-            temp[np.isnan(totalScore)] = -np.inf
+            temp[np.isnan(totalScore)] = -np.inf                # Since we are maximizing
             sorted_idxtotalScore = np.argsort(temp)[::-1]
             bestIdx = sorted_idxtotalScore[:n_new_samples]
 
@@ -426,7 +436,6 @@ class SequentialDesign:
                 # TODO: still not changed for e.g. 'Voronoi'
                 Xnew = finalCandidates[sorted_idxtotalScore[:n_new_samples]]
 
-
         elif exploit_method.lower() == 'varoptdesign':
             # ------- EXPLOITATION: VarOptDesign -------
             UtilMethod = var
@@ -458,6 +467,7 @@ class SequentialDesign:
                 ExploitScore = np.max(np.max(allModifiedLOO, axis=1), axis=1)
 
             elif UtilMethod in ['EIGF', 'ALM']:
+                # ToDo: Check the methods it actually can receive (ALC is missing from conditional list and code)
                 # ----- All other in  ['EIGF', 'ALM'] -----
                 # Initilize the ExploitScore array
                 # ExploitScore = np.zeros((len(old_EDX), len(OutputNames)))
@@ -508,6 +518,7 @@ class SequentialDesign:
             # maximization of the utility score and taking care of NaN values
             # Total score
             # Normalize U_J_d
+            # ToDo: Move this out of the exploitation if/else part (same as with Bayesian approaches)
             ExploitScore = ExploitScore / np.sum(ExploitScore)
             totalScore = exploit_w * ExploitScore
             # print(totalScore.shape)
@@ -534,11 +545,16 @@ class SequentialDesign:
                     # select the requested number of samples
                     Xnew[i] = newSamples[np.argmax(maxminScore)]
 
+        # ToDo: For these 2 last methods, we should find better ways
         elif exploit_method.lower() == 'alphabetic':
+            # ToDo: Check function to see what it does for scores/how it chooses points, so that it outputs the
+            #       scores. See how it works with exploration_scores.
+            # ToDo: Check if it is a minimization or maximization. (We think it is minimization)
             # ------- EXPLOITATION: ALPHABETIC -------
             Xnew = self.util_AlphOptDesign(allCandidates, var)
 
         elif exploit_method == 'Space-filling':
+            # ToDo: Set exploitation score to 0, so we can do tradeoff outside of if/else
             # ------- EXPLOITATION: SPACE-FILLING -------
             totalScore = scoreExploration
 
@@ -703,6 +719,7 @@ class SequentialDesign:
                     y_hat, std, sigma2Dict, var)
 
         elif method.lower() == 'bayesoptdesign':
+            # ToDo: Create X_MC here, since it is not used in the other active learning approaches.
             NCandidate = candidates.shape[0]
             U_J_d = np.zeros(NCandidate)
             for idx, X_can in tqdm(enumerate(candidates), ascii=True,
@@ -1714,3 +1731,30 @@ class SequentialDesign:
             )
 
         return RMSE_Mean, RMSE_std
+
+    def _select_indexes(self, prior_samples, collocation_points):
+        """
+        ToDo: This function will be used to check the user-input exploration samples, remove training points that
+        were already used, and select the first mc_size samples that have not yet been used for training. It should also
+        assign an exploration score of 0 to all samples.
+        Args:
+            prior_samples: array [mc_size, n_params]
+                Pre-defined samples from the parameter space, out of which the sample sets should be extracted.
+            collocation_points: [tp_size, n_params]
+                array with training points which were already used to train the surrogate model, and should therefore
+                not be re-explored.
+
+        Returns: array[self.mc_size,]
+            With indexes of the new candidate parameter sets, to be read from the prior_samples array
+
+        """
+        n_tp = collocation_points.shape[0]
+        # a) get index of elements that have already been used
+        aux1_ = np.where((prior_samples[:self.mc_samples + n_tp, :] == collocation_points[:, None]).all(-1))[1]
+        # b) give each element in the prior a True if it has not been used before
+        aux2_ = np.invert(np.in1d(np.arange(prior_samples[:self.mc_samples + n_tp, :].shape[0]), aux1_))
+        # c) Select the first d_size_bal elements in prior_sample that have not been used before
+        al_unique_index = np.arange(prior_samples[:self.mc_samples + n_tp, :].shape[0])[aux2_]
+        al_unique_index = al_unique_index[:self.mc_samples]
+
+        return al_unique_index
\ No newline at end of file
-- 
GitLab