(CW) added some documentation to hyperparameter optimization

52c06a22 · Wachter, Christoph · b5dbfb02 · 52c06a22
Commit 52c06a22 authored 9 months ago by Wachter, Christoph
--- a/sample/helpers/HyperparameterOptimization.py
+++ b/sample/helpers/HyperparameterOptimization.py
@@ -88,8 +88,22 @@ def _createLearner(

    return learner

+def parseOptimizationResults():
+    pass
+
 class HyperParamOptimization:
-    """Class for hyperparameter optimization"""
+    """Class for hyperparameter optimization. Finds the optimal hyperparameters from provided
+    starting values.
+
+    Requirements: The loaded sample project needs to contain training data with DFT
+    results. That is, configurations with some 'property_key' (most likely 'E_ads') need
+    to have been added to the project. Learners will be trained based on this quantity.
+    If a second property key for gas phase data is provided, the gas phase prior is
+    calculated for each hyperparameter set and used for the learner of the adsorbed system.
+
+    Important: Running hyperparameter optimizations for large sample projects may
+    require a lot of memory for multiprocessing.
+    """

    def __init__(
        self,
@@ -101,15 +115,39 @@ class HyperParamOptimization:
        unit_string="eV",
        learner_name="hyperparam_opt",
        is_gas_phase=False,
+        outfile="hyperparam_opt.out",
        n_processes=-1,
    ):
+        """Initializes HyperParamOptimization.
+
+        Parameters
+        ----------
+        proj : SampleProject
+        property_key : str
+            Property that is used for training in the hyperparameter optimization.
+        gas_phase_property_key : str, optional
+            Second property specifying gas phase data used for learning the gas phase prior.
+        training_set : ConfigurationSet, optional
+            Configurations which have the same property as given by 'property_key'. Only provide
+            this if you don't want to learn on all configurations with 'property_key'.
+        gas_phase_training_set : ConfigurationSet, optional
+            Same as training set but for the gas phase.
+        unit_string : str, optional
+        learner_name : str, optional
+        is_gas_phase : boolean, optional
+            Use this when you only learn on gas phase data, i.e. if you optimize the
+            hyperparameters for the gas phase only.
+        outfile : str, optional
+        n_processes : int, optional
+            Number of processes used when parallelising. Defaults to all available CPUs.
+        """
        self.proj = proj
        self.unit_string = unit_string
        self.property_key = property_key
        self.gas_phase_property_key = gas_phase_property_key
        self.learner_name = learner_name
        self.is_gas_phase = is_gas_phase
-        self.outfile = "hyperparam_opt.out"
+        self.outfile = outfile

        if n_processes == -1:
            n_processes = multiprocessing.cpu_count()
@@ -147,6 +185,8 @@ class HyperParamOptimization:
        self.input, self.hyperparam_combinations = constructHyperParamCombinations(**kwargs)

    def writeOptimizationInfo(self):
+        """Auxiliary method to write out basic information for the hyperparameter
+        optimization."""
        file = open(self.outfile, "w")
        file.write("Starting hyperparameter optimization\n")
        file.write("Printing hyperparamters given as input:\n\n")
@@ -164,6 +204,7 @@ class HyperParamOptimization:
        file.close()

    def writeRSME(self, file, errors):
+        """Auxiliary method to write the RMSE to the output file."""
        if self.gas_phase_property_key is None:
            file.write("\n")
            file.write(f"RSME_LOOCV:  {errors[0]}\n")
@@ -178,8 +219,9 @@ class HyperParamOptimization:
            file.write("\n")

    def getRSMEForHyperParams(self, **kwargs):
+        """Calculates the RMSE for a given set of hyperparameters. If 'gas_phase_property_key'
+        is set and 'is_gas_phase' is False, also learns the gas phase to obtain its prior."""
        errors = []
-        reduced_training = False
        prior_from_gas_phase = None
        # learn gas phase prior if gas_phase_property_key is specified
        if self.gas_phase_property_key is not None and not self.is_gas_phase:
@@ -216,6 +258,8 @@ class HyperParamOptimization:
        min_errors = None
        min_hyperparams = None

+        best_indices = []
+
        if self.gas_phase_property_key is None:
            n_rmse = 0
        else:
@@ -225,28 +269,34 @@ class HyperParamOptimization:
        hyperparam_chunks = [
            self.hyperparam_combinations[i:i + n_proc] for i in range(0, n_hyp, n_proc)
        ]
-        for chunk in hyperparam_chunks:
+        for j, chunk in enumerate(hyperparam_chunks):
            # create learners using a single thread to save memory
            pool = multiprocessing.Pool(n_proc)
            for i, errors in enumerate(starstarmap(pool, self.getRSMEForHyperParams, chunk)):
+                index = j * n_proc + i + 1
                file = open(self.outfile, "a")
                file.write(delim_string)
-                file.write("Hyperparamters:\n")
+                file.write(f"##### Index: {index:>{5}}\n")
+                file.write(delim_string)
+                file.write("Hyperparameters:\n")
                writeDictToFile(file, chunk[i])
                self.writeRSME(file, errors)
                if errors[n_rmse] < min_rsme:
                    min_rsme = errors[n_rmse]
                    min_errors = errors
                    min_hyperparams = chunk[i]
-                    file.write("This is the new minimum!\n\n")
-                file.write(delim_string)
+                    best_indices.append(index)
+                    file.write("-- This is the new minimum! --\n\n")
                file.close()
            pool.close()

        # write final output
        file = open(self.outfile, "a")
        file.write("Finished iterating through all hyperparameter combinations\n\n")
-        file.write("Result with best RSME:\n\n")
+        file.write("Indices with the ten lowest RSMEs:\n")
+        file.write(str(best_indices[-10:]) + "\n\n")
+        file.write(delim_string)
+        file.write("##### Result with best RSME:\n\n")
        file.write(delim_string)
        file.write("Hyperparamters:\n")
        writeDictToFile(file, min_hyperparams)