Source code for cvopt.model_selection._search

import numpy as np

from hyperopt import fmin, tpe, hp
from GPyOpt.methods import BayesianOptimization

from ._base import BaseSearcher, fit_and_score, mk_feature_select_index, mk_objfunc
from ._ga import gamin
from ..utils._base import compress
from ..utils._logger import CVSummarizer, NoteBookVisualizer

class SimpleoptCV():
    """
    Wrapper for the cross-validation optimizer classes.
    This class allows unified handling of the different backends.
    For details of each backend optimizer class, refer to that class's page.

    Parameters
    ----------
    estimator
        scikit-learn estimator like.

    param_distributions: dict.
        Search space.

    scoring: string or sklearn.metrics.make_scorer.
        Evaluation index of search.
        When scoring is None, use the estimator's default scorer and a greater score is better.

    cv: scikit-learn cross-validator or int(number of folds), default=5.
        Cross validation setting.

    max_iter: int, default=32.
        Number of search iterations.

    random_state: int or None, default=None.
        The seed used by the random number generator.

    n_jobs: int, default=1.
        Number of jobs to run in parallel.

    pre_dispatch: int or string, default="2*n_jobs".
        Controls the number of jobs that get dispatched during parallel execution.

    verbose: int(0, 1 or 2), default=0.
        Controls the verbosity.

        0: don't display status.

        1: display status by stdout.

        2: display status by graph.

    logdir: str or None, default=None.
        Path of the directory in which log files are saved.
        When logdir is None, the log is not saved.

        [directory structure]
        logdir
        |-cv_results
        | |-{model_id}.csv : search log
        | ...
        |-cv_results_graph
        | |-{model_id}.html : search log(graph)
        | ...
        |-estimators_{model_id}
          |-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on fold train data
          ...
          |-{model_id}_index{search count}_test.pkl : an estimator fitted on whole train data

    save_estimator: int, default=0.
        Estimator save setting.

        0: An estimator is not saved.

        1: An estimator fitted on fold train data is saved per cv-fold.

        2: In addition to 1, an estimator fitted on whole train data is saved per cv.

    saver: str or function, default="sklearn".
        Estimator's saver.

        * "sklearn": use sklearn.externals.joblib.dump. Basically for scikit-learn.

        * function: function whose variables are the model class and the save path.

        Examples
        --------
        >>> def saver(model, path):
        ...     save_model(model, path+".h5")

    model_id: str or None, default=None.
        This is used in the log filename.
        When model_id is None, it is generated from the date and time.

    cloner: str or function, default="sklearn".
        Estimator's cloner.

        * "sklearn": try sklearn.base.clone, fall back to copy.deepcopy. Basically for scikit-learn.

        * function: function whose variable is the model.

        Examples
        --------
        >>> def cloner(model):
        ...     return clone_model(model)

    refit: bool, default=True.
        Refit an estimator using the best found parameters on the whole train data(=X).

    backend: str, default="hyperopt".
        Backend optimizer. The following backends are supported.

        * "hyperopt": Sequential Model-based Global Optimization

        * "bayesopt": Bayesian Optimization

        * "gaopt": Genetic Algorithm

        * "randomopt": Random Search

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search.

    best_score_ : float
        Cross-validated score of the best_estimator.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.
    """
    def __init__(self, estimator, param_distributions, scoring=None, cv=5,
                 max_iter=32, random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
                 verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
                 cloner="sklearn", refit=True, backend="hyperopt", **kwargs):
        if backend == "hyperopt":
            self.optcv = HyperoptCV(estimator, param_distributions, scoring=scoring, cv=cv,
                                    max_iter=max_iter, random_state=random_state,
                                    n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                                    verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                                    saver=saver, model_id=model_id, cloner=cloner, refit=refit,
                                    **kwargs)
        elif backend == "bayesopt":
            self.optcv = BayesoptCV(estimator, param_distributions, scoring=scoring, cv=cv,
                                    max_iter=max_iter, random_state=random_state,
                                    n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                                    verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                                    saver=saver, model_id=model_id, refit=refit, cloner=cloner,
                                    **kwargs)
        elif backend == "gaopt":
            self.optcv = GAoptCV(estimator, param_distributions, scoring=scoring, cv=cv,
                                 max_iter=max_iter, random_state=random_state,
                                 n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                                 verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                                 saver=saver, model_id=model_id, refit=refit, cloner=cloner,
                                 **kwargs)
        elif backend == "randomopt":
            self.optcv = RandomoptCV(estimator, param_distributions, scoring=scoring, cv=cv,
                                     max_iter=max_iter, random_state=random_state,
                                     n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                                     verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                                     saver=saver, model_id=model_id, refit=refit, cloner=cloner,
                                     **kwargs)
        else:
            raise ValueError("`backend` " + str(backend) + " is not supported.")
        self.backend = backend

    def __getattr__(self, name):
        return getattr(self.optcv, name)
class HyperoptCV(BaseSearcher):
    """
    Cross-validation optimizer using Hyperopt (Sequential Model-Based Global Optimization).

    Parameters
    ----------
    estimator
        scikit-learn estimator like.

    param_distributions: dict.
        Search space.

    scoring: string or sklearn.metrics.make_scorer.
        Evaluation index of search.
        When scoring is None, use the estimator's default scorer and a greater score is better.

    cv: scikit-learn cross-validator or int(number of folds), default=5.
        Cross validation setting.

    max_iter: int, default=32.
        Number of search iterations.

    random_state: int or None, default=None.
        The seed used by the random number generator.

    n_jobs: int, default=1.
        Number of jobs to run in parallel.

    pre_dispatch: int or string, default="2*n_jobs".
        Controls the number of jobs that get dispatched during parallel execution.

    verbose: int(0, 1 or 2), default=0.
        Controls the verbosity.

        0: don't display status.

        1: display status by stdout.

        2: display status by graph.

    logdir: str or None, default=None.
        Path of the directory in which log files are saved.
        When logdir is None, the log is not saved.

        [directory structure]
        logdir
        |-cv_results
        | |-{model_id}.csv : search log
        | ...
        |-cv_results_graph
        | |-{model_id}.html : search log(graph)
        | ...
        |-estimators_{model_id}
          |-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on fold train data
          ...
          |-{model_id}_index{search count}_test.pkl : an estimator fitted on whole train data

    save_estimator: int, default=0.
        Estimator save setting.

        0: An estimator is not saved.

        1: An estimator fitted on fold train data is saved per cv-fold.

        2: In addition to 1, an estimator fitted on whole train data is saved per cv.

    saver: str or function, default="sklearn".
        Estimator's saver.

        * "sklearn": use sklearn.externals.joblib.dump. Basically for scikit-learn.

        * function: function whose variables are the model class and the save path.

        Examples
        --------
        >>> def saver(model, path):
        ...     save_model(model, path+".h5")

    model_id: str or None, default=None.
        This is used in the log filename.
        When model_id is None, it is generated from the date and time.

    cloner: str or function, default="sklearn".
        Estimator's cloner.

        * "sklearn": try sklearn.base.clone, fall back to copy.deepcopy. Basically for scikit-learn.

        * function: function whose variable is the model.

        Examples
        --------
        >>> def cloner(model):
        ...     return clone_model(model)

    refit: bool, default=True.
        Refit an estimator using the best found parameters on the whole train data(=X).

    algo: hyperopt search algorithm class, default=tpe.suggest.
        Hyperopt's parameter. Search algorithm.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search.

    best_score_ : float
        Cross-validated score of the best_estimator.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.
    """
    def __init__(self, estimator, param_distributions, scoring=None, cv=5,
                 max_iter=32, random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
                 verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
                 refit=True, cloner="sklearn", algo=tpe.suggest):
        super().__init__(estimator=estimator, param_distributions=param_distributions,
                         scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                         verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                         saver=saver, model_id=model_id, refit=refit, cloner=cloner,
                         backend="hyperopt")
        self.max_iter = max_iter
        self.algo = algo
        if random_state is None:
            self.random_state = random_state
        else:
            # hyperopt.fmin expects a RandomState instance as rstate.
            self.random_state = np.random.RandomState(int(random_state))
        self.search_algo = "hyperopt"
    def fit(self, X, y=None, validation_data=None, groups=None, feature_groups=None, min_n_features=2):
        """
        Run fit.

        Parameters
        ----------
        X: numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
            Features. Detail depends on estimator.

        y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
            Target variable. Detail depends on estimator.

        validation_data: tuple(X, y) or None, default=None.
            Data to compute the validation score. Detail depends on estimator.
            When validation_data is None, the validation score is not computed.

        groups: array-like, shape = (n_samples,) or None, default=None.
            Group labels for the samples used while splitting the dataset into
            train/test set. (Input of the scikit-learn cross-validator.)

        feature_groups: array-like, shape = (n_features,) or None, default=None.
            Group labels for the features used during feature selection.
            When feature_groups is None, feature selection is not run.
            When a feature_groups value is -1, that group's features are always used.

        min_n_features: int, default=2.
            When the number of X's feature columns is less than min_n_features,
            return search failure.
            e.g. If the estimator has a column-sampling function, use this option
            to avoid X becoming too small and raising an error.
        """
        X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(
            X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
        obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups,
                         feature_axis=BaseSearcher.feature_axis,
                         estimator=self.estimator, scoring=self.scoring, cv=cv,
                         param_distributions=param_distributions, backend=self.backend,
                         failedscore=np.nan, saver=self.saver, cloner=self._cloner,
                         score_summarizer=BaseSearcher.score_summarizer,
                         Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs,
                         pre_dispatch=self.pre_dispatch, cvsummarizer=self._cvs,
                         save_estimator=self.save_estimator, min_n_features=min_n_features)

        try:
            fmin(obj, param_distributions, algo=self.algo, max_evals=self.max_iter,
                 rstate=self.random_state)
        except KeyboardInterrupt:
            pass

        self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
                           best_params=self._cvs.best_params_,
                           best_score=self._cvs.best_score_)
        return self
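
# --- Search-space sketch for HyperoptCV (illustrative) -----------------------
# `fit` hands `param_distributions` to `hyperopt.fmin`, so for this backend the
# space can be written with the `hp` expressions imported at the top of this
# module. Whether a raw `hp` space passes through `_preproc_fit` unchanged is an
# assumption here; the estimator is arbitrary.
def _example_hyperopt_space():
    from sklearn.svm import SVC

    param_distributions = {
        "C": hp.loguniform("C", np.log(1e-3), np.log(1e3)),  # continuous, log scale
        "kernel": hp.choice("kernel", ["rbf", "linear"]),    # categorical
    }
    return HyperoptCV(SVC(), param_distributions, cv=3, max_iter=8, random_state=0)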
class BayesoptCV(BaseSearcher):
    """
    Cross-validation optimizer using GPyOpt.methods.BayesianOptimization.

    Parameters
    ----------
    estimator
        scikit-learn estimator like.

    param_distributions: dict.
        Search space.

    scoring: string or sklearn.metrics.make_scorer.
        Evaluation index of search.
        When scoring is None, use the estimator's default scorer and a greater score is better.

    cv: scikit-learn cross-validator or int(number of folds), default=5.
        Cross validation setting.

    max_iter: int, default=32.
        Number of search iterations.

    random_state: int or None, default=None.
        The seed used by the random number generator.

    n_jobs: int, default=1.
        Number of jobs to run in parallel.

    pre_dispatch: int or string, default="2*n_jobs".
        Controls the number of jobs that get dispatched during parallel execution.

    verbose: int(0, 1 or 2), default=0.
        Controls the verbosity.

        0: don't display status.

        1: display status by stdout.

        2: display status by graph.

    logdir: str or None, default=None.
        Path of the directory in which log files are saved.
        When logdir is None, the log is not saved.

        [directory structure]
        logdir
        |-cv_results
        | |-{model_id}.csv : search log
        | ...
        |-cv_results_graph
        | |-{model_id}.html : search log(graph)
        | ...
        |-estimators_{model_id}
          |-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on fold train data
          ...
          |-{model_id}_index{search count}_test.pkl : an estimator fitted on whole train data

    save_estimator: int, default=0.
        Estimator save setting.

        0: An estimator is not saved.

        1: An estimator fitted on fold train data is saved per cv-fold.

        2: In addition to 1, an estimator fitted on whole train data is saved per cv.

    saver: str or function, default="sklearn".
        Estimator's saver.

        * "sklearn": use sklearn.externals.joblib.dump. Basically for scikit-learn.

        * function: function whose variables are the model class and the save path.

        Examples
        --------
        >>> def saver(model, path):
        ...     save_model(model, path+".h5")

    model_id: str or None, default=None.
        This is used in the log filename.
        When model_id is None, it is generated from the date and time.

    cloner: str or function, default="sklearn".
        Estimator's cloner.

        * "sklearn": try sklearn.base.clone, fall back to copy.deepcopy. Basically for scikit-learn.

        * function: function whose variable is the model.

        Examples
        --------
        >>> def cloner(model):
        ...     return clone_model(model)

    refit: bool, default=True.
        Refit an estimator using the best found parameters on the whole train data(=X).

    max_time: float, default=numpy.inf.
        GPyOpt's parameter. Maximum exploration horizon in seconds.

    model_type: str, default="GP".
        GPyOpt's parameter. Type of model to use as surrogate.

        * 'GP', standard Gaussian process.

        * 'GP_MCMC', Gaussian process with prior in the hyper-parameters.

        * 'sparseGP', sparse Gaussian process.

        * 'warpedGP', warped Gaussian process.

        * 'InputWarpedGP', input warped Gaussian process.

        * 'RF', random forest (scikit-learn).

    initial_params: numpy.array or None, default=None.
        GPyOpt's parameter. Initial inputs of the GPy model.

    initial_score: numpy.array or None, default=None.
        GPyOpt's parameter. Initial outputs of the GPy model.

    initial_design_numdata: int, default=5.
        GPyOpt's parameter. Number of initial points collected jointly before
        the optimization starts running.

    initial_design_type: str, default="random".
        GPyOpt's parameter. Type of initial design.

        * 'random', to collect points in random locations.

        * 'latin', to collect points in a Latin hypercube (discrete variables are sampled randomly).

    acquisition_type: str, default="EI".
        GPyOpt's parameter. Type of acquisition function to use.

        * 'EI', expected improvement.

        * 'EI_MCMC', integrated expected improvement (requires GP_MCMC model).

        * 'MPI', maximum probability of improvement.

        * 'MPI_MCMC', maximum probability of improvement (requires GP_MCMC model).

        * 'LCB', GP-Lower confidence bound.

        * 'LCB_MCMC', integrated GP-Lower confidence bound (requires GP_MCMC model).

    normalize_Y: bool, default=True.
        GPyOpt's parameter. Whether to normalize the outputs before performing any optimization.

    exact_feval: bool, default=False.
        GPyOpt's parameter. Whether the outputs are exact.

    acquisition_optimizer_type: str, default="lbfgs".
        GPyOpt's parameter. Type of optimizer used for the acquisition function.

        * 'lbfgs': L-BFGS.

        * 'DIRECT': Dividing Rectangles.

        * 'CMA': covariance matrix adaptation.

    model_update_interval: int, default=1.
        GPyOpt's parameter. Interval of collected observations after which the model is updated.

    evaluator_type: str, default="sequential".
        GPyOpt's parameter. Determines the way the objective is evaluated
        (all methods are equivalent if the batch size is one).

        * 'sequential', sequential evaluations.

        * 'random': synchronous batch that selects the first element as in a
          sequential policy and the rest randomly.

        * 'local_penalization': batch method proposed in (Gonzalez et al. 2016).

        * 'thompson_sampling': batch method using Thompson sampling.

    batch_size: int, default=1.
        GPyOpt's parameter. Size of the batch in which the objective is evaluated.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search.

    best_score_ : float
        Cross-validated score of the best_estimator.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.
    """
    def __init__(self, estimator, param_distributions, scoring=None, cv=5,
                 max_iter=32, random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
                 verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
                 cloner="sklearn", refit=True,
                 max_time=np.inf, model_type="GP", initial_params=None, initial_score=None,
                 initial_design_numdata=5, initial_design_type="random", acquisition_type="EI",
                 normalize_Y=True, exact_feval=False, acquisition_optimizer_type="lbfgs",
                 model_update_interval=1, evaluator_type="sequential", batch_size=1):
        super().__init__(estimator=estimator, param_distributions=param_distributions,
                         scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                         verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                         saver=saver, model_id=model_id, cloner=cloner, refit=refit,
                         backend="bayesopt")
        self.random_state = random_state
        self.max_iter = max_iter
        self.max_time = max_time
        self.model_type = model_type
        self.initial_params = initial_params
        self.initial_score = initial_score
        self.initial_design_numdata = initial_design_numdata
        self.initial_design_type = initial_design_type
        self.acquisition_type = acquisition_type
        self.normalize_Y = normalize_Y
        self.exact_feval = exact_feval
        self.acquisition_optimizer_type = acquisition_optimizer_type
        self.model_update_interval = model_update_interval
        self.evaluator_type = evaluator_type
        self.batch_size = batch_size
        self.failedscore = None
        self.search_algo = "bayesopt"
    def fit(self, X, y=None, validation_data=None, groups=None, feature_groups=None, min_n_features=2):
        """
        Run fit.

        Parameters
        ----------
        X: numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
            Features. Detail depends on estimator.

        y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
            Target variable. Detail depends on estimator.

        validation_data: tuple(X, y) or None, default=None.
            Data to compute the validation score. Detail depends on estimator.
            When validation_data is None, the validation score is not computed.

        groups: array-like, shape = (n_samples,) or None, default=None.
            Group labels for the samples used while splitting the dataset into
            train/test set. (Input of the scikit-learn cross-validator.)

        feature_groups: array-like, shape = (n_features,) or None, default=None.
            Group labels for the features used during feature selection.
            When feature_groups is None, feature selection is not run.
            When a feature_groups value is -1, that group's features are always used.

        min_n_features: int, default=2.
            When the number of X's feature columns is less than min_n_features,
            return search failure.
            e.g. If the estimator has a column-sampling function, use this option
            to avoid X becoming too small and raising an error.
        """
        X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(
            X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
        np.random.seed(self.random_state)

        if self.failedscore is None:
            # When a search trial fails, a baseline score computed from random
            # predictions is returned instead. This score is fixed at the first fit.
            self.failedscore = self._random_scoring(X, y)

        obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups,
                         feature_axis=BaseSearcher.feature_axis,
                         estimator=self.estimator, scoring=self.scoring, cv=cv,
                         param_distributions=param_distributions, backend=self.backend,
                         failedscore=self.failedscore, saver=self.saver, cloner=self._cloner,
                         score_summarizer=BaseSearcher.score_summarizer,
                         Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs,
                         pre_dispatch=self.pre_dispatch, cvsummarizer=self._cvs,
                         save_estimator=self.save_estimator, min_n_features=min_n_features)

        self.opt = BayesianOptimization(obj, domain=param_distributions, constraints=None,
                                        cost_withGradients=None, model_type=self.model_type,
                                        X=self.initial_params, Y=self.initial_score,
                                        initial_design_numdata=self.initial_design_numdata,
                                        initial_design_type=self.initial_design_type,
                                        acquisition_type=self.acquisition_type,
                                        normalize_Y=self.normalize_Y,
                                        exact_feval=self.exact_feval,
                                        acquisition_optimizer_type=self.acquisition_optimizer_type,
                                        model_update_interval=self.model_update_interval,
                                        evaluator_type=self.evaluator_type,
                                        batch_size=self.batch_size, num_cores=1,
                                        verbosity=False, verbosity_model=False,
                                        maximize=False, de_duplication=False)
        try:
            self.opt.run_optimization(max_iter=self.max_iter, max_time=self.max_time)
        except KeyboardInterrupt:
            pass

        self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
                           best_params=self._cvs.best_params_,
                           best_score=self._cvs.best_score_)
        return self
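
# --- Domain sketch for BayesoptCV (illustrative) ------------------------------
# `fit` passes the (preprocessed) search space to
# GPyOpt.methods.BayesianOptimization as `domain`, which natively expects a list
# of dicts like the one below. Whether a raw GPyOpt domain passes through
# `_preproc_fit` unchanged is an assumption here; the parameter names are arbitrary.
_example_gpyopt_domain = [
    {"name": "C", "type": "continuous", "domain": (1e-3, 1e3)},
    {"name": "max_iter", "type": "discrete", "domain": tuple(range(50, 550, 50))},
]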
class GAoptCV(BaseSearcher):
    """
    Cross-validation optimizer using a genetic algorithm.

    Parameters
    ----------
    estimator
        scikit-learn estimator like.

    param_distributions: dict.
        Search space.

    scoring: string or sklearn.metrics.make_scorer.
        Evaluation index of search.
        When scoring is None, use the estimator's default scorer and a greater score is better.

    cv: scikit-learn cross-validator or int(number of folds), default=5.
        Cross validation setting.

    max_iter: int, default=32.
        Number of search iterations.

    random_state: int or None, default=None.
        The seed used by the random number generator.

    n_jobs: int, default=1.
        Number of jobs to run in parallel.

    pre_dispatch: int or string, default="2*n_jobs".
        Controls the number of jobs that get dispatched during parallel execution.

    verbose: int(0, 1 or 2), default=0.
        Controls the verbosity.

        0: don't display status.

        1: display status by stdout.

        2: display status by graph.

    logdir: str or None, default=None.
        Path of the directory in which log files are saved.
        When logdir is None, the log is not saved.

        [directory structure]
        logdir
        |-cv_results
        | |-{model_id}.csv : search log
        | ...
        |-cv_results_graph
        | |-{model_id}.html : search log(graph)
        | ...
        |-estimators_{model_id}
          |-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on fold train data
          ...
          |-{model_id}_index{search count}_test.pkl : an estimator fitted on whole train data

    save_estimator: int, default=0.
        Estimator save setting.

        0: An estimator is not saved.

        1: An estimator fitted on fold train data is saved per cv-fold.

        2: In addition to 1, an estimator fitted on whole train data is saved per cv.

    saver: str or function, default="sklearn".
        Estimator's saver.

        * "sklearn": use sklearn.externals.joblib.dump. Basically for scikit-learn.

        * function: function whose variables are the model class and the save path.

        Examples
        --------
        >>> def saver(model, path):
        ...     save_model(model, path+".h5")

    model_id: str or None, default=None.
        This is used in the log filename.
        When model_id is None, it is generated from the date and time.

    cloner: str or function, default="sklearn".
        Estimator's cloner.

        * "sklearn": try sklearn.base.clone, fall back to copy.deepcopy. Basically for scikit-learn.

        * function: function whose variable is the model.

        Examples
        --------
        >>> def cloner(model):
        ...     return clone_model(model)

    refit: bool, default=True.
        Refit an estimator using the best found parameters on the whole train data(=X).

    iter_pergeneration: int, default=8.
        Genetic algorithm's parameter. Number of iterations per generation
        (it corresponds to the population size).

    param_crossover_proba: float or function, default=0.5.
        Genetic algorithm's parameter.
        Probability that a certain parameter takes the other parent's value.
        If this value is 0 or 1, parameters are not changed by crossover.
        A function of the generation number may also be passed.
        Generation numbering starts at 0, but the generation-0 population is
        created by random sampling, so this function is used from generation 1 onward.

        Examples
        --------
        >>> def f(generation):
        ...     return 0.5 / generation

    param_mutation_proba: float or function, default=0.01.
        Genetic algorithm's parameter.
        Probability that a certain parameter is mutated.
        A function of the generation number may also be passed.

    random_sampling_proba: float or function, default=0.01.
        Genetic algorithm's parameter.
        Probability that an individual in a generation is created by random sampling.
        A function of the generation number may also be passed.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search.

    best_score_ : float
        Cross-validated score of the best_estimator.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.
    """
    def __init__(self, estimator, param_distributions, scoring=None, cv=5,
                 max_iter=32, random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
                 verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
                 cloner="sklearn", refit=True,
                 iter_pergeneration=8, param_crossover_proba=0.5,
                 param_mutation_proba=0.01, random_sampling_proba=0.01):
        super().__init__(estimator=estimator, param_distributions=param_distributions,
                         scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                         verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                         saver=saver, model_id=model_id, cloner=cloner, refit=refit,
                         backend="gaopt")
        self.max_iter = max_iter
        self.iter_pergeneration = iter_pergeneration
        self.param_crossover_proba = param_crossover_proba
        self.param_mutation_proba = param_mutation_proba
        self.random_sampling_proba = random_sampling_proba
        if random_state is None:
            self.random_state = random_state
        else:
            # Stored as an int because fit seeds the global generator via
            # numpy.random.seed, which does not accept a RandomState instance.
            self.random_state = int(random_state)
        self.search_algo = "gaopt"
    def fit(self, X, y=None, validation_data=None, groups=None, feature_groups=None, min_n_features=2):
        """
        Run fit.

        Parameters
        ----------
        X: numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
            Features. Detail depends on estimator.

        y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
            Target variable. Detail depends on estimator.

        validation_data: tuple(X, y) or None, default=None.
            Data to compute the validation score. Detail depends on estimator.
            When validation_data is None, the validation score is not computed.

        groups: array-like, shape = (n_samples,) or None, default=None.
            Group labels for the samples used while splitting the dataset into
            train/test set. (Input of the scikit-learn cross-validator.)

        feature_groups: array-like, shape = (n_features,) or None, default=None.
            Group labels for the features used during feature selection.
            When feature_groups is None, feature selection is not run.
            When a feature_groups value is -1, that group's features are always used.

        min_n_features: int, default=2.
            When the number of X's feature columns is less than min_n_features,
            return search failure.
            e.g. If the estimator has a column-sampling function, use this option
            to avoid X becoming too small and raising an error.
        """
        X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(
            X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
        np.random.seed(self.random_state)
        obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups,
                         feature_axis=BaseSearcher.feature_axis,
                         estimator=self.estimator, scoring=self.scoring, cv=cv,
                         param_distributions=param_distributions, backend=self.backend,
                         failedscore=np.nan, saver=self.saver, cloner=self._cloner,
                         score_summarizer=BaseSearcher.score_summarizer,
                         Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs,
                         pre_dispatch=self.pre_dispatch, cvsummarizer=self._cvs,
                         save_estimator=self.save_estimator, min_n_features=min_n_features)

        try:
            gamin(obj, param_distributions, max_iter=self.max_iter,
                  iter_pergeneration=self.iter_pergeneration,
                  param_crossover_proba=self.param_crossover_proba,
                  param_mutation_proba=self.param_mutation_proba,
                  random_sampling_proba=self.random_sampling_proba,
                  cvsummarizer=self._cvs)
        except KeyboardInterrupt:
            pass

        self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
                           best_params=self._cvs.best_params_,
                           best_score=self._cvs.best_score_)
        return self
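
# --- Schedule sketch for GAoptCV (illustrative) -------------------------------
# As documented above, `param_crossover_proba`, `param_mutation_proba` and
# `random_sampling_proba` each accept a function of the generation number, which
# is applied from generation 1 onward (generation 0 is created by random
# sampling). Below is a simple decaying crossover schedule; the constants are
# arbitrary.
def _example_decaying_crossover(generation):
    # Start near 0.5 and decay geometrically, never dropping below 0.1.
    return max(0.1, 0.5 * (0.9 ** generation))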
class RandomoptCV(BaseSearcher):
    """
    Cross-validation optimizer using random search.

    Parameters
    ----------
    estimator
        scikit-learn estimator like.

    param_distributions: dict.
        Search space.

    scoring: string or sklearn.metrics.make_scorer.
        Evaluation index of search.
        When scoring is None, use the estimator's default scorer and a greater score is better.

    cv: scikit-learn cross-validator or int(number of folds), default=5.
        Cross validation setting.

    max_iter: int, default=32.
        Number of search iterations.

    random_state: int or None, default=None.
        The seed used by the random number generator.

    n_jobs: int, default=1.
        Number of jobs to run in parallel.

    pre_dispatch: int or string, default="2*n_jobs".
        Controls the number of jobs that get dispatched during parallel execution.

    verbose: int(0, 1 or 2), default=0.
        Controls the verbosity.

        0: don't display status.

        1: display status by stdout.

        2: display status by graph.

    logdir: str or None, default=None.
        Path of the directory in which log files are saved.
        When logdir is None, the log is not saved.

        [directory structure]
        logdir
        |-cv_results
        | |-{model_id}.csv : search log
        | ...
        |-cv_results_graph
        | |-{model_id}.html : search log(graph)
        | ...
        |-estimators_{model_id}
          |-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on fold train data
          ...
          |-{model_id}_index{search count}_test.pkl : an estimator fitted on whole train data

    save_estimator: int, default=0.
        Estimator save setting.

        0: An estimator is not saved.

        1: An estimator fitted on fold train data is saved per cv-fold.

        2: In addition to 1, an estimator fitted on whole train data is saved per cv.

    saver: str or function, default="sklearn".
        Estimator's saver.

        * "sklearn": use sklearn.externals.joblib.dump. Basically for scikit-learn.

        * function: function whose variables are the model class and the save path.

        Examples
        --------
        >>> def saver(model, path):
        ...     save_model(model, path+".h5")

    model_id: str or None, default=None.
        This is used in the log filename.
        When model_id is None, it is generated from the date and time.

    cloner: str or function, default="sklearn".
        Estimator's cloner.

        * "sklearn": try sklearn.base.clone, fall back to copy.deepcopy. Basically for scikit-learn.

        * function: function whose variable is the model.

        Examples
        --------
        >>> def cloner(model):
        ...     return clone_model(model)

    refit: bool, default=True.
        Refit an estimator using the best found parameters on the whole train data(=X).

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search.

    best_score_ : float
        Cross-validated score of the best_estimator.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.
    """
    def __init__(self, estimator, param_distributions, scoring=None, cv=5,
                 max_iter=32, random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
                 verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
                 cloner="sklearn", refit=True):
        # Random search reuses the GA backend: see fit, where gamin is run with
        # pure random sampling.
        super().__init__(estimator=estimator, param_distributions=param_distributions,
                         scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                         verbose=verbose, logdir=logdir, save_estimator=save_estimator,
                         saver=saver, model_id=model_id, cloner=cloner, refit=refit,
                         backend="gaopt")
        self.max_iter = max_iter
        if random_state is None:
            self.random_state = random_state
        else:
            # Stored as an int because fit seeds the global generator via
            # numpy.random.seed, which does not accept a RandomState instance.
            self.random_state = int(random_state)
        self.search_algo = "randomopt"
    def fit(self, X, y=None, validation_data=None, groups=None, feature_groups=None, min_n_features=2):
        """
        Run fit.

        Parameters
        ----------
        X: numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
            Features. Detail depends on estimator.

        y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
            Target variable. Detail depends on estimator.

        validation_data: tuple(X, y) or None, default=None.
            Data to compute the validation score. Detail depends on estimator.
            When validation_data is None, the validation score is not computed.

        groups: array-like, shape = (n_samples,) or None, default=None.
            Group labels for the samples used while splitting the dataset into
            train/test set. (Input of the scikit-learn cross-validator.)

        feature_groups: array-like, shape = (n_features,) or None, default=None.
            Group labels for the features used during feature selection.
            When feature_groups is None, feature selection is not run.
            When a feature_groups value is -1, that group's features are always used.

        min_n_features: int, default=2.
            When the number of X's feature columns is less than min_n_features,
            return search failure.
            e.g. If the estimator has a column-sampling function, use this option
            to avoid X becoming too small and raising an error.
        """
        X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(
            X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
        np.random.seed(self.random_state)
        obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups,
                         feature_axis=BaseSearcher.feature_axis,
                         estimator=self.estimator, scoring=self.scoring, cv=cv,
                         param_distributions=param_distributions, backend=self.backend,
                         failedscore=np.nan, saver=self.saver, cloner=self._cloner,
                         score_summarizer=BaseSearcher.score_summarizer,
                         Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs,
                         pre_dispatch=self.pre_dispatch, cvsummarizer=self._cvs,
                         save_estimator=self.save_estimator, min_n_features=min_n_features)

        try:
            # Random search: a degenerate GA in which every individual is
            # created by random sampling.
            gamin(obj, param_distributions, max_iter=self.max_iter,
                  iter_pergeneration=1, param_crossover_proba=0,
                  param_mutation_proba=0, random_sampling_proba=1,
                  cvsummarizer=self._cvs)
        except KeyboardInterrupt:
            pass

        self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
                           best_params=self._cvs.best_params_,
                           best_score=self._cvs.best_score_)
        return self
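
# --- Design note (illustrative) ------------------------------------------------
# As the fit body above shows, RandomoptCV is a degenerate genetic algorithm: a
# population of one in which every individual is created by random sampling
# (random_sampling_proba=1, crossover and mutation disabled). The helper below
# makes that equivalence explicit; it matches RandomoptCV's search behaviour up
# to RNG state, and is an observation, not a documented guarantee.
def _random_search_as_degenerate_ga(estimator, param_distributions, max_iter=32):
    return GAoptCV(estimator, param_distributions, max_iter=max_iter,
                   iter_pergeneration=1, param_crossover_proba=0,
                   param_mutation_proba=0, random_sampling_proba=1)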