import numpy as np
from hyperopt import fmin, tpe, hp
from GPyOpt.methods import BayesianOptimization
from ._base import BaseSearcher, fit_and_score, mk_feature_select_index, mk_objfunc
from ._ga import gamin
from ..utils._base import compress
from ..utils._logger import CVSummarizer, NoteBookVisualizer
class SimpleoptCV:
"""
Each cross validation optimizer class's wrapper.
This class allow unified handling in different type backend.
For each backend optimizer class, refer to each class`s page.
Parameters
----------
estimator
scikit-learn estimator like.
param_distributions: dict.
Search space.
scoring: string or sklearn.metrics.make_scorer.
Evaluation metric for the search.
When scoring is None, the estimator's default scorer is used, and a greater score is considered better.
cv: scikit-learn cross-validator or int(number of folds), default=5.
Cross validation setting.
max_iter: int, default=32.
Number of search iterations.
random_state: int or None, default=None.
The seed used by the random number generator.
n_jobs: int, default=1.
Number of jobs to run in parallel.
pre_dispatch: int or string, default="2*n_jobs".
Controls the number of jobs that get dispatched during parallel execution.
verbose: int(0, 1 or 2), default=0.
Controls the verbosity.
0: don't display status.
1: display status by stdout.
2: display status by graph.
logdir: str or None, default=None.
Path of directory to save log file.
When logdir is None, log is not saved.
[directory structure]
logdir
|-cv_results
| |-{model_id}.csv : search log
| ...
|-cv_results_graph
| |-{model_id}.html : search log(graph)
| ...
|-estimators_{model_id}
|-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on a fold's training data
...
|-{model_id}_index{search count}_test.pkl : an estimator fitted on the whole training data
save_estimator: int, default=0.
Estimator save setting.
0: An estimator is not saved.
1: An estimator fitted on each fold's training data is saved per cv-fold.
2: In addition to 1, an estimator fitted on the whole training data is saved per cv.
saver: str or function, default="sklearn".
Estimator's saver.
* `sklearn`: use `sklearn.externals.joblib.dump`. Basically for scikit-learn.
* function: a function whose arguments are the model instance and the save path.
Examples
--------
>>> def saver(model, path):
...     save_model(model, path + ".h5")
model_id: str or None, default=None.
Used in the log filenames.
When model_id is None, it is generated from the current datetime.
cloner: str or function, default="sklearn".
Estimator's cloner.
* `sklearn`: try `sklearn.base.clone`, falling back to `copy.deepcopy`. Basically for scikit-learn.
* function: a function whose argument is the model instance.
Examples
--------
>>> def cloner(model):
...     return clone_model(model)
refit: bool, default=True.
Refit an estimator using the best parameters found on all the training data (=X).
backend: str, default="hyperopt".
Backend optimizer. The following backends are supported.
* `hyperopt`: Sequential Model Based Global Optimization
* `bayesopt`: Bayesian Optimization
* `gaopt`: Genetic Algorithm
* `randomopt`: Random Search
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
best_estimator_ : estimator or dict
Estimator that was chosen by the search.
best_score_ : float
Cross-validated score of the best_estimator.
best_params_ : dict
Parameter setting that gave the best results on the hold out data.
"""
def __init__(self, estimator, param_distributions,
scoring=None, cv=5, max_iter=32,
random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
cloner="sklearn", refit=True, backend="hyperopt", **kwargs):
if backend == "hyperopt":
self.optcv = HyperoptCV(estimator, param_distributions,
scoring=scoring, cv=cv, max_iter=max_iter, random_state=random_state,
n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose, logdir=logdir,
save_estimator=save_estimator, saver=saver, model_id=model_id,
cloner=cloner, refit=refit,
**kwargs)
elif backend == "bayesopt":
self.optcv = BayesoptCV(estimator, param_distributions,
scoring=scoring, cv=cv, max_iter=max_iter, random_state=random_state,
n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose, logdir=logdir,
save_estimator=save_estimator, saver=saver, model_id=model_id, refit=refit,
cloner=cloner,
**kwargs)
elif backend == "gaopt":
self.optcv = GAoptCV(estimator, param_distributions,
scoring=scoring, cv=cv, max_iter=max_iter, random_state=random_state,
n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose, logdir=logdir,
save_estimator=save_estimator, saver=saver, model_id=model_id, refit=refit,
cloner=cloner,
**kwargs)
elif backend == "randomopt":
self.optcv = RandomoptCV(estimator, param_distributions,
scoring=scoring, cv=cv, max_iter=max_iter, random_state=random_state,
n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose, logdir=logdir,
save_estimator=save_estimator, saver=saver, model_id=model_id, refit=refit,
cloner=cloner,
**kwargs)
else:
raise ValueError("`backend` " + str(backend) + " is not supported.")
self.backend = backend
def __getattr__(self, name):
return getattr(self.optcv, name)
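# ------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): selecting a
# backend through SimpleoptCV. The search-space format follows the
# chosen backend; with backend="hyperopt" it is a dict of hyperopt
# `hp` expressions (imported above). The dataset and estimator below
# are assumptions for the example.
#
# >>> from sklearn.datasets import load_iris
# >>> from sklearn.ensemble import RandomForestClassifier
# >>> X, y = load_iris(return_X_y=True)
# >>> param_distributions = {
# ...     "n_estimators": hp.choice("n_estimators", [10, 50, 100]),
# ...     "max_features": hp.uniform("max_features", 0.1, 1.0),
# ...     }
# >>> opt = SimpleoptCV(RandomForestClassifier(), param_distributions,
# ...                   scoring="accuracy", cv=3, max_iter=16,
# ...                   backend="hyperopt")
# >>> opt.fit(X, y)                                  # doctest: +SKIP
# >>> opt.best_params_, opt.best_score_              # doctest: +SKIP
# ------------------------------------------------------------------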
class HyperoptCV(BaseSearcher):
"""
Cross-validation optimizer using Hyperopt (Sequential Model-Based Global Optimization).
Parameters
----------
estimator
scikit-learn estimator like.
param_distributions: dict.
Search space.
scoring: string or sklearn.metrics.make_scorer.
Evaluation metric for the search.
When scoring is None, the estimator's default scorer is used, and a greater score is considered better.
cv: scikit-learn cross-validator or int(number of folds), default=5.
Cross validation setting.
max_iter: int, default=32.
Number of search iterations.
random_state: int or None, default=None.
The seed used by the random number generator.
n_jobs: int, default=1.
Number of jobs to run in parallel.
pre_dispatch: int or string, default="2*n_jobs".
Controls the number of jobs that get dispatched during parallel execution.
verbose: int(0, 1 or 2), default=0.
Controls the verbosity.
0: don't display status.
1: display status by stdout.
2: display status by graph.
logdir: str or None, default=None.
Path of directory to save log file.
When logdir is None, log is not saved.
[directory structure]
logdir
|-cv_results
| |-{model_id}.csv : search log
| ...
|-cv_results_graph
| |-{model_id}.html : search log(graph)
| ...
|-estimators_{model_id}
|-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on a fold's training data
...
|-{model_id}_index{search count}_test.pkl : an estimator fitted on the whole training data
save_estimator: int, default=0.
Estimator save setting.
0: An estimator is not saved.
1: An estimator fitted on each fold's training data is saved per cv-fold.
2: In addition to 1, an estimator fitted on the whole training data is saved per cv.
saver: str or function, default="sklearn".
Estimator's saver.
* `sklearn`: use `sklearn.externals.joblib.dump`. Basically for scikit-learn.
* function: a function whose arguments are the model instance and the save path.
Examples
--------
>>> def saver(model, path):
...     save_model(model, path + ".h5")
model_id: str or None, default=None.
Used in the log filenames.
When model_id is None, it is generated from the current datetime.
cloner: str or function, default="sklearn".
Estimator's cloner.
* `sklearn`: try `sklearn.base.clone`, falling back to `copy.deepcopy`. Basically for scikit-learn.
* function: a function whose argument is the model instance.
Examples
--------
>>> def cloner(model):
...     return clone_model(model)
refit: bool, default=True.
Refit an estimator using the best parameters found on all the training data (=X).
algo: hyperopt search algorithm class, default=tpe.suggest.
Hyperopt's parameter. Search algorithm.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
best_estimator_ : estimator or dict
Estimator that was chosen by the search.
best_score_ : float
Cross-validated score of the best_estimator.
best_params_ : dict
Parameter setting that gave the best results on the hold out data.
"""
def __init__(self, estimator, param_distributions,
scoring=None, cv=5, max_iter=32,
random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None, refit=True,
cloner="sklearn", algo=tpe.suggest):
super().__init__(estimator=estimator, param_distributions=param_distributions,
scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose, logdir=logdir, save_estimator=save_estimator, saver=saver,
model_id=model_id, refit=refit, cloner=cloner, backend="hyperopt")
self.max_iter = max_iter
self.algo = algo
if random_state is None:
self.random_state = random_state
else:
self.random_state = np.random.RandomState(int(random_state))
self.search_algo = "hyperopt"
def fit(self, X, y=None, validation_data=None, groups=None,
feature_groups=None, min_n_features=2):
"""
Run fit.
Parameters
----------
X : numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
Features. Detail depends on the estimator.
y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
Target variable. Detail depends on the estimator.
validation_data: tuple(X, y) or None, default=None.
Data used to compute the validation score. Detail depends on the estimator.
When validation_data is None, the validation score is not computed.
groups: array-like, shape = (n_samples,) or None, default=None.
Group labels for the samples used while splitting the dataset into train/test set.
(input of scikit-learn cross-validator)
feature_groups: array-like, shape = (n_features,) or None, default=None.
Group labels for the features used during feature selection.
When feature_groups is None, feature selection is not run.
When a feature_groups value is -1, that group's features are always used.
min_n_features: int, default=2.
When the number of X's feature columns is less than min_n_features, the search returns a failure.
e.g. If the estimator has a column-sampling function, use this option to avoid X becoming too small and raising an error.
"""
X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups, feature_axis=BaseSearcher.feature_axis,
estimator=self.estimator, scoring=self.scoring, cv=cv,
param_distributions=param_distributions, backend=self.backend, failedscore=np.nan,
saver=self.saver, cloner=self._cloner, score_summarizer=BaseSearcher.score_summarizer,
Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch,
cvsummarizer=self._cvs, save_estimator=self.save_estimator, min_n_features=min_n_features)
try:
fmin(obj, param_distributions, algo=self.algo, max_evals=self.max_iter, rstate=self.random_state)
except KeyboardInterrupt:
pass
self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
best_params=self._cvs.best_params_, best_score=self._cvs.best_score_)
return self
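# ------------------------------------------------------------------
# Sketch (assumptions: scikit-learn is available; Xtrain/ytrain and
# Xvalid/yvalid are pre-split numpy arrays): a hyperopt-style search
# space with a hold-out validation set, plus feature selection via
# feature_groups (label -1 marks features that are always kept).
#
# >>> from sklearn.linear_model import LogisticRegression
# >>> space = {"C": hp.loguniform("C", -4, 4)}
# >>> hcv = HyperoptCV(LogisticRegression(), space, scoring="accuracy",
# ...                  cv=5, max_iter=32, random_state=0)
# >>> feature_groups = [-1, -1, 0, 0, 1, 1]  # one label per feature column
# >>> hcv.fit(Xtrain, ytrain, validation_data=(Xvalid, yvalid),
# ...         feature_groups=feature_groups)        # doctest: +SKIP
# ------------------------------------------------------------------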
class BayesoptCV(BaseSearcher):
"""
Cross-validation optimizer using GPyOpt.BayesianOptimization.
Parameters
----------
estimator
scikit-learn estimator like.
param_distributions: dict.
Search space.
scoring: string or sklearn.metrics.make_scorer.
Evaluation metric for the search.
When scoring is None, the estimator's default scorer is used, and a greater score is considered better.
cv: scikit-learn cross-validator or int(number of folds), default=5.
Cross validation setting.
max_iter: int, default=32.
Number of search iterations.
random_state: int or None, default=None.
The seed used by the random number generator.
n_jobs: int, default=1.
Number of jobs to run in parallel.
pre_dispatch: int or string, default="2*n_jobs".
Controls the number of jobs that get dispatched during parallel execution.
verbose: int(0, 1 or 2), default=0.
Controls the verbosity.
0: don't display status.
1: display status by stdout.
2: display status by graph.
logdir: str or None, default=None.
Path of directory to save log file.
When logdir is None, log is not saved.
[directory structure]
logdir
|-cv_results
| |-{model_id}.csv : search log
| ...
|-cv_results_graph
| |-{model_id}.html : search log(graph)
| ...
|-estimators_{model_id}
|-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on a fold's training data
...
|-{model_id}_index{search count}_test.pkl : an estimator fitted on the whole training data
save_estimator: int, default=0.
Estimator save setting.
0: An estimator is not saved.
1: An estimator fitted on each fold's training data is saved per cv-fold.
2: In addition to 1, an estimator fitted on the whole training data is saved per cv.
saver: str or function, default="sklearn".
Estimator's saver.
* `sklearn`: use `sklearn.externals.joblib.dump`. Basically for scikit-learn.
* function: a function whose arguments are the model instance and the save path.
Examples
--------
>>> def saver(model, path):
...     save_model(model, path + ".h5")
model_id: str or None, default=None.
Used in the log filenames.
When model_id is None, it is generated from the current datetime.
cloner: str or function, default="sklearn".
Estimator's cloner.
* `sklearn`: try `sklearn.base.clone`, falling back to `copy.deepcopy`. Basically for scikit-learn.
* function: a function whose argument is the model instance.
Examples
--------
>>> def cloner(model):
...     return clone_model(model)
refit: bool, default=True.
Refit an estimator using the best parameters found on all the training data (=X).
max_time: float, default=numpy.inf.
GPyOpt's parameter. Maximum exploration horizon in seconds.
model_type: str, default="GP".
GPyOpt's parameter. Type of model to use as surrogate.
* 'GP', standard Gaussian process.
* 'GP_MCMC', Gaussian process with prior in the hyper-parameters.
* 'sparseGP', sparse Gaussian process.
* 'warpedGP', warped Gaussian process.
* 'InputWarpedGP', input warped Gaussian process.
* 'RF', random forest (scikit-learn).
initial_params: numpy.array or None, default=None.
GPyOpt's parameter. Initial inputs of the GPy model.
initial_score: numpy.array or None, default=None.
GPyOpt's parameter. Initial outputs of the GPy model.
initial_design_numdata: int, default=5.
GPyOpt's parameter. Number of initial points that are collected jointly before the optimization starts running.
initial_design_type: str, default="random".
GPyOpt's parameter. Type of initial design.
* 'random', to collect points in random locations.
* 'latin', to collect points in a Latin hypercube (discrete variables are sampled randomly).
acquisition_type: str, default="EI".
GPyOpt's parameter. Type of acquisition function to use.
* 'EI', expected improvement.
* 'EI_MCMC', integrated expected improvement (requires GP_MCMC model).
* 'MPI', maximum probability of improvement.
* 'MPI_MCMC', maximum probability of improvement (requires GP_MCMC model).
* 'LCB', GP-lower confidence bound.
* 'LCB_MCMC', integrated GP-lower confidence bound (requires GP_MCMC model).
normalize_Y: bool, default=True.
GPyOpt's parameter. Whether to normalize the outputs before performing any optimization.
exact_feval: bool, default=False.
GPyOpt's parameter. Whether the outputs are exact.
acquisition_optimizer_type: str, default="lbfgs".
GPyOpt's parameter. Type of optimizer used to maximize the acquisition function.
* 'lbfgs': L-BFGS.
* 'DIRECT': Dividing Rectangles.
* 'CMA': covariance matrix adaptation.
model_update_interval: int, default=1.
GPyOpt's parameter. Interval of collected observations after which the model is updated.
evaluator_type: str, default="sequential".
GPyOpt's parameter. Determines how the objective is evaluated (all methods are equivalent when the batch size is one).
* 'sequential', sequential evaluations.
* 'random': synchronous batch that selects the first element as in a sequential policy and the rest randomly.
* 'local_penalization': batch method proposed in (Gonzalez et al. 2016).
* 'thompson_sampling': batch method using Thompson sampling.
batch_size: int, default=1.
GPyOpt's parameter. Size of the batch in which the objective is evaluated.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
best_estimator_ : estimator or dict
Estimator that was chosen by the search.
best_score_ : float
Cross-validated score of the best_estimator.
best_params_ : dict
Parameter setting that gave the best results on the hold out data.
"""
def __init__(self, estimator, param_distributions,
scoring=None, cv=5, max_iter=32,
random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
cloner="sklearn", refit=True,
max_time=np.inf, model_type="GP", initial_params=None, initial_score=None,
initial_design_numdata=5, initial_design_type="random",
acquisition_type="EI", normalize_Y=True, exact_feval=False,
acquisition_optimizer_type="lbfgs", model_update_interval=1,
evaluator_type="sequential", batch_size=1):
super().__init__(estimator=estimator, param_distributions=param_distributions,
scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose, logdir=logdir, save_estimator=save_estimator, saver=saver,
model_id=model_id, cloner=cloner, refit=refit, backend="bayesopt")
self.random_state = random_state
self.max_iter = max_iter
self.max_time = max_time
self.model_type = model_type
self.initial_params = initial_params
self.initial_score = initial_score
self.initial_design_numdata = initial_design_numdata
self.initial_design_type = initial_design_type
self.acquisition_type = acquisition_type
self.normalize_Y = normalize_Y
self.exact_feval = exact_feval
self.acquisition_optimizer_type = acquisition_optimizer_type
self.model_update_interval = model_update_interval
self.evaluator_type = evaluator_type
self.batch_size = batch_size
self.failedscore = None
self.search_algo = "bayesopt"
def fit(self, X, y=None, validation_data=None, groups=None,
feature_groups=None, min_n_features=2):
"""
Run fit.
Parameters
----------
X : numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
Features. Detail depends on the estimator.
y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
Target variable. Detail depends on the estimator.
validation_data: tuple(X, y) or None, default=None.
Data used to compute the validation score. Detail depends on the estimator.
When validation_data is None, the validation score is not computed.
groups: array-like, shape = (n_samples,) or None, default=None.
Group labels for the samples used while splitting the dataset into train/test set.
(input of scikit-learn cross-validator)
feature_groups: array-like, shape = (n_features,) or None, default=None.
Group labels for the features used during feature selection.
When feature_groups is None, feature selection is not run.
When a feature_groups value is -1, that group's features are always used.
min_n_features: int, default=2.
When the number of X's feature columns is less than min_n_features, the search returns a failure.
e.g. If the estimator has a column-sampling function, use this option to avoid X becoming too small and raising an error.
"""
X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
np.random.seed(self.random_state)
if self.failedscore is None:
# If a search trial fails, a random score is returned instead.
# This random score is fixed at the first fit.
self.failedscore = self._random_scoring(X, y)
obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups, feature_axis=BaseSearcher.feature_axis,
estimator=self.estimator, scoring=self.scoring, cv=cv,
param_distributions=param_distributions, backend=self.backend, failedscore=self.failedscore,
saver=self.saver, cloner=self._cloner, score_summarizer=BaseSearcher.score_summarizer,
Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch,
cvsummarizer=self._cvs, save_estimator=self.save_estimator, min_n_features=min_n_features)
self.opt = BayesianOptimization(obj, domain=param_distributions, constraints=None, cost_withGradients=None,
model_type=self.model_type, X=self.initial_params, Y=self.initial_score,
initial_design_numdata=self.initial_design_numdata,
initial_design_type=self.initial_design_type,
acquisition_type=self.acquisition_type, normalize_Y=self.normalize_Y,
exact_feval=self.exact_feval, acquisition_optimizer_type=self.acquisition_optimizer_type,
model_update_interval=self.model_update_interval, evaluator_type=self.evaluator_type,
batch_size=self.batch_size, num_cores=1, verbosity=False, verbosity_model=False,
maximize=False, de_duplication=False)
try:
self.opt.run_optimization(max_iter=self.max_iter, max_time=self.max_time)
except KeyboardInterrupt:
pass
self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
best_params=self._cvs.best_params_, best_score=self._cvs.best_score_)
return self
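# ------------------------------------------------------------------
# Sketch: warm-starting the GPyOpt surrogate with already-evaluated
# points via `initial_params` / `initial_score` (documented above as
# the initial inputs and outputs of the GPy model). Shapes follow
# GPyOpt's 2-D array convention, (n_points, n_dims) and (n_points, 1);
# `estimator` and `param_distributions` here are assumptions.
#
# >>> X0 = np.array([[0.1], [1.0], [10.0]])    # three evaluated settings
# >>> Y0 = np.array([[0.30], [0.22], [0.41]])  # their losses (minimized)
# >>> bcv = BayesoptCV(estimator, param_distributions, max_iter=16,
# ...                  initial_params=X0, initial_score=Y0,
# ...                  acquisition_type="LCB")       # doctest: +SKIP
# >>> bcv.fit(X, y)                                 # doctest: +SKIP
# ------------------------------------------------------------------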
class GAoptCV(BaseSearcher):
"""
Cross-validation optimizer using a genetic algorithm.
Parameters
----------
estimator
scikit-learn estimator like.
param_distributions: dict.
Search space.
scoring: string or sklearn.metrics.make_scorer.
Evaluation metric for the search.
When scoring is None, the estimator's default scorer is used, and a greater score is considered better.
cv: scikit-learn cross-validator or int(number of folds), default=5.
Cross validation setting.
max_iter: int, default=32.
Number of search iterations.
random_state: int or None, default=None.
The seed used by the random number generator.
n_jobs: int, default=1.
Number of jobs to run in parallel.
pre_dispatch: int or string, default="2*n_jobs".
Controls the number of jobs that get dispatched during parallel execution.
verbose: int(0, 1 or 2), default=0.
Controls the verbosity.
0: don't display status.
1: display status by stdout.
2: display status by graph.
logdir: str or None, default=None.
Path of directory to save log file.
When logdir is None, log is not saved.
[directory structure]
logdir
|-cv_results
| |-{model_id}.csv : search log
| ...
|-cv_results_graph
| |-{model_id}.html : search log(graph)
| ...
|-estimators_{model_id}
|-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on a fold's training data
...
|-{model_id}_index{search count}_test.pkl : an estimator fitted on the whole training data
save_estimator: int, default=0.
Estimator save setting.
0: An estimator is not saved.
1: An estimator fitted on each fold's training data is saved per cv-fold.
2: In addition to 1, an estimator fitted on the whole training data is saved per cv.
saver: str or function, default="sklearn".
Estimator's saver.
* `sklearn`: use `sklearn.externals.joblib.dump`. Basically for scikit-learn.
* function: a function whose arguments are the model instance and the save path.
Examples
--------
>>> def saver(model, path):
...     save_model(model, path + ".h5")
model_id: str or None, default=None.
Used in the log filenames.
When model_id is None, it is generated from the current datetime.
cloner: str or function, default="sklearn".
Estimator's cloner.
* `sklearn`: try `sklearn.base.clone`, falling back to `copy.deepcopy`. Basically for scikit-learn.
* function: a function whose argument is the model instance.
Examples
--------
>>> def cloner(model):
...     return clone_model(model)
refit: bool, default=True.
Refit an estimator using the best parameters found on all the training data (=X).
iter_pergeneration: int, default=8.
Genetic algorithm's parameter. Number of iterations per generation (this corresponds to the population size).
param_crossover_proba: float or function, default=0.5.
Genetic algorithm's parameter. Probability that a given parameter takes the other parent's value.
If this value is 0 or 1, parameters are not changed by crossover.
A function whose argument is the generation number can also be passed.
Generation numbering starts at 0, but the generation-0 population is created by random sampling, so such a function is used from generation 1 onward.
Examples
--------
>>> def f(generation):
...     return 0.5 / generation
param_mutation_proba: float or function, default=0.01.
Genetic algorithm's parameter. Probability that a given parameter is mutated.
A function whose argument is the generation number can also be passed.
random_sampling_proba: float or function, default=0.01.
Genetic algorithm's parameter. Probability that an individual in a given generation is created by random sampling.
A function whose argument is the generation number can also be passed.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
best_estimator_ : estimator or dict
Estimator that was chosen by the search.
best_score_ : float
Cross-validated score of the best_estimator.
best_params_ : dict
Parameter setting that gave the best results on the hold out data.
"""
def __init__(self, estimator, param_distributions,
scoring=None, cv=5, max_iter=32,
random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
cloner="sklearn", refit=True,
iter_pergeneration=8, param_crossover_proba=0.5, param_mutation_proba=0.01,
random_sampling_proba=0.01):
super().__init__(estimator=estimator, param_distributions=param_distributions,
scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose, logdir=logdir, save_estimator=save_estimator, saver=saver,
model_id=model_id, cloner=cloner, refit=refit, backend="gaopt")
self.max_iter = max_iter
self.iter_pergeneration = iter_pergeneration
self.param_crossover_proba = param_crossover_proba
self.param_mutation_proba = param_mutation_proba
self.random_sampling_proba = random_sampling_proba
if random_state is None:
self.random_state = random_state
else:
# np.random.seed (used in fit) requires an int seed, not a RandomState instance.
self.random_state = int(random_state)
self.search_algo = "gaopt"
def fit(self, X, y=None, validation_data=None, groups=None,
feature_groups=None, min_n_features=2):
"""
Run fit.
Parameters
----------
X : numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
Features. Detail depends on the estimator.
y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
Target variable. Detail depends on the estimator.
validation_data: tuple(X, y) or None, default=None.
Data used to compute the validation score. Detail depends on the estimator.
When validation_data is None, the validation score is not computed.
groups: array-like, shape = (n_samples,) or None, default=None.
Group labels for the samples used while splitting the dataset into train/test set.
(input of scikit-learn cross-validator)
feature_groups: array-like, shape = (n_features,) or None, default=None.
Group labels for the features used during feature selection.
When feature_groups is None, feature selection is not run.
When a feature_groups value is -1, that group's features are always used.
min_n_features: int, default=2.
When the number of X's feature columns is less than min_n_features, the search returns a failure.
e.g. If the estimator has a column-sampling function, use this option to avoid X becoming too small and raising an error.
"""
X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
np.random.seed(self.random_state)
obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups, feature_axis=BaseSearcher.feature_axis,
estimator=self.estimator, scoring=self.scoring, cv=cv,
param_distributions=param_distributions, backend=self.backend, failedscore=np.nan,
saver=self.saver, cloner=self._cloner, score_summarizer=BaseSearcher.score_summarizer,
Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch,
cvsummarizer=self._cvs, save_estimator=self.save_estimator, min_n_features=min_n_features)
try:
gamin(obj, param_distributions, max_iter=self.max_iter, iter_pergeneration=self.iter_pergeneration,
param_crossover_proba=self.param_crossover_proba, param_mutation_proba=self.param_mutation_proba,
random_sampling_proba=self.random_sampling_proba, cvsummarizer=self._cvs)
except KeyboardInterrupt:
pass
self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
best_params=self._cvs.best_params_, best_score=self._cvs.best_score_)
return self
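# ------------------------------------------------------------------
# Sketch: the GA probabilities accept either a float or a function of
# the generation number (applied from generation 1 onward, as noted in
# the class docstring). Example of annealing the mutation probability
# over generations; `estimator` and `param_distributions` here are
# assumptions.
#
# >>> def mutation_schedule(generation):
# ...     # halve the mutation probability each generation, floored at 1%
# ...     return max(0.01, 0.2 / (2 ** generation))
# >>> gcv = GAoptCV(estimator, param_distributions, max_iter=64,
# ...               iter_pergeneration=8,
# ...               param_mutation_proba=mutation_schedule)  # doctest: +SKIP
# >>> gcv.fit(X, y)                                 # doctest: +SKIP
# ------------------------------------------------------------------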
class RandomoptCV(BaseSearcher):
"""
Cross-validation optimizer using random search.
Parameters
----------
estimator
scikit-learn estimator like.
param_distributions: dict.
Search space.
scoring: string or sklearn.metrics.make_scorer.
Evaluation metric for the search.
When scoring is None, the estimator's default scorer is used, and a greater score is considered better.
cv: scikit-learn cross-validator or int(number of folds), default=5.
Cross validation setting.
max_iter: int, default=32.
Number of search iterations.
random_state: int or None, default=None.
The seed used by the random number generator.
n_jobs: int, default=1.
Number of jobs to run in parallel.
pre_dispatch: int or string, default="2*n_jobs".
Controls the number of jobs that get dispatched during parallel execution.
verbose: int(0, 1 or 2), default=0.
Controls the verbosity.
0: don't display status.
1: display status by stdout.
2: display status by graph.
logdir: str or None, default=None.
Path of directory to save log file.
When logdir is None, log is not saved.
[directory structure]
logdir
|-cv_results
| |-{model_id}.csv : search log
| ...
|-cv_results_graph
| |-{model_id}.html : search log(graph)
| ...
|-estimators_{model_id}
|-{model_id}_index{search count}_split{fold count}.pkl : an estimator fitted on a fold's training data
...
|-{model_id}_index{search count}_test.pkl : an estimator fitted on the whole training data
save_estimator: int, default=0.
Estimator save setting.
0: An estimator is not saved.
1: An estimator fitted on each fold's training data is saved per cv-fold.
2: In addition to 1, an estimator fitted on the whole training data is saved per cv.
saver: str or function, default="sklearn".
Estimator's saver.
* `sklearn`: use `sklearn.externals.joblib.dump`. Basically for scikit-learn.
* function: a function whose arguments are the model instance and the save path.
Examples
--------
>>> def saver(model, path):
...     save_model(model, path + ".h5")
model_id: str or None, default=None.
Used in the log filenames.
When model_id is None, it is generated from the current datetime.
cloner: str or function, default="sklearn".
Estimator's cloner.
* `sklearn`: try `sklearn.base.clone`, falling back to `copy.deepcopy`. Basically for scikit-learn.
* function: a function whose argument is the model instance.
Examples
--------
>>> def cloner(model):
...     return clone_model(model)
refit: bool, default=True.
Refit an estimator using the best parameters found on all the training data (=X).
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
best_estimator_ : estimator or dict
Estimator that was chosen by the search.
best_score_ : float
Cross-validated score of the best_estimator.
best_params_ : dict
Parameter setting that gave the best results on the hold out data.
"""
def __init__(self, estimator, param_distributions,
scoring=None, cv=5, max_iter=32,
random_state=None, n_jobs=1, pre_dispatch="2*n_jobs",
verbose=0, logdir=None, save_estimator=0, saver="sklearn", model_id=None,
cloner="sklearn", refit=True):
super().__init__(estimator=estimator, param_distributions=param_distributions,
scoring=scoring, cv=cv, n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose, logdir=logdir, save_estimator=save_estimator, saver=saver,
model_id=model_id, cloner=cloner, refit=refit, backend="gaopt")  # random search reuses the GA backend
self.max_iter = max_iter
if random_state is None:
self.random_state = random_state
else:
# np.random.seed (used in fit) requires an int seed, not a RandomState instance.
self.random_state = int(random_state)
self.search_algo = "randomopt"
def fit(self, X, y=None, validation_data=None, groups=None,
feature_groups=None, min_n_features=2):
"""
Run fit.
Parameters
----------
X : numpy.array, pandas.DataFrame or scipy.sparse, shape(axis=0) = (n_samples)
Features. Detail depends on the estimator.
y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None, default=None.
Target variable. Detail depends on the estimator.
validation_data: tuple(X, y) or None, default=None.
Data used to compute the validation score. Detail depends on the estimator.
When validation_data is None, the validation score is not computed.
groups: array-like, shape = (n_samples,) or None, default=None.
Group labels for the samples used while splitting the dataset into train/test set.
(input of scikit-learn cross-validator)
feature_groups: array-like, shape = (n_features,) or None, default=None.
Group labels for the features used during feature selection.
When feature_groups is None, feature selection is not run.
When a feature_groups value is -1, that group's features are always used.
min_n_features: int, default=2.
When the number of X's feature columns is less than min_n_features, the search returns a failure.
e.g. If the estimator has a column-sampling function, use this option to avoid X becoming too small and raising an error.
"""
X, y, Xvalid, yvalid, cv, param_distributions = self._preproc_fit(X=X, y=y, validation_data=validation_data, feature_groups=feature_groups)
np.random.seed(self.random_state)
obj = mk_objfunc(X=X, y=y, groups=groups, feature_groups=feature_groups, feature_axis=BaseSearcher.feature_axis,
estimator=self.estimator, scoring=self.scoring, cv=cv,
param_distributions=param_distributions, backend=self.backend, failedscore=np.nan,
saver=self.saver, cloner=self._cloner, score_summarizer=BaseSearcher.score_summarizer,
Xvalid=Xvalid, yvalid=yvalid, n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch,
cvsummarizer=self._cvs, save_estimator=self.save_estimator, min_n_features=min_n_features)
try:
gamin(obj, param_distributions, max_iter=self.max_iter, iter_pergeneration=1,
param_crossover_proba=0, param_mutation_proba=0,
random_sampling_proba=1, cvsummarizer=self._cvs)
except KeyboardInterrupt:
pass
self._postproc_fit(X=X, y=y, feature_groups=feature_groups,
best_params=self._cvs.best_params_, best_score=self._cvs.best_score_)
return self
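# ------------------------------------------------------------------
# Note: as the gamin call above shows, random search is implemented by
# reusing the GA engine with one individual per generation, crossover
# and mutation disabled, and random_sampling_proba=1, so each candidate
# is drawn independently from the search space. Minimal usage sketch
# (`estimator` and `param_distributions` here are assumptions):
#
# >>> rcv = RandomoptCV(estimator, param_distributions,
# ...                   max_iter=32, random_state=0)
# >>> rcv.fit(X, y)                                 # doctest: +SKIP
# ------------------------------------------------------------------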