Source code for cvopt.utils._logoperator

import os, warnings
import pandas as pd, numpy as np

from ._base import chk_Xy, make_loader, to_label
from ..model_selection import _setting as st


def extract_params(logdir, model_id, target_index, feature_groups=None):
    """
    Extract parameters from a cvopt logfile.

    Parameters
    ----------
    logdir: str
        cvopt's log directory path.

    model_id: str
        cvopt's model id.

    target_index: int
        Logfile index (starting from 0). The parameters corresponding to this
        index are extracted.

    feature_groups: array-like, shape = (n_samples,) or None, default=None
        cvopt feature_groups. When feature_groups is None, feature_params and
        feature_select_flag in the returns are None.
        The feature select flag is a bool vector; when a value is True, the
        optimizer recommends using the corresponding column.

    Returns
    -------
    estimator_params: dict
        Estimator parameters of the target model.

    feature_params: dict or None
        Feature parameters of the target model.

    feature_select_flag: numpy array or None
        Feature select flag of the target model.
    """
    logfile = pd.read_csv(os.path.join(logdir, "cv_results", model_id + ".csv"))
    logfile.set_index("index", inplace=True)
    if not logfile.index.is_unique:
        raise Exception("%s index must be unique"
                        % os.path.join(logdir, "cv_results", model_id + ".csv"))

    params = eval(logfile.loc[target_index, "params"])
    estimator_params = dict()
    feature_params = dict()
    feature_select_flag = np.array(feature_groups).astype(str)

    for key in params.keys():
        if st.FEATURE_SELECT_PARAMNAME_PREFIX in key:
            # Feature-selection parameter: map each feature group id to the
            # selection flag recorded in the log.
            feature_params[key] = params[key]
            feature_group_id = key.split(st.FEATURE_SELECT_PARAMNAME_PREFIX)[-1]
            feature_select_flag = np.where(feature_select_flag == feature_group_id,
                                           feature_params[key], feature_select_flag)
        else:
            estimator_params[key] = params[key]

    if feature_groups is None:
        if len(feature_params) > 0:
            warnings.warn("This log file includes feature select settings. "
                          "Please specify feature_groups if necessary.")
        return estimator_params, None, None
    else:
        if len(feature_params) == 0:
            warnings.warn("This log file does not include feature select settings, "
                          "so the return is (estimator_params, None, None).")
            return estimator_params, None, None
        feature_select_flag = np.where(feature_select_flag == "True", True, False)
        return estimator_params, feature_params, feature_select_flag
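
# A minimal usage sketch of extract_params (hedged): the log directory
# "./logs" and the model id "search_0" below are hypothetical placeholders,
# not values defined in this module. It assumes an optimizer run logged four
# columns split into two feature groups.
#
#     estimator_params, feature_params, flag = extract_params(
#         logdir="./logs", model_id="search_0", target_index=0,
#         feature_groups=[0, 0, 1, 1])
#     # `flag` is a bool vector over columns; True means the optimizer
#     # recommends keeping that column.
#     X_selected = X[:, flag]
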
def mk_metafeature(X, y, logdir, model_id, target_index, cv,
                   validation_data=None, feature_groups=None,
                   estimator_method="predict", merge=True, loader="sklearn"):
    """
    Make meta features for stacking (https://mlwave.com/kaggle-ensembling-guide/).

    Parameters
    ----------
    X: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples)
        Features that were used in optimizer training. Details depend on the
        estimator. The meta feature corresponding to X is made by the cross
        validation estimators.

    y: np.ndarray or pd.core.frame.DataFrame, shape(axis=0) = (n_samples) or None
        Target variable that was used in optimizer training. Details depend
        on the estimator.

    logdir: str
        cvopt's log directory path.

    model_id: str
        cvopt's model id.

    target_index: int
        Logfile index (starting from 0). The estimator corresponding to this
        index is used to make the meta feature.

    cv: scikit-learn cross-validator
        Cross validation setting that was used in optimizer training.

    validation_data: tuple(X, y) or None, default=None
        Details depend on the estimator. The meta feature corresponding to
        validation_data is made by the estimator fitted on the whole train data.

    feature_groups: array-like, shape = (n_samples,) or None, default=None
        cvopt feature_groups that were used in optimizer training.

    estimator_method: str, default="predict"
        The estimator method used to make the meta feature.

    merge: bool, default=True
        If True, the per-cv results are merged into a single matrix.

    loader: str or function, default="sklearn"
        Estimator's loader.

        * `sklearn`: use `sklearn.externals.joblib.load`. Basically for scikit-learn.
        * function: function whose argument is the estimator's path.

    Returns
    -------
    X_meta or (X_meta, X_meta_validation_data): np.ndarray or tuple of np.ndarray
        When validation_data is input, a tuple is returned.
    """
    loader = make_loader(loader)
    X = chk_Xy(X, none_error=True, ravel_1d=False, msg_sjt="X")
    y = chk_Xy(y, none_error=False, ravel_1d=True, msg_sjt="y")
    if validation_data is not None:
        Xvalid = chk_Xy(validation_data[0], none_error=False, ravel_1d=False,
                        msg_sjt="Xvalid")
        yvalid = chk_Xy(validation_data[1], none_error=False, ravel_1d=True,
                        msg_sjt="yvalid")

    if feature_groups is not None:
        # Re-apply the feature selection that was chosen for the target model.
        _, _, feature_select_flag = extract_params(logdir=logdir, model_id=model_id,
                                                   target_index=target_index,
                                                   feature_groups=feature_groups)
        X = X[:, feature_select_flag]
        if validation_data is not None:
            Xvalid = Xvalid[:, feature_select_flag]

    X_meta = []
    X_ind = []
    estdir = os.path.join(logdir, "estimators", model_id)
    name_prefix = model_id + "_index" + "{0:05d}".format(target_index)
    # estimator = loader(os.path.join(estdir, name_prefix+"_split"+"{0:02d}".format(0)))
    # cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Out-of-fold predictions: each split's saved estimator predicts on its
    # own test split.
    for i, (_, test_index) in enumerate(cv.split(X, to_label(y))):
        estimator = loader(os.path.join(estdir,
                                        name_prefix + "_split" + "{0:02d}".format(i)))
        X_meta.append(getattr(estimator, estimator_method)(X[test_index]))
        X_ind.append(test_index)

    if merge:
        # Restore the original row order by sorting on the collected test indices.
        X_meta = np.concatenate(X_meta, axis=0)
        X_ind = np.concatenate(X_ind, axis=0)
        X_meta = X_meta[np.argsort(X_ind)]

    if validation_data is None:
        return X_meta
    else:
        # The "_test" estimator was fitted on the whole training data.
        estimator = loader(os.path.join(estdir, name_prefix + "_test"))
        return X_meta, getattr(estimator, estimator_method)(Xvalid)
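
# A minimal usage sketch of mk_metafeature (hedged): cv must be the same
# cross-validator that was used in the optimizer run; "./logs", "search_0",
# and the KFold settings below are hypothetical placeholders, not values
# defined in this module.
#
#     from sklearn.model_selection import KFold
#
#     X_meta, X_meta_valid = mk_metafeature(
#         Xtrain, ytrain, logdir="./logs", model_id="search_0",
#         target_index=0, cv=KFold(n_splits=5),
#         validation_data=(Xvalid, yvalid),
#         estimator_method="predict_proba")
#     # With merge=True (the default), X_meta rows are back in Xtrain's
#     # original order because the results are sorted by the collected
#     # test indices.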