Source code for pyrfm.linear_model.sdca

# Author: Kyohei Atarashi
# License: BSD-2-Clause

import numpy as np
from scipy import sparse
from sklearn.utils import check_random_state
from .loss_fast import Squared, SquaredHinge, Logistic, Hinge
from .base import BaseLinear, LinearClassifierMixin, LinearRegressorMixin
from sklearn.kernel_approximation import RBFSampler
from .sdca_fast import _sdca_fast
from ..dataset_fast import get_dataset
from ..random_feature.random_features_fast import get_fast_random_feature
import warnings


class BaseSDCAEstimator(BaseLinear):
    LOSSES = {
        'squared': Squared(),
        'squared_hinge': SquaredHinge(),
        'logistic': Logistic(),
        'hinge': Hinge()
    }
    stochastic = True

    def __init__(self, transformer=RBFSampler(), loss='squared_hinge',
                 C=1.0, alpha=1.0, l1_ratio=0, intercept_decay=0.1,
                 normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
                 warm_start=False, random_state=None, verbose=True,
                 fast_solver=True, shuffle=True):
        self.transformer = transformer
        self.loss = loss
        self.C = C
        self.alpha = alpha
        self.intercept_decay = intercept_decay
        self.l1_ratio = l1_ratio
        self.normalize = normalize
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.tol = tol
        self.warm_start = warm_start
        self.random_state = random_state
        self.verbose = verbose
        self.fast_solver = fast_solver
        self.shuffle = shuffle
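
    # Following the scikit-learn convention, __init__ only stores the given
    # hyperparameters; validation happens in _valid_params and the fitted
    # state is initialized in _init_params below.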

    def _valid_params(self):
        super(BaseSDCAEstimator, self)._valid_params()
        if self.alpha * (1 - self.l1_ratio) == 0:
            raise ValueError("alpha*(1-l1_ratio)/C = 0. SDCA needs a strongly "
                             "convex regularizer (alpha*(1-l1_ratio)/C must "
                             "be greater than 0).")

    def _init_params(self, X, y):
        super(BaseSDCAEstimator, self)._init_params(X, y)
        n_samples = X.shape[0]
        if not (self.warm_start and hasattr(self, "dual_coef_")):
            self.dual_coef_ = np.zeros(n_samples)
        else:
            if len(self.dual_coef_) != n_samples:
                warnings.warn("The number of training data is different from "
                              "the previous one. dual_coef_ is reset.")
                self.dual_coef_ = np.zeros(n_samples)
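
    # A minimal warm-start sketch (hypothetical names; X and y stand for any
    # fixed training set). dual_coef_ is preserved between calls to fit only
    # while the number of samples is unchanged, as the check above enforces:
    #
    #     est = SDCAClassifier(warm_start=True, max_iter=10, verbose=False)
    #     est.fit(X, y)  # first 10 epochs
    #     est.fit(X, y)  # resumes from the stored primal and dual coefs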
    
    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = self._check_X_y(X, y, accept_sparse=['csr'])
        n_samples, n_features = X.shape
        # init primal parameters, mean/var vectors and t_
        self._valid_params()
        self._init_params(X, y)

        loss = self.LOSSES[self.loss]
        alpha = self.alpha / self.C
        intercept_decay = self.intercept_decay / self.C
        random_state = check_random_state(self.random_state)
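
        # The Cython solver below maximizes the dual of the objective given
        # in the class docstrings, \sum_i loss(x_i, y_i) + alpha/C * reg.
        # The dual is well-defined because _valid_params ensures a strictly
        # positive L2 part (alpha * (1 - l1_ratio) > 0); alpha and
        # intercept_decay were divided by C above so that C weights only the
        # loss term.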

        is_sparse = sparse.issparse(X)
        it = _sdca_fast(self.coef_, self.dual_coef_, self.intercept_,
                        get_dataset(X, order='c'), X, y,
                        self.mean_, self.var_, loss, alpha, intercept_decay,
                        self.l1_ratio, self.t_, self.max_iter, self.tol,
                        is_sparse, self.verbose, self.fit_intercept,
                        self.shuffle, random_state, self.transformer,
                        get_fast_random_feature(self.transformer))
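        # Advance t_, the cumulative per-sample update counter (presumably
        # shared with the SGD-style solvers so that warm-starting across
        # solvers sees a consistent counter).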
        if self.t_ == 1:  # for sgd initialization
            self.t_ += n_samples
        self.t_ += n_samples * (it + 1)

        return self


class SDCAClassifier(BaseSDCAEstimator, LinearClassifierMixin):
    """Stochastic dual coordinate ascent solver for a linear classifier with
    random feature maps.

    The random feature map is computed on the fly, just before computing the
    prediction and the gradient.

    minimize \sum_{i=1}^{n} loss(x_i, y_i) + alpha/C*reg

    Parameters
    ----------
    transformer : scikit-learn Transformer object (default=RBFSampler())
        A scikit-learn TransformerMixin object.
        transformer must have (1) an n_components attribute and
        (2) fit(X, y) and (3) transform(X) methods.

    loss : str (default="squared_hinge")
        Which loss function to use. The following losses can be used:

            - 'squared_hinge'
            - 'hinge'
            - 'logistic' (or its alias 'log')

    C : double (default=1.0)
        Weight of the loss term.

    alpha : double (default=1.0)
        Weight of the penalty term.

    intercept_decay : double (default=1e-1)
        Weight of the penalty term for the intercept.

    l1_ratio : double (default=0)
        Ratio of the L1 regularizer.
        The weight of the L1 regularizer is alpha * l1_ratio and that of the
        L2 regularizer is 0.5 * alpha * (1-l1_ratio).

            - l1_ratio = 0 : Ridge.
            - l1_ratio = 1 : Lasso.
            - Otherwise : Elastic Net.

    normalize : bool (default=False)
        Whether to normalize the random features or not.
        If True, the solver computes a running mean and variance during
        learning and uses them at inference time.

    fit_intercept : bool (default=True)
        Whether to fit the intercept (bias term) or not.

    max_iter : int (default=100)
        Maximum number of iterations.

    tol : double (default=1e-6)
        Tolerance of the stopping criterion.
        If the sum of the absolute values of the updates in one epoch is
        lower than tol, the solver stops learning.

    warm_start : bool (default=False)
        Whether to activate warm-start or not.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : bool (default=True)
        Verbose mode or not.

    fast_solver : bool (default=True)
        Whether to use the Cython fast solver or not. This argument is
        effective only when transformer is implemented in
        random_features_fast.pyx/pxd.

    shuffle : bool (default=True)
        Whether to shuffle the data before each epoch or not.

    Attributes
    ----------
    coef_ : array, shape (n_components, )
        The learned coefficients of the linear model.

    dual_coef_ : array, shape (n_samples, )
        The learned coefficients of the dual problem.

    intercept_ : array, shape (1, )
        The learned intercept (bias) of the linear model.

    mean_, var_ : array or None, shape (n_components, )
        The running mean and variance of the random feature vectors.
        They are used if normalize=True (they are None if False).

    t_ : int
        The number of iterations (counted in per-sample updates).

    References
    ----------
    [1] Stochastic Dual Coordinate Ascent Methods for Regularized Loss
    Minimization.
    Shai Shalev-Shwartz and Tong Zhang.
    JMLR 2013 (vol 14), pp. 567-599.

    """
    LOSSES = {
        'squared_hinge': SquaredHinge(),
        'logistic': Logistic(),
        'hinge': Hinge(),
        'log': Logistic()
    }

    def __init__(self, transformer=RBFSampler(), loss='squared_hinge',
                 C=1.0, alpha=1.0, l1_ratio=0, intercept_decay=1e-1,
                 normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
                 warm_start=False, random_state=None, verbose=True,
                 fast_solver=True, shuffle=True):
        super(SDCAClassifier, self).__init__(
            transformer, loss, C, alpha, l1_ratio, intercept_decay,
            normalize, fit_intercept, max_iter, tol, warm_start,
            random_state, verbose, fast_solver, shuffle
        )
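
# A minimal usage sketch for SDCAClassifier, not part of the library: it
# assumes the scikit-learn estimator API suggested by the mixins above and
# uses sklearn.datasets.make_classification as stand-in data.
#
#     from sklearn.datasets import make_classification
#     from sklearn.kernel_approximation import RBFSampler
#
#     X, y = make_classification(n_samples=200, n_features=20, random_state=0)
#     clf = SDCAClassifier(transformer=RBFSampler(n_components=128,
#                                                 random_state=0),
#                          loss='squared_hinge', alpha=0.01, max_iter=50,
#                          verbose=False, random_state=0)
#     clf.fit(X, y)
#     print(clf.score(X, y))  # score is assumed to follow scikit-learn's
#                             # classifier convention (mean accuracy)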


class SDCARegressor(BaseSDCAEstimator, LinearRegressorMixin):
    """Stochastic dual coordinate ascent solver for linear regression with
    random feature maps.

    The random feature map is computed on the fly, just before computing the
    prediction and the gradient.

    minimize \sum_{i=1}^{n} loss(x_i, y_i) + alpha/C*reg

    Parameters
    ----------
    transformer : scikit-learn Transformer object (default=RBFSampler())
        A scikit-learn TransformerMixin object.
        transformer must have (1) an n_components attribute and
        (2) fit(X, y) and (3) transform(X) methods.

    loss : str (default="squared")
        Which loss function to use. The following loss can be used:

            - 'squared'

    C : double (default=1.0)
        Weight of the loss term.

    alpha : double (default=1.0)
        Weight of the penalty term.

    intercept_decay : double (default=1e-1)
        Weight of the penalty term for the intercept.

    l1_ratio : double (default=0)
        Ratio of the L1 regularizer.
        The weight of the L1 regularizer is alpha * l1_ratio and that of the
        L2 regularizer is 0.5 * alpha * (1-l1_ratio).

            - l1_ratio = 0 : Ridge.
            - l1_ratio = 1 : Lasso.
            - Otherwise : Elastic Net.

    normalize : bool (default=False)
        Whether to normalize the random features or not.
        If True, the solver computes a running mean and variance during
        learning and uses them at inference time.

    fit_intercept : bool (default=True)
        Whether to fit the intercept (bias term) or not.

    max_iter : int (default=100)
        Maximum number of iterations.

    tol : double (default=1e-6)
        Tolerance of the stopping criterion.
        If the sum of the absolute values of the updates in one epoch is
        lower than tol, the solver stops learning.

    warm_start : bool (default=False)
        Whether to activate warm-start or not.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : bool (default=True)
        Verbose mode or not.

    fast_solver : bool (default=True)
        Whether to use the Cython fast solver or not. This argument is
        effective only when transformer is implemented in
        random_features_fast.pyx/pxd.

    shuffle : bool (default=True)
        Whether to shuffle the data before each epoch or not.

    Attributes
    ----------
    coef_ : array, shape (n_components, )
        The learned coefficients of the linear model.

    dual_coef_ : array, shape (n_samples, )
        The learned coefficients of the dual problem.

    intercept_ : array, shape (1, )
        The learned intercept (bias) of the linear model.

    mean_, var_ : array or None, shape (n_components, )
        The running mean and variance of the random feature vectors.
        They are used if normalize=True (they are None if False).

    t_ : int
        The number of iterations (counted in per-sample updates).

    References
    ----------
    [1] Stochastic Dual Coordinate Ascent Methods for Regularized Loss
    Minimization.
    Shai Shalev-Shwartz and Tong Zhang.
    JMLR 2013 (vol 14), pp. 567-599.

    """
    LOSSES = {
        'squared': Squared(),
    }

    def __init__(self, transformer=RBFSampler(), loss='squared',
                 C=1.0, alpha=1.0, l1_ratio=0., intercept_decay=1e-1,
                 normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
                 warm_start=False, random_state=None, verbose=True,
                 fast_solver=True, shuffle=True):
        super(SDCARegressor, self).__init__(
            transformer, loss, C, alpha, l1_ratio, intercept_decay,
            normalize, fit_intercept, max_iter, tol, warm_start,
            random_state, verbose, fast_solver, shuffle
        )
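
# A minimal usage sketch for SDCARegressor, again an assumption-labeled
# example (sklearn.datasets.make_regression as stand-in data; predict is
# assumed to be provided by LinearRegressorMixin):
#
#     from sklearn.datasets import make_regression
#     from sklearn.kernel_approximation import RBFSampler
#
#     X, y = make_regression(n_samples=200, n_features=10, random_state=0)
#     reg = SDCARegressor(transformer=RBFSampler(n_components=128,
#                                                random_state=0),
#                         alpha=0.01, max_iter=50, verbose=False,
#                         random_state=0)
#     reg.fit(X, y)
#     y_pred = reg.predict(X)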