# Author: Kyohei Atarashi
# License: BSD-2-Clause
import warnings

import numpy as np
from scipy import sparse
from sklearn.kernel_approximation import RBFSampler
from sklearn.utils import check_random_state

from .base import BaseLinear, LinearClassifierMixin, LinearRegressorMixin
from .loss_fast import Squared, SquaredHinge, Logistic, Hinge
from .sdca_fast import _sdca_fast
from ..dataset_fast import get_dataset
from ..random_feature.random_features_fast import get_fast_random_feature


class BaseSDCAEstimator(BaseLinear):
LOSSES = {
'squared': Squared(),
'squared_hinge': SquaredHinge(),
'logistic': Logistic(),
'hinge': Hinge()
}
stochastic = True
def __init__(self, transformer=RBFSampler(), loss='squared_hinge',
C=1.0, alpha=1.0, l1_ratio=0, intercept_decay=0.1,
normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
warm_start=False, random_state=None, verbose=True,
fast_solver=True, shuffle=True):
self.transformer = transformer
self.loss = loss
self.C = C
self.alpha = alpha
self.intercept_decay = intercept_decay
self.l1_ratio = l1_ratio
        self.normalize = normalize
self.fit_intercept = fit_intercept
self.max_iter = max_iter
self.tol = tol
self.warm_start = warm_start
self.random_state = random_state
self.verbose = verbose
self.fast_solver = fast_solver
self.shuffle = shuffle
def _valid_params(self):
super(BaseSDCAEstimator, self)._valid_params()
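        # SDCA needs a strongly convex regularizer, i.e., a non-zero L2
        # term; alpha == 0 or l1_ratio == 1 (pure Lasso) is therefore
        # rejected here.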
        if self.alpha * (1 - self.l1_ratio) == 0:
            raise ValueError("alpha*(1-l1_ratio)/C = 0. SDCA needs a "
                             "strongly convex regularizer "
                             "(alpha*(1-l1_ratio)/C must be bigger than "
                             "0).")
def _init_params(self, X, y):
super(BaseSDCAEstimator, self)._init_params(X, y)
        n_samples = X.shape[0]
        if not (self.warm_start and hasattr(self, "dual_coef_")):
            self.dual_coef_ = np.zeros(n_samples)
        elif len(self.dual_coef_) != n_samples:
            warnings.warn("The number of training samples is different "
                          "from the previous fit. dual_coef_ is reset.")
            self.dual_coef_ = np.zeros(n_samples)
def fit(self, X, y):
"""Fit model according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : classifier
Returns self.
"""
X, y = self._check_X_y(X, y, accept_sparse=['csr'])
n_samples, n_features = X.shape
# init primal parameters, mean/var vectors and t_
self._valid_params()
self._init_params(X, y)
loss = self.LOSSES[self.loss]
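        # Fold C into the regularization weights so that the solver
        # minimizes sum_i loss(x_i, y_i) + (alpha/C) * reg, as stated in
        # the class docstrings.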
alpha = self.alpha / self.C
intercept_decay = self.intercept_decay / self.C
random_state = check_random_state(self.random_state)
is_sparse = sparse.issparse(X)
it = _sdca_fast(self.coef_, self.dual_coef_, self.intercept_,
get_dataset(X, order='c'), X, y,
self.mean_, self.var_, loss, alpha, intercept_decay,
self.l1_ratio, self.t_, self.max_iter, self.tol,
is_sparse, self.verbose, self.fit_intercept,
self.shuffle, random_state, self.transformer,
get_fast_random_feature(self.transformer))
        if self.t_ == 1:  # freshly initialized (as in the SGD solver)
            self.t_ += n_samples
        self.t_ += n_samples * (it + 1)
return self


class SDCAClassifier(BaseSDCAEstimator, LinearClassifierMixin):
    r"""Stochastic dual coordinate ascent solver for a linear classifier
    with random feature maps.

    The random feature mapping is computed on the fly, just before the
    prediction and the gradient are computed for each sample.

    The objective to be minimized is

        \sum_{i=1}^{n} loss(x_i, y_i) + (alpha/C) * reg(w),

    where reg(w) = l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2
    (see the l1_ratio parameter below).

    Parameters
    ----------
transformer : scikit-learn Transformer object (default=RBFSampler())
        A scikit-learn TransformerMixin object. The transformer must have
        (1) an n_components attribute, and (2) fit(X, y) and
        (3) transform(X) methods.
loss : str (default="squared_hinge")
Which loss function to use. Following losses can be used:
- 'squared_hinge'
- 'hinge'
- 'logistic'
C : double (default=1.0)
Weight of the loss term.
alpha : double (default=1.0)
Weight of the penalty term.
    l1_ratio : double (default=0)
        Ratio of the L1 regularizer.
        The weight of the L1 regularizer is alpha * l1_ratio and that of
        the L2 regularizer is 0.5 * alpha * (1 - l1_ratio).

        - l1_ratio = 0 : Ridge.
        - l1_ratio = 1 : Lasso (not allowed: SDCA needs a strongly convex
          regularizer).
        - Otherwise : Elastic Net.

    intercept_decay : double (default=1e-1)
        Weight of the penalty term for the intercept.
    normalize : bool (default=False)
        Whether to normalize the random features or not.
        If True, the solver computes the running mean and variance of the
        random features during learning, and uses them for inference.
fit_intercept : bool (default=True)
Whether to fit intercept (bias term) or not.
max_iter : int (default=100)
Maximum number of iterations.
    tol : double (default=1e-6)
        Tolerance for the stopping criterion.
        If the sum of the absolute values of the parameter updates in one
        epoch is lower than tol, the solver stops learning.
warm_start : bool (default=False)
Whether to activate warm-start or not.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
verbose : bool (default=True)
Verbose mode or not.
    fast_solver : bool (default=True)
        Whether to use the fast Cython solver or not. This argument is
        effective only when the transformer is implemented in
        random_features_fast.pyx/pxd.
    shuffle : bool (default=True)
        Whether to shuffle the data before each epoch or not.

    Attributes
    ----------
    self.coef_ : array, shape (n_components, )
        The learned coefficients of the linear model.
    self.dual_coef_ : array, shape (n_samples, )
        The learned dual coefficients, one per training sample.
    self.intercept_ : array, shape (1, )
        The learned intercept (bias) of the linear model.
    self.mean_, self.var_ : array or None, shape (n_components, )
        The running mean and variance of the random feature vectors.
        They are used when normalize=True (they are None otherwise).
    self.t_ : int
        The number of iterations.

    References
    ----------
    [1] Stochastic Dual Coordinate Ascent Methods for Regularized Loss
        Minimization.
        Shai Shalev-Shwartz and Tong Zhang.
        JMLR 2013 (vol. 14), pp. 567-599.
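
    Examples
    --------
    A minimal usage sketch, not a normative example: the import path below
    is an assumption about the package layout (adjust it to your
    installation), and the hyperparameters are illustrative only::

        import numpy as np
        from sklearn.kernel_approximation import RBFSampler
        from pyrfm.linear_model import SDCAClassifier  # assumed path

        rng = np.random.RandomState(0)
        X = rng.randn(100, 4)
        y = np.sign(rng.randn(100))  # binary labels in {-1, +1}
        clf = SDCAClassifier(transformer=RBFSampler(n_components=64,
                                                    random_state=0),
                             alpha=1e-3, max_iter=10, verbose=False,
                             random_state=0)
        clf.fit(X, y)
        y_pred = clf.predict(X)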
"""
LOSSES = {
'squared_hinge': SquaredHinge(),
'logistic': Logistic(),
'hinge': Hinge(),
'log': Logistic()
}
def __init__(self, transformer=RBFSampler(), loss='squared_hinge',
C=1.0, alpha=1.0, l1_ratio=0, intercept_decay=1e-1,
normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
warm_start=False, random_state=None, verbose=True,
fast_solver=True, shuffle=True):
super(SDCAClassifier, self).__init__(
transformer, loss, C, alpha, l1_ratio, intercept_decay, normalize,
fit_intercept, max_iter, tol, warm_start, random_state, verbose,
fast_solver, shuffle
)


class SDCARegressor(BaseSDCAEstimator, LinearRegressorMixin):
    r"""Stochastic dual coordinate ascent solver for linear regression
    with random feature maps.

    The random feature mapping is computed on the fly, just before the
    prediction and the gradient are computed for each sample.

    The objective to be minimized is

        \sum_{i=1}^{n} loss(x_i, y_i) + (alpha/C) * reg(w),

    where reg(w) = l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2
    (see the l1_ratio parameter below).

    Parameters
    ----------
transformer : scikit-learn Transformer object (default=RBFSampler())
        A scikit-learn TransformerMixin object. The transformer must have
        (1) an n_components attribute, and (2) fit(X, y) and
        (3) transform(X) methods.
loss : str (default="squared")
Which loss function to use. Following losses can be used:
- 'squared'
C : double (default=1.0)
Weight of the loss term.
alpha : double (default=1.0)
Weight of the penalty term.
    l1_ratio : double (default=0)
        Ratio of the L1 regularizer.
        The weight of the L1 regularizer is alpha * l1_ratio and that of
        the L2 regularizer is 0.5 * alpha * (1 - l1_ratio).

        - l1_ratio = 0 : Ridge.
        - l1_ratio = 1 : Lasso (not allowed: SDCA needs a strongly convex
          regularizer).
        - Otherwise : Elastic Net.

    intercept_decay : double (default=1e-1)
        Weight of the penalty term for the intercept.
    normalize : bool (default=False)
        Whether to normalize the random features or not.
        If True, the solver computes the running mean and variance of the
        random features during learning, and uses them for inference.
fit_intercept : bool (default=True)
Whether to fit intercept (bias term) or not.
max_iter : int (default=100)
Maximum number of iterations.
    tol : double (default=1e-6)
        Tolerance for the stopping criterion.
        If the sum of the absolute values of the parameter updates in one
        epoch is lower than tol, the solver stops learning.
warm_start : bool (default=False)
Whether to activate warm-start or not.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
verbose : bool (default=True)
Verbose mode or not.
    fast_solver : bool (default=True)
        Whether to use the fast Cython solver or not. This argument is
        effective only when the transformer is implemented in
        random_features_fast.pyx/pxd.
    shuffle : bool (default=True)
        Whether to shuffle the data before each epoch or not.

    Attributes
    ----------
    self.coef_ : array, shape (n_components, )
        The learned coefficients of the linear model.
    self.dual_coef_ : array, shape (n_samples, )
        The learned dual coefficients, one per training sample.
    self.intercept_ : array, shape (1, )
        The learned intercept (bias) of the linear model.
    self.mean_, self.var_ : array or None, shape (n_components, )
        The running mean and variance of the random feature vectors.
        They are used when normalize=True (they are None otherwise).
    self.t_ : int
        The number of iterations.

    References
    ----------
    [1] Stochastic Dual Coordinate Ascent Methods for Regularized Loss
        Minimization.
        Shai Shalev-Shwartz and Tong Zhang.
        JMLR 2013 (vol. 14), pp. 567-599.
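
    Examples
    --------
    A minimal usage sketch, not a normative example: the import path below
    is an assumption about the package layout (adjust it to your
    installation), and the hyperparameters are illustrative only::

        import numpy as np
        from sklearn.kernel_approximation import RBFSampler
        from pyrfm.linear_model import SDCARegressor  # assumed path

        rng = np.random.RandomState(0)
        X = rng.randn(100, 4)
        y = rng.randn(100)
        reg = SDCARegressor(transformer=RBFSampler(n_components=64,
                                                   random_state=0),
                            alpha=1e-3, max_iter=10, verbose=False,
                            random_state=0)
        reg.fit(X, y)
        y_pred = reg.predict(X)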
"""
LOSSES = {
'squared': Squared(),
}
def __init__(self, transformer=RBFSampler(), loss='squared',
C=1.0, alpha=1.0, l1_ratio=0., intercept_decay=1e-1,
normalize=False, fit_intercept=True, max_iter=100, tol=1e-6,
warm_start=False, random_state=None, verbose=True,
fast_solver=True, shuffle=True):
super(SDCARegressor, self).__init__(
transformer, loss, C, alpha, l1_ratio, intercept_decay, normalize,
fit_intercept, max_iter, tol, warm_start, random_state, verbose,
fast_solver, shuffle
)