# Source code for pyrfm.random_feature.random_kernel

# Author: Kyohei Atarashi
# License: BSD-2-Clause

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse import csr_matrix, issparse
from math import sqrt
from ..kernels import anova, all_subsets, anova_fast, pairwise
from .utils import get_random_matrix
from .utils_random_fast import get_subfeatures_indices
import warnings
from scipy.special import comb


def _anova(degree=2):
    """Return an ANOVA kernel callable with ``degree`` bound.

    Parameters
    ----------
    degree : int (default=2)
        Degree forwarded to ``anova`` on every call.

    Returns
    -------
    kernel : callable
        Function taking ``(X, Y, dense_output)`` and returning
        ``anova(X, Y, degree, dense_output)``.
    """
    def kernel(X, Y, dense_output):
        return anova(X, Y, degree, dense_output)

    return kernel


def _anova_fast(degree=2):
    """Return an ``anova_fast`` kernel callable with ``degree`` bound.

    Parameters
    ----------
    degree : int (default=2)
        Degree forwarded to ``anova_fast`` on every call.

    Returns
    -------
    kernel : callable
        Function taking ``(X, Y, dense_output)`` and returning
        ``anova_fast(X, Y, degree, dense_output)``.
    """
    def kernel(X, Y, dense_output):
        return anova_fast(X, Y, degree, dense_output)

    return kernel


def dot():
    """Return a dot-product kernel callable.

    Returns
    -------
    kernel : callable
        Function taking ``(X, Y, dense_output)`` and returning
        ``safe_sparse_dot(X, Y.T, dense_output=dense_output)``.
    """
    def kernel(X, Y, dense_output):
        # dense_output is passed by keyword: recent scikit-learn releases
        # declare it keyword-only, so a positional third argument raises
        # TypeError there. The keyword form works on older releases too.
        return safe_sparse_dot(X, Y.T, dense_output=dense_output)

    return kernel


def _pairwise(symmetric=False):
    """Return a pairwise kernel callable with ``symmetric`` bound.

    Parameters
    ----------
    symmetric : bool (default=False)
        Flag forwarded to ``pairwise`` on every call.

    Returns
    -------
    kernel : callable
        Function taking ``(X, Y, dense_output)`` and returning
        ``pairwise(X, Y, dense_output, symmetric)``.
    """
    def kernel(X, Y, dense_output):
        return pairwise(X, Y, dense_output, symmetric)

    return kernel


class RandomKernel(BaseEstimator, TransformerMixin):
    """Approximates feature map of the ANOVA/all-subsets kernel by
    Random Kernel Feature map.

    Parameters
    ----------
    n_components : int (default=100)
        Number of Monte Carlo samples per original features.
        Equals the dimensionality of the computed (mapped) feature space.

    kernel : str or callable (default="anova")
        Kernel to be approximated.
        "anova", "anova_cython", "all_subsets", "dot", or "pairwise"
        can be used. A non-string value is used as-is and called as
        ``kernel(X, random_weights_.T, dense_output)``.

    degree : int (default=2)
        Parameter of the ANOVA kernel.

    distribution : str, (default="rademacher")
        Distribution for random_weights_.
        "rademacher", "gaussian", "laplace", "uniform", or "sparse_rademacher"
        can be used.

    dense_output : bool (default=True)
        Whether randomized feature matrix is dense or sparse.
        For kernel='anova', if dense_output = False,
        distribution='sparse_rademacher', and X is sparse matrix, output random
        feature matrix will become sparse matrix.
        For kernel='anova_cython', if dense_output=False, output random feature
        matrix will become sparse matrix.
        If dense_output=False but X or random_weights_ is not sparse,
        a warning is issued and a dense output is computed instead.

    p_sparse : float (default=0.)
        Sparsity parameter for "sparse_rademacher" distribution.
        If p_sparse = 0, "sparse_rademacher" is equivalent to "rademacher".

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    random_weights_ : array, shape (n_features, n_components)
        The sampled basis.

    References
    ----------
    [1] Random Feature Maps for the Itemset Kernel.
    Kyohei Atarashi, Subhransu Maji, and Satoshi Oyama.
    In AAAI 2019.
    (https://www.aaai.org/ojs/index.php/AAAI/article/view/4188)

    """
    def __init__(self, n_components=100, kernel='anova', degree=2,
                 distribution='rademacher', dense_output=True, p_sparse=0.,
                 random_state=None):
        self.n_components = n_components
        self.kernel = kernel
        self.degree = degree
        self.distribution = distribution
        self.dense_output = dense_output
        self.p_sparse = p_sparse
        self.random_state = random_state

    def fit(self, X, y=None):
        """Generate random weights according to n_features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        self : object
            Returns the transformer.
        """
        random_state = check_random_state(self.random_state)
        X = check_array(X, accept_sparse=True)
        # Only the number of columns of X matters for sampling the basis.
        n_features = X.shape[1]
        size = (n_features, self.n_components)
        distribution = self.distribution.lower()
        self.random_weights_ = get_random_matrix(random_state, distribution,
                                                 size, self.p_sparse)
        return self

    def _get_kernel(self):
        """Resolve ``self.kernel`` to a callable ``(X, Y, dense_output)``.

        Raises
        ------
        ValueError
            If ``self.kernel`` is a string that names no supported kernel.
        """
        if not isinstance(self.kernel, str):
            # User-supplied kernel object: used without any validation.
            return self.kernel
        if self.kernel == 'anova':
            return _anova(self.degree)
        if self.kernel == 'anova_cython':
            return _anova_fast(self.degree)
        if self.kernel == 'all_subsets':
            return all_subsets
        if self.kernel == 'dot':
            return dot()
        if self.kernel == 'pairwise':
            # "pairwise" is advertised by the docstring and the error
            # message below; dispatch it through the module's factory with
            # its default symmetric=False.
            # NOTE(review): confirm that the 1/sqrt(n_components) scaling
            # applied in transform is the intended one for this kernel.
            return _pairwise()
        raise ValueError('Kernel {} is not supported. '
                         'Use "anova", "anova_cython", "all_subsets", '
                         '"dot", or "pairwise".'
                         .format(self.kernel))

    def transform(self, X):
        """Apply the approximate feature map to X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)

        Raises
        ------
        ValueError
            If the kernel name is not supported, or if the number of
            features of X differs from the one seen in ``fit``.
        """
        check_is_fitted(self, "random_weights_")
        X = check_array(X, accept_sparse=['csr'])
        n_features_fit = self.random_weights_.shape[0]
        if X.shape[1] != n_features_fit:
            raise ValueError("X has {} features, but random_weights_ was "
                             "fitted with {} features."
                             .format(X.shape[1], n_features_fit))

        kernel_ = self._get_kernel()
        dense_output = self.dense_output
        # Sparse output is only honoured when both operands are sparse.
        if not dense_output:
            if not (issparse(self.random_weights_) and issparse(X)):
                warnings.warn("dense_output=False is valid only when both "
                              "X and random_weights_ are sparse. "
                              "dense_output is changed to True now.")
                dense_output = True

        output = kernel_(X, self.random_weights_.T, dense_output)
        # Out-of-place division: an in-place "/=" raises when the kernel
        # output has an integer dtype (e.g. integer X with integer-valued
        # weights), because the float result cannot be cast back.
        return output / sqrt(self.n_components)

    def _remove_bases(self, indices):
        """Drop the given columns of ``random_weights_``.

        ``n_components`` is updated to the number of remaining columns.

        Parameters
        ----------
        indices : int, slice, or array-like
            Columns (bases) to remove.

        Returns
        -------
        True
        """
        weights = self.random_weights_
        if issparse(weights):
            # np.delete cannot handle scipy sparse matrices, so select the
            # surviving columns with a boolean mask instead.
            fmt = weights.format
            keep = np.ones(weights.shape[1], dtype=bool)
            keep[indices] = False
            weights = weights.tocsc()[:, keep].asformat(fmt)
        else:
            weights = np.delete(weights, indices, axis=1)
        self.random_weights_ = weights
        self.n_components = weights.shape[1]
        return True


class SubfeatureRandomKernel(BaseEstimator, TransformerMixin):
    """Approximates feature map of the ANOVA kernel
    by Subfeature Random Kernel Feature map.

    Parameters
    ----------
    n_components : int (default=100)
        Number of Monte Carlo samples per original features.
        Equals the dimensionality of the computed (mapped) feature space.

    n_sub_features : int (default=5)
        Number of subfeatures, i.e. the number of stored entries in each
        column of random_weights_. Must not be smaller than ``degree``.

    kernel : str (default="anova")
        Kernel to be approximated.
        ``fit`` accepts only "anova" or "anova_cython".

    degree : int (default=2)
        Parameter of the ANOVA kernel.

    distribution : str, (default="rademacher")
        Distribution for random_weights_.
        "rademacher", "gaussian", "laplace", "uniform", or "sparse_rademacher"
        can be used.

    dense_output : bool (default=False)
        Whether randomized feature matrix is dense or sparse.
        If dense_output=False but X is not sparse, a warning is issued and
        a dense output is computed instead.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    random_weights_ : csr_matrix, shape (n_features, n_components)
        The sampled basis.

    References
    ----------
    [1] Sparse Random Feature Maps for the Item-multiset Kernel.
    Kyohei Atarashi, Satoshi Oyama, and Masahito Kurihara.

    """
    def __init__(self, n_components=100, n_sub_features=5, kernel='anova',
                 degree=2, distribution='rademacher',
                 dense_output=False, random_state=None):
        self.n_components = n_components
        self.n_sub_features = n_sub_features
        self.degree = degree
        self.kernel = kernel
        self.distribution = distribution
        self.dense_output = dense_output
        self.random_state = random_state

    def fit(self, X, y=None):
        """Generate random weights according to n_features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        self : object
            Returns the transformer.

        Raises
        ------
        ValueError
            If ``kernel`` is not "anova" or "anova_cython", or if
            ``n_sub_features < degree``.
        """
        if self.kernel not in ['anova', 'anova_cython']:
            raise ValueError("SubfeatureRandomKernel now does not support"
                             " {} kernel.".format(self.kernel))

        if self.n_sub_features < self.degree:
            raise ValueError("n_sub_features < degree.")

        random_state = check_random_state(self.random_state)
        X = check_array(X, accept_sparse=True)
        n_features = X.shape[1]
        # One random value per (component, subfeature) pair.
        size = (self.n_sub_features * self.n_components, )
        distribution = self.distribution.lower()
        data = get_random_matrix(random_state, distribution, size=size)
        # Column j owns the j-th run of n_sub_features consecutive values.
        col = np.repeat(np.arange(self.n_components), self.n_sub_features)
        row = get_subfeatures_indices(self.n_components, n_features,
                                      self.n_sub_features, random_state)
        shape = (n_features, self.n_components)
        self.random_weights_ = csr_matrix((data, (row, col)), shape=shape)
        return self

    def _get_kernel(self):
        """Resolve ``self.kernel`` to a callable ``(X, Y, dense_output)``.

        ``fit`` only admits "anova" and "anova_cython"; the remaining
        branches are kept so that ``transform`` resolves kernels exactly
        as it did before.

        Raises
        ------
        ValueError
            If ``self.kernel`` is a string that names no handled kernel.
        """
        if not isinstance(self.kernel, str):
            return self.kernel
        if self.kernel == 'anova':
            return _anova(self.degree)
        if self.kernel == 'anova_cython':
            return _anova_fast(self.degree)
        if self.kernel == 'all_subsets':
            return all_subsets
        if self.kernel == 'dot':
            return dot()
        raise ValueError('Kernel {} is not supported. '
                         'Use "anova", "anova_cython", "all_subsets", '
                         '"dot", or "pairwise".'
                         .format(self.kernel))

    def transform(self, X):
        """Apply the approximate feature map to X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)

        Raises
        ------
        ValueError
            If the kernel name is not supported, or if the number of
            features of X differs from the one seen in ``fit``.
        """
        check_is_fitted(self, "random_weights_")
        X = check_array(X, accept_sparse=True)
        n_features = X.shape[1]
        n_features_fit = self.random_weights_.shape[0]
        # The scaling constant below depends on n_features, so a mismatch
        # with the fitted basis must not go unnoticed.
        if n_features != n_features_fit:
            raise ValueError("X has {} features, but random_weights_ was "
                             "fitted with {} features."
                             .format(n_features, n_features_fit))

        kernel_ = self._get_kernel()
        dense_output = self.dense_output
        # Sparse output is only honoured when both operands are sparse.
        if not dense_output:
            if not (issparse(self.random_weights_) and issparse(X)):
                warnings.warn("dense_output=False is valid only when both "
                              "X and random_weights_ are sparse. "
                              "dense_output is changed to True now.")
                dense_output = True

        # Ratio of the number of degree-sized subsets of all features to
        # the number of degree-sized subsets of the sampled subfeatures.
        const = comb(n_features, self.degree)
        const /= comb(self.n_sub_features, self.degree)
        output = kernel_(X, self.random_weights_.T, dense_output)
        # Out-of-place scaling: in-place "/=" and "*=" raise when the
        # kernel output has an integer dtype.
        output = output / sqrt(self.n_components)
        output = output * sqrt(const)
        return output

    def _remove_bases(self, indices):
        """Drop the given columns of ``random_weights_``.

        ``n_components`` is updated to the number of remaining columns.

        Parameters
        ----------
        indices : int, slice, or array-like
            Columns (bases) to remove.

        Returns
        -------
        True
        """
        weights = self.random_weights_
        if issparse(weights):
            # fit stores a csr_matrix, which np.delete cannot handle;
            # select the surviving columns with a boolean mask instead.
            fmt = weights.format
            keep = np.ones(weights.shape[1], dtype=bool)
            keep[indices] = False
            weights = weights.tocsc()[:, keep].asformat(fmt)
        else:
            weights = np.delete(weights, indices, axis=1)
        self.random_weights_ = weights
        self.n_components = weights.shape[1]
        return True