# Author: Kyohei Atarashi
# License: BSD-2-Clause
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse import csr_matrix, issparse
from math import sqrt
from ..kernels import anova, all_subsets, anova_fast, pairwise
from .utils import get_random_matrix
from .utils_random_fast import get_subfeatures_indices
import warnings
from scipy.special import comb
def _anova(degree=2):
return lambda X, Y, dense_output: anova(X, Y, degree, dense_output)
def _anova_fast(degree=2):
return lambda X, Y, dense_output: anova_fast(X, Y, degree, dense_output)
def dot():
return lambda X, Y, dense_output: safe_sparse_dot(X, Y.T, dense_output)
def _pairwise(symmetric=False):
return lambda X, Y, dense_output: pairwise(X, Y, dense_output, symmetric)
[docs]class RandomKernel(BaseEstimator, TransformerMixin):
"""Approximates feature map of the ANOVA/all-subsets kernel by
Random Kernel Feature map.
Parameters
----------
n_components : int (default=100)
Number of Monte Carlo samples per original features.
Equals the dimensionality of the computed (mapped) feature space.
kernel : str (default="anova")
Kernel to be approximated.
"anova", "anova_cython", "all-subsets", "dot", or "pairwise"
can be used.
degree : int (default=2)
Parameter of the ANOVA kernel.
distribution : str, (default="rademacher")
Distribution for random_weights_.
"rademacher", "gaussian", "laplace", "uniform", or "sparse_rademacher"
can be used.
dense_output : bool (default=True)
Whether randomized feature matrix is dense or sparse.
For kernel='anova', if dense_output = False,
distribution='sparse_rademacher', and X is sparse matrix, output random
feature matrix will become sparse matrix.
For kernel='anova_cython', if dense_output=False, output random feature
matrix will become sparse matrix.
p_sparse : float (default=0.)
Sparsity parameter for "sparse_rademacher" distribution.
If p_sparse = 0, "sparse_rademacher" is equivalent to "rademacher".
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
random_weights_ : array, shape (n_features, n_components)
The sampled basis.
References
----------
[1] Random Feature Maps for the Itemset Kernel.
Kyohei Atarashi, Subhransu Maji, and Satoshi Oyama.
In AAAI 2019.
(https://www.aaai.org/ojs/index.php/AAAI/article/view/4188)
"""
def __init__(self, n_components=100, kernel='anova', degree=2,
distribution='rademacher', dense_output=True, p_sparse=0.,
random_state=None):
self.n_components = n_components
self.kernel = kernel
self.degree = degree
self.distribution = distribution
self.dense_output = dense_output
self.p_sparse = p_sparse
self.random_state = random_state
[docs] def fit(self, X, y=None):
"""Generate random weights according to n_features.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
Returns
-------
self : object
Returns the transformer.
"""
random_state = check_random_state(self.random_state)
X = check_array(X, accept_sparse=True)
n_samples, n_features = X.shape
size = (n_features, self.n_components)
distribution = self.distribution.lower()
self.random_weights_ = get_random_matrix(random_state, distribution,
size, self.p_sparse)
return self
def _remove_bases(self, indices):
self.random_weights_ = np.delete(self.random_weights_, indices, axis=1)
self.n_components = self.random_weights_.shape[1]
return True
[docs]class SubfeatureRandomKernel(BaseEstimator, TransformerMixin):
"""Approximates feature map of the ANOVA/all-subsets kernel
by Subfeature Random Kernel Feature map.
Parameters
----------
n_components : int (default=100)
Number of Monte Carlo samples per original features.
Equals the dimensionality of the computed (mapped) feature space.
n_sub_features : int (default=5)
Number of subfeatures.
kernel : str (default="anova")
Kernel to be approximated.
"anova", "anova_cython", "all-subsets", "dot", or "pairwise"
can be used.
degree : int (default=2)
Parameter of the ANOVA kernel.
distribution : str, (default="rademacher")
Distribution for random_weights_.
"rademacher", "gaussian", "laplace", "uniform", or "sparse_rademacher"
can be used.
dense_output : bool (default=False)
Whether randomized feature matrix is dense or sparse.
For kernel='anova', if dense_output = False,
distribution='sparse_rademacher', and X is sparse matrix, output random
feature matrix will become sparse matrix.
For kernel='anova_cython', if dense_output=False, output random feature
matrix will become sparse matrix.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
random_weights_ : csr_matrix, shape (n_features, n_components)
The sampled basis.
References
----------
[1] Sparse Random Feature Maps for the Item-multiset Kernel.
Kyohei Atarashi, Satoshi Oyama, and Masahito Kurihara.
"""
def __init__(self, n_components=100, n_sub_features=5, kernel='anova',
degree=2, distribution='rademacher',
dense_output=False, random_state=None):
self.n_components = n_components
self.n_sub_features = n_sub_features
self.degree = degree
self.kernel = kernel
self.distribution = distribution
self.dense_output = dense_output
self.random_state = random_state
[docs] def fit(self, X, y=None):
"""Generate random weights according to n_features.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
Returns
-------
self : object
Returns the transformer.
"""
if self.kernel not in ['anova', 'anova_cython']:
raise ValueError("RandomSubsetKernel now does not support"
" {} kernel.".format(self.kernel))
if self.n_sub_features < self.degree:
raise ValueError("n_sub_features < degree.")
random_state = check_random_state(self.random_state)
X = check_array(X, accept_sparse=True)
n_samples, n_features = X.shape
size = (self.n_sub_features * self.n_components, )
distribution = self.distribution.lower()
data = get_random_matrix(random_state, distribution, size=size)
col = np.repeat(np.arange(self.n_components), self.n_sub_features)
row = get_subfeatures_indices(self.n_components, n_features,
self.n_sub_features, random_state)
shape = (n_features, self.n_components)
self.random_weights_ = csr_matrix((data, (row, col)), shape=shape)
return self
def _remove_bases(self, indices):
self.random_weights_ = np.delete(self.random_weights_, indices, axis=1)
self.n_components = self.random_weights_.shape[1]
return True