Source code for pyrfm.random_feature.subsampled_random_hadamard

# Author: Kyohei Atarashi
# License: BSD-2-Clause

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from .utils import next_pow_of_two, get_random_matrix


def _get_random_matrix(distribution):
    return lambda rng, size: get_random_matrix(rng, distribution, size)


[docs]class SubsampledRandomHadamard(BaseEstimator, TransformerMixin):
    """Approximates feature maps of the product between random matrix and
    feature vectors by Subsampled Randomized Hadamard Transform.

    This class can be used as a sub-routine for approximating the product
    between random matrix and feature vectors in some random features.
    Subsampled Randomized Hadamard Transform uses diagonal matrices, the
    Walsh-Hadamard matrix, and submatrix of the identity matrix for
    approximating the matrix-vector product.

    Parameters
    ----------
    n_components : int (default=100)
        Number of Monte Carlo samples per original features.
        Equals the dimensionality of the computed (mapped) feature space.
    
    distribution : str or function (default="rademacher")
        A function for sampling random bases.
        Its arguments must be random_state and size.
        For str, "gaussian" (or "normal"), "rademacher", "laplace", or
        "uniform" can be used.
    
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If np.RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    random_weights_ : array, shape (n_features)
        The sampled basis.
        It is sampled by using self.distribution, which is the rademacher
        distribution default.

    random_indices_rows_ : array, shape (n_components)
        The indices of rows sampled from [0, \ldots, n_features_padded-1]
        uniformly, where n_features_padded is the smallest power of two number
        that is bigger than n_features.

    References
    ----------
    [1] Improved Analysis of the Subsampled Randomized Hadamard Transform.
    Joel A Tropp.
    Advances in Adaptive Data Analysis,
    (https://arxiv.org/pdf/1011.1595.pdf)

    """

    def __init__(self, n_components=100, distribution="rademacher",
                 random_state=None):
        self.n_components = n_components
        self.distribution = distribution
        self.random_state = random_state

[docs]    def fit(self, X, y=None):
        """Generate random weights according to n_features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        self : object
            Returns the transformer.
        """
        random_state = check_random_state(self.random_state)
        X = check_array(X, accept_sparse=True)
        n_samples, n_features = X.shape
        if isinstance(self.distribution, str):
            distribution = _get_random_matrix(self.distribution)
        else:
            distribution = self.distribution

        n_features_padded = next_pow_of_two(n_features)
        if n_features_padded < self.n_components:
            raise ValueError("n_components is bigger than next power of two "
                             "of n_features.")
        self.random_weights_ = distribution(random_state, n_features)
        self.random_weights_ = self.random_weights_.astype(np.float64)
        perm = random_state.permutation(n_features_padded).astype(np.int32)
        self.random_indices_rows_ = perm[:self.n_components]

        return self

[docs]    def transform(self, X):
        """Apply the approximate feature map to X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        from .random_features_fast import transform_all_fast
        check_is_fitted(self, "random_weights_")
        X = check_array(X, accept_sparse=True)
        return transform_all_fast(X, self)