scmkl.estimate_sigma

 1import numpy as np
 2import scipy
 3
 4from scmkl.calculate_z import _process_data
 5
 6
 7def estimate_sigma(adata, n_features = 5000):
 8    '''
 9    Calculate kernel widths to inform distribution for projection of 
10    Fourier Features. Calculates one sigma per group of features.
11
12    Parameters
13    ----------
14    **adata** : *AnnData* 
15        > Created by `create_adata`.
16    
17    **n_features** : *int*  
18        > Number of random features to include when estimating sigma. 
19        Will be scaled for the whole pathway set according to a 
20        heuristic. Used for scalability.
21    
22    Returns
23    -------
24    **adata** : *AnnData*
25        > Key added `adata.uns['sigma']`.
26
27    Examples
28    --------
29    >>> adata = scmkl.estimate_sigma(adata)
30    >>> adata.uns['sigma']
31    array([10.4640895 , 10.82011454,  6.16769438,  9.86156855, ...])
32    '''
33 
34    sigma_list = []
35
36    # Loop over every group in group_dict
37    for group_features in adata.uns['group_dict'].values():
38
39        # Select only features in that group and downsample for scalability
40        num_group_features = len(group_features)
41        group_array = np.array(list(group_features))
42        n_feats = min([n_features, num_group_features])
43        group_features = adata.uns['seed_obj'].choice(group_array, n_feats, 
44                                                      replace = False) 
45
46        # Use on the train data to estimate sigma
47        X_train = adata[adata.uns['train_indices'], group_features].X
48        X_train = _process_data(X_train = X_train, 
49                                scale_data = adata.uns['scale_data'], 
50                                return_dense = True)
51        
52        # Sample cells for scalability
53        sample_idx = np.arange(X_train.shape[0])
54        n_samples = np.min((2000, X_train.shape[0]))
55        distance_indices = adata.uns['seed_obj'].choice(sample_idx, n_samples)
56
57        # Calculate Distance Matrix with specified metric
58        sigma = scipy.spatial.distance.cdist(X_train[distance_indices,:], 
59                                             X_train[distance_indices,:], 
60                                             adata.uns['distance_metric'])
61        sigma = np.mean(sigma)
62
63        # sigma = 0 is numerically unusable in later steps
64        # Using such a small sigma will result in wide distribution, and 
65        # typically a non-predictive Z
66        if sigma == 0:
67            sigma += 1e-5
68
69        if n_features < num_group_features:
70            # Heuristic we calculated to account for fewer features used in 
71            # distance calculation
72            sigma = sigma * num_group_features / n_features 
73
74        sigma_list.append(sigma)
75    
76    adata.uns['sigma'] = np.array(sigma_list)
77        
78    return adata
def estimate_sigma(adata, n_features = 5000):
    '''
    Calculate kernel widths to inform the distribution used for the 
    projection of Fourier Features. Calculates one sigma per group of 
    features, in the iteration order of `adata.uns['group_dict']`.

    For each group, sigma is the mean of the pairwise distance matrix 
    computed on a random subsample of training cells (at most 2000) 
    and a random subsample of the group's features (at most 
    `n_features`).

    Parameters
    ----------
    **adata** : *AnnData* 
        > Created by `create_adata`. The following `adata.uns` keys 
        are read: `'group_dict'`, `'seed_obj'`, `'train_indices'`, 
        `'scale_data'` and `'distance_metric'`.
    
    **n_features** : *int*  
        > Maximum number of random features to include per group when 
        estimating sigma. When a group has more features than this, 
        sigma is scaled by `len(group) / n_features` according to a 
        heuristic. Used for scalability. Must be at least 1.
    
    Returns
    -------
    **adata** : *AnnData*
        > The same object, with key `adata.uns['sigma']` added 
        (one value per group).

    Raises
    ------
    **ValueError**
        > If `n_features` is smaller than 1.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata.uns['sigma']
    array([10.4640895 , 10.82011454,  6.16769438,  9.86156855, ...])
    '''
    # A non-positive n_features would make the heuristic scaling below 
    # divide by zero (or by a negative number) and silently produce an 
    # unusable sigma, so reject it up front.
    if n_features < 1:
        raise ValueError("n_features must be a positive integer, "
                         f"got {n_features!r}")

    uns = adata.uns
    sigma_list = []

    # Loop over every group in group_dict
    for group_features in uns['group_dict'].values():

        # Select only features in that group and downsample for scalability
        num_group_features = len(group_features)
        group_array = np.array(list(group_features))
        n_feats = min(n_features, num_group_features)
        sampled_features = uns['seed_obj'].choice(group_array, n_feats, 
                                                  replace = False)

        # Use only the train data to estimate sigma
        X_train = adata[uns['train_indices'], sampled_features].X
        X_train = _process_data(X_train = X_train, 
                                scale_data = uns['scale_data'], 
                                return_dense = True)

        # Sample cells for scalability
        # NOTE(review): unlike the feature sampling above, `replace` is 
        # not passed here. If seed_obj is a NumPy generator this samples 
        # cells with replacement; kept as-is so seeded results do not 
        # change - confirm whether this is intended.
        n_cells = X_train.shape[0]
        n_samples = min(2000, n_cells)
        distance_indices = uns['seed_obj'].choice(np.arange(n_cells), 
                                                  n_samples)

        # Calculate distance matrix with specified metric and average it
        X_sampled = X_train[distance_indices, :]
        sigma = scipy.spatial.distance.cdist(X_sampled, X_sampled, 
                                             uns['distance_metric'])
        sigma = np.mean(sigma)

        # sigma = 0 is numerically unusable in later steps
        # Using such a small sigma will result in wide distribution, and 
        # typically a non-predictive Z
        if sigma == 0:
            sigma += 1e-5

        if n_features < num_group_features:
            # Heuristic we calculated to account for fewer features used in 
            # distance calculation
            sigma = sigma * num_group_features / n_features

        sigma_list.append(sigma)

    adata.uns['sigma'] = np.array(sigma_list)

    return adata

Calculate the kernel widths that inform the distribution used for the projection of Fourier Features. One sigma is calculated per group of features.

Parameters

adata : AnnData

Created by create_adata.

n_features : int

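Maximum number of randomly selected features to include per group when estimating sigma. When a group contains more features than this, the resulting sigma is scaled up to the whole group (pathway set) according to a heuristic. Used for scalability.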

Returns

adata : AnnData

The input object with the key adata.uns['sigma'] added.

Examples

>>> adata = scmkl.estimate_sigma(adata)
>>> adata.uns['sigma']
array([10.4640895 , 10.82011454,  6.16769438,  9.86156855, ...])