scmkl.estimate_sigma
import numpy as np
import scipy
import scipy.spatial  # explicit: `import scipy` alone relies on lazy submodule loading

from scmkl.calculate_z import _process_data


def estimate_sigma(adata, n_features=5000):
    '''
    Calculate kernel widths to inform distribution for projection of
    Fourier Features. Calculates one sigma per group of features.

    Parameters
    ----------
    **adata** : *AnnData*
        > Created by `create_adata`.

    **n_features** : *int*
        > Number of random features to include when estimating sigma.
        Will be scaled for the whole pathway set according to a
        heuristic. Used for scalability.

    Returns
    -------
    **adata** : *AnnData*
        > Key added `adata.uns['sigma']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata.uns['sigma']
    array([10.4640895 , 10.82011454, 6.16769438, 9.86156855, ...])
    '''

    sigma_list = []

    # One sigma is estimated per feature grouping
    for group_features in adata.uns['group_dict'].values():

        # Select only features in that group and downsample for scalability
        num_group_features = len(group_features)
        group_array = np.array(list(group_features))
        n_feats = min(n_features, num_group_features)
        group_features = adata.uns['seed_obj'].choice(group_array, n_feats,
                                                      replace=False)

        # Use only the train data to estimate sigma (avoids test leakage)
        X_train = adata[adata.uns['train_indices'], group_features].X
        X_train = _process_data(X_train=X_train,
                                scale_data=adata.uns['scale_data'],
                                return_dense=True)

        # Subsample cells for scalability. Sample WITHOUT replacement
        # (matches the feature subsampling above): duplicated rows would
        # add spurious zero-distance pairs and bias sigma downward.
        sample_idx = np.arange(X_train.shape[0])
        n_samples = min(2000, X_train.shape[0])
        distance_indices = adata.uns['seed_obj'].choice(sample_idx, n_samples,
                                                        replace=False)

        # Mean pairwise distance under the configured metric
        sigma = scipy.spatial.distance.cdist(X_train[distance_indices, :],
                                             X_train[distance_indices, :],
                                             adata.uns['distance_metric'])
        sigma = np.mean(sigma)

        # sigma = 0 is numerically unusable in later steps.
        # Using such a small sigma will result in wide distribution, and
        # typically a non-predictive Z
        if sigma == 0:
            sigma += 1e-5

        if n_features < num_group_features:
            # Heuristic we calculated to account for fewer features used in
            # distance calculation
            sigma = sigma * num_group_features / n_features

        sigma_list.append(sigma)

    adata.uns['sigma'] = np.array(sigma_list)

    return adata
def estimate_sigma(adata, n_features = 5000):
    '''
    Calculate kernel widths to inform distribution for projection of
    Fourier Features. Calculates one sigma per group of features.

    Parameters
    ----------
    **adata** : *AnnData*
        > Created by `create_adata`.

    **n_features** : *int*
        > Number of random features to include when estimating sigma.
        Will be scaled for the whole pathway set according to a
        heuristic. Used for scalability.

    Returns
    -------
    **adata** : *AnnData*
        > Key added `adata.uns['sigma']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata.uns['sigma']
    array([10.4640895 , 10.82011454, 6.16769438, 9.86156855, ...])
    '''

    uns = adata.uns
    rng = uns['seed_obj']
    sigmas = []

    # Estimate one kernel width per feature grouping
    for feats in uns['group_dict'].values():

        # Downsample the group's features for scalability
        total = len(feats)
        chosen = rng.choice(np.array(list(feats)), min(n_features, total),
                            replace = False)

        # Sigma is estimated from the training cells only
        X = adata[uns['train_indices'], chosen].X
        X = _process_data(X_train = X,
                          scale_data = uns['scale_data'],
                          return_dense = True)

        # Subsample cells so the distance matrix stays tractable
        idx = rng.choice(np.arange(X.shape[0]), np.min((2000, X.shape[0])))
        sub = X[idx, :]

        # Mean pairwise distance under the configured metric
        sigma = scipy.spatial.distance.cdist(sub, sub,
                                             uns['distance_metric']).mean()

        # sigma = 0 is numerically unusable in later steps.
        # Using such a small sigma will result in wide distribution, and
        # typically a non-predictive Z
        if sigma == 0:
            sigma += 1e-5

        if n_features < total:
            # Heuristic we calculated to account for fewer features used in
            # distance calculation
            sigma *= total / n_features

        sigmas.append(sigma)

    uns['sigma'] = np.array(sigmas)

    return adata
Calculate kernel widths to inform distribution for projection of Fourier Features. Calculates one sigma per group of features.
Parameters
adata : AnnData
Created by `create_adata`.
n_features : int
Number of random features to include when estimating sigma. Will be scaled for the whole pathway set according to a heuristic. Used for scalability.
Returns
adata : AnnData
Key added: `adata.uns['sigma']`.
Examples
>>> adata = scmkl.estimate_sigma(adata)
>>> adata.uns['sigma']
array([10.4640895 , 10.82011454, 6.16769438, 9.86156855, ...])