scmkl.estimate_sigma
```python
import numpy as np
import scipy.sparse
import scipy.spatial
from sklearn.decomposition import TruncatedSVD, PCA

from scmkl.calculate_z import _process_data
from scmkl.tfidf_normalize import _tfidf


def estimate_sigma(adata, n_features = 5000):
    '''
    Calculate kernel widths to inform the distribution used for the
    projection of random Fourier features. One sigma is calculated
    per group of features.

    Parameters
    ----------
    **adata** : *AnnData*
        > Created by `create_adata`.

    **n_features** : *int*
        > Number of random features to include when estimating sigma.
        Will be scaled for the whole pathway set according to a
        heuristic. Used for scalability.

    Returns
    -------
    **adata** : *AnnData*
        > Key added: `adata.uns['sigma']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata.uns['sigma']
    array([10.4640895 , 10.82011454, 6.16769438, 9.86156855, ...])
    '''
    sigma_list = []

    # Loop over every group in group_dict
    for group_features in adata.uns['group_dict'].values():

        # Select only features in that group and downsample for scalability
        num_group_features = len(group_features)
        group_array = np.array(list(group_features))
        n_feats = min(n_features, num_group_features)
        group_features = adata.uns['seed_obj'].choice(group_array, n_feats,
                                                      replace = False)

        # Use only the training data to estimate sigma
        X_train = adata[adata.uns['train_indices'], group_features].X

        # Sample cells for scalability
        sample_idx = np.arange(X_train.shape[0])
        n_samples = min(2000, X_train.shape[0])
        distance_indices = adata.uns['seed_obj'].choice(sample_idx, n_samples,
                                                        replace = False)

        X_train = _process_data(X_train = X_train,
                                scale_data = adata.uns['scale_data'],
                                return_dense = True)

        if adata.uns['tfidf']:
            X_train = _tfidf(X_train, mode = 'normalize')

        reduction = adata.uns['reduction'].lower()

        if reduction == 'svd':
            svd = TruncatedSVD(n_components = min(50, X_train.shape[1]),
                               random_state = 1)
            X_train = svd.fit_transform(
                scipy.sparse.csr_array(X_train[distance_indices, :]))
            # Drop the first component, which is commonly dominated by
            # sequencing depth
            X_train = X_train[:, 1:]

        elif reduction == 'pca':
            pca = PCA(n_components = min(50, X_train.shape[1]),
                      random_state = 1)
            X_train = pca.fit_transform(
                np.asarray(X_train[distance_indices, :]))

        elif reduction == 'linear':
            # Random binary projection of the group's features down to
            # 50 dimensions
            projection = adata.uns['seed_obj'].choice(
                [0, 1], size = X_train.shape[1] * 50,
                p = [0.02, 0.98]).reshape((X_train.shape[1], 50))
            X_train = X_train[distance_indices, :] @ projection

        else:
            X_train = X_train[distance_indices, :]

        if scipy.sparse.issparse(X_train):
            X_train = X_train.toarray()

        # Calculate the pairwise distance matrix with the specified metric
        sigma = scipy.spatial.distance.cdist(X_train,
                                             X_train,
                                             adata.uns['distance_metric'])
        sigma = np.mean(sigma)

        # sigma = 0 is numerically unusable in later steps; such a small
        # sigma yields a very wide sampling distribution and typically a
        # non-predictive Z, but avoids division by zero
        if sigma == 0:
            sigma = 1e-5

        if n_features < num_group_features:
            # Heuristic we calculated to account for the features excluded
            # from the distance calculation
            sigma = sigma * num_group_features / n_features

        sigma_list.append(sigma)

    adata.uns['sigma'] = np.array(sigma_list)

    return adata
```
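Stripped of the AnnData bookkeeping, the estimate for one group reduces to the mean pairwise distance over a subsample of cells. Below is a minimal, self-contained sketch of that core step on toy data; the array names are illustrative and not part of the scmkl API:

```python
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)

# Toy cells x features matrix standing in for one group's training data
X = rng.poisson(1.0, size = (10_000, 300)).astype(float)

# Subsample cells for scalability, mirroring the 2000-cell cap in the source
idx = rng.choice(X.shape[0], size = min(2000, X.shape[0]), replace = False)

# Kernel width estimate: mean pairwise distance under the chosen metric
sigma = cdist(X[idx], X[idx], 'euclidean').mean()
```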
```python
def estimate_sigma(adata, n_features = 5000):
```
Calculates kernel widths that inform the distribution used for the projection of random Fourier features. One sigma is calculated per group of features.
Parameters
----------
**adata** : *AnnData*

> Created by `create_adata`.
**n_features** : *int*

> Number of random features to include when estimating sigma. Will be scaled for the whole pathway set according to a heuristic. Used for scalability.
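For intuition on that heuristic: when a group contains more features than `n_features`, the sigma estimated on the feature subsample is scaled up by the ratio of group size to sample size, exactly as in the source above. A toy calculation with illustrative values:

```python
num_group_features = 20_000   # features in the full group
n_features = 5_000            # features actually used in the estimate
sigma_subsample = 12.5        # toy value from the distance calculation

# Scale the subsample estimate up to the full group
sigma = sigma_subsample * num_group_features / n_features
print(sigma)   # 50.0
```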
Returns
-------
**adata** : *AnnData*

> Key added: `adata.uns['sigma']`.
Examples
--------
```python
>>> adata = scmkl.estimate_sigma(adata)
>>> adata.uns['sigma']
array([10.4640895 , 10.82011454, 6.16769438, 9.86156855, ...])
```
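Downstream, each group's sigma parameterizes the distribution that the random Fourier feature projection is drawn from (see `calculate_z`). As a rough, self-contained illustration of that connection only, here is the standard Gaussian-kernel random Fourier feature construction; this is an assumption about the general technique, not necessarily scmkl's exact implementation:

```python
import numpy as np

rng = np.random.default_rng(0)

sigma = 10.46      # one entry of adata.uns['sigma']
D = 100            # number of random Fourier features
n_feats = 200      # number of features in the corresponding group

# Wider kernels (larger sigma) give lower-frequency projections,
# so the weights are sampled with scale 1 / sigma
W = rng.normal(0, 1 / sigma, size = (n_feats, D))
b = rng.uniform(0, 2 * np.pi, size = D)

X = rng.normal(size = (500, n_feats))     # toy cells x features matrix
Z = np.sqrt(2 / D) * np.cos(X @ W + b)    # Z @ Z.T approximates the kernel
```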