scmkl.calculate_z
```python
import numpy as np
import scipy
import anndata as ad

from scmkl.tfidf_normalize import tfidf_train_test
from scmkl.estimate_sigma import est_group_sigma, get_batches
from scmkl.data_processing import process_data, get_group_mat, sample_cells
from scmkl.projections import gaussian_trans, laplacian_trans, cauchy_trans


def get_z_indices(m, D):
    """
    Takes the index of the group being processed, `m`, and returns the
    column indices in the overall Z matrix where that group's cos and
    sin projections are stored.

    Parameters
    ----------
    m : int
        The position of the group in processing order.

    D : int
        The number of dimensions per group.

    Returns
    -------
    cos_idx, sin_idx : np.ndarray, np.ndarray
        The indices of the cos and sin projection columns in the
        overall Z matrix.
    """
    x_idx = np.arange(m*2*D, (m + 1)*2*D)
    cos_idx = x_idx[:len(x_idx)//2]
    sin_idx = x_idx[len(x_idx)//2:]

    return cos_idx, sin_idx


def calc_groupz(X_train, X_test, adata, D, sigma, proj_func):
    """
    Calculates the training and testing projection matrices for a
    single group.

    Parameters
    ----------
    X_train : np.ndarray
        The filtered training data matrix to calculate the train Z
        matrix for.

    X_test : np.ndarray
        The filtered testing data matrix to calculate the test Z
        matrix for.

    adata : anndata.AnnData
        AnnData object containing `seed_obj` in its `.uns` attribute.

    D : int
        Number of dimensions per grouping.

    sigma : float
        Kernel width for the grouping.

    proj_func : function
        The projection function applied to the data.

    Returns
    -------
    train_projection, test_projection : np.ndarray, np.ndarray
        Training and testing projection matrices for the group;
        `calculate_z()` applies cos and sin to these to form the
        group's Z blocks.
    """
    if scipy.sparse.issparse(X_train):
        X_train = X_train.toarray().astype(np.float16)
        X_test = X_test.toarray().astype(np.float16)

    W = proj_func(X_train, sigma, adata.uns['seed_obj'], D)

    train_projection = np.matmul(X_train, W)
    test_projection = np.matmul(X_test, W)

    return train_projection, test_projection


def calculate_z(adata, n_features=5000, batches=10,
                batch_size=100) -> ad.AnnData:
    """
    Calculates the Z matrices for all groups in both the training and
    testing data.

    Parameters
    ----------
    adata : ad.AnnData
        AnnData object created by `scmkl.create_adata()` whose
        `adata.uns` contains `'train_indices'` and `'test_indices'`.

    n_features : int
        Number of random features to use when calculating Z; used for
        scalability.

    batches : int
        The number of batches to use for the distance calculation.
        The results of `batches` distance calculations over
        `batch_size` randomly sampled cells are averaged. More batches
        converge to the population distance values at the cost of
        scalability.

    batch_size : int
        The number of cells to include per batch for the distance
        calculations. A higher batch size converges to the population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`,
        `batch_size` will be reduced to
        `int(num_training_cells / batches)`.

    Returns
    -------
    adata : ad.AnnData
        `adata` with Z matrices accessible via `adata.uns['Z_train']`
        and `adata.uns['Z_test']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata.uns.keys()
    dict_keys(['Z_train', 'Z_test', 'sigmas', 'train_indices',
    'test_indices'])
    """
    # Number of groupings taken from group_dict
    n_pathway = len(adata.uns['group_dict'].keys())
    D = adata.uns['D']

    sq_i_d = np.sqrt(1/D)

    # Capturing training and testing sizes
    train_len = len(adata.uns['train_indices'])
    test_len = len(adata.uns['test_indices'])

    if batch_size * batches > len(adata.uns['train_indices']):
        old_batch_size = batch_size
        batch_size = int(len(adata.uns['train_indices'])/batches)
        print("Specified batch size required too many cells for "
              "independent batches. Reduced batch size from "
              f"{old_batch_size} to {batch_size}")

    if 'sigma' not in adata.uns.keys():
        n_samples = np.min((2000, adata.uns['train_indices'].shape[0]))
        sample_range = np.arange(n_samples)
        batch_idx = get_batches(sample_range, adata.uns['seed_obj'],
                                batches=batches, batch_size=batch_size)
        sigma_indices = sample_cells(adata.uns['train_indices'],
                                     n_samples, adata.uns['seed_obj'])

    # Create arrays to store concatenated group Zs
    # Each group of features will have a corresponding entry in each array
    n_cols = 2*adata.uns['D']*n_pathway
    Z_train = np.zeros((train_len, n_cols))
    Z_test = np.zeros((test_len, n_cols))

    # Setting kernel function
    match adata.uns['kernel_type'].lower():
        case 'gaussian':
            proj_func = gaussian_trans
        case 'laplacian':
            proj_func = laplacian_trans
        case 'cauchy':
            proj_func = cauchy_trans

    # Loop over each group and create its Z
    sigma_list = list()
    for m, group_features in enumerate(adata.uns['group_dict'].values()):

        n_group_features = len(group_features)

        X_train, X_test = get_group_mat(adata, n_features, group_features,
                                        n_group_features, process_test=True)

        if adata.uns['tfidf']:
            X_train, X_test = tfidf_train_test(X_train, X_test)

        # Data filtering and transformation according to given data_type
        # Will remove low-variance (< 1e-5) features regardless of data_type
        # If scale_data, will log scale and z-score the data
        X_train, X_test = process_data(X_train=X_train, X_test=X_test,
                                       scale_data=adata.uns['scale_data'],
                                       return_dense=True)

        # Getting sigma
        if 'sigma' in adata.uns.keys():
            sigma = adata.uns['sigma'][m]
        else:
            sigma = est_group_sigma(adata, X_train, n_group_features,
                                    n_features, batch_idx=batch_idx)
            sigma_list.append(sigma)

        assert sigma > 0, "Sigma must be greater than 0"
        train_projection, test_projection = calc_groupz(X_train, X_test,
                                                        adata, D, sigma,
                                                        proj_func)

        # Store group Z in whole-Z object
        # Preserves order to be able to extract meaningful groups
        cos_idx, sin_idx = get_z_indices(m, D)

        Z_train[0:, cos_idx] = np.cos(train_projection)
        Z_train[0:, sin_idx] = np.sin(train_projection)

        Z_test[0:, cos_idx] = np.cos(test_projection)
        Z_test[0:, sin_idx] = np.sin(test_projection)

    adata.uns['Z_train'] = Z_train*sq_i_d
    adata.uns['Z_test'] = Z_test*sq_i_d

    if 'sigma' not in adata.uns.keys():
        adata.uns['sigma'] = np.array(sigma_list)

    return adata
```
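The module builds a random Fourier feature (RFF) approximation: for each group, `Z = sqrt(1/D) * [cos(X @ W), sin(X @ W)]`, so that `Z_train @ Z_train.T` approximates the group's kernel matrix. Below is a minimal, self-contained numpy sketch of that identity for the Gaussian kernel, assuming projection directions drawn with scale `1/sigma`; the exact sampling scheme inside `scmkl.projections.gaussian_trans` may differ.

```python
import numpy as np

rng = np.random.default_rng(0)
n, p, D, sigma = 200, 10, 2000, 2.0
X = rng.normal(size=(n, p))

# Sample projection directions for the Gaussian kernel:
# w ~ N(0, sigma^-2 I) so that E[cos(w @ (x - y))] = exp(-||x - y||^2 / (2 sigma^2))
W = rng.normal(scale=1.0 / sigma, size=(p, D))
proj = X @ W
Z = np.sqrt(1.0 / D) * np.hstack([np.cos(proj), np.sin(proj)])

# Exact Gaussian kernel for comparison
sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
K_exact = np.exp(-sq_dists / (2 * sigma ** 2))

print(np.abs(Z @ Z.T - K_exact).max())  # small, and shrinks as D grows
```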
def get_z_indices(m, D):
Takes the index of the group being processed, `m`, and returns the column indices in the overall Z matrix where that group's cos and sin projections are stored.

Parameters
- m (int): The position of the group in processing order.
- D (int): The number of dimensions per group.

Returns
- cos_idx, sin_idx (np.ndarray, np.ndarray): The indices of the cos and sin projection columns in the overall Z matrix.
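This follows directly from the arithmetic in the source above: each group occupies `2*D` consecutive columns of Z, the first half for cos and the second half for sin. For example, with `D=2`:

```python
>>> get_z_indices(0, D=2)
(array([0, 1]), array([2, 3]))
>>> get_z_indices(1, D=2)
(array([4, 5]), array([6, 7]))
```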
def calc_groupz(X_train, X_test, adata, D, sigma, proj_func):
Calculates the training and testing projection matrices for a single group.

Parameters
- X_train (np.ndarray): The filtered training data matrix to calculate the train Z matrix for.
- X_test (np.ndarray): The filtered testing data matrix to calculate the test Z matrix for.
- adata (anndata.AnnData): AnnData object containing `seed_obj` in its `.uns` attribute.
- D (int): Number of dimensions per grouping.
- sigma (float): Kernel width for the grouping.
- proj_func (function): The projection function applied to the data.

Returns
- train_projection, test_projection (np.ndarray, np.ndarray): Training and testing projection matrices for the group; `calculate_z()` applies cos and sin to these to form the group's Z blocks.
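The projection functions themselves live in `scmkl.projections` and are not documented on this page. Purely as an illustration of the call signature `calc_groupz` expects, a Gaussian-kernel projection function might be sketched as follows, assuming `seed_obj` is a `numpy.random.Generator`; the actual `gaussian_trans` may differ.

```python
import numpy as np

def gaussian_trans_sketch(X, sigma, seed_obj, D):
    # Hypothetical stand-in for scmkl.projections.gaussian_trans:
    # draw D projection directions (one column per random feature),
    # scaled by 1/sigma, so that X @ W feeds the cos/sin features of
    # a Gaussian-kernel random Fourier feature approximation.
    return seed_obj.normal(scale=1.0 / sigma, size=(X.shape[1], D))
```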
def calculate_z(adata, n_features=5000, batches=10, batch_size=100) -> ad.AnnData:
Calculates the Z matrices for all groups in both the training and testing data.

Parameters
- adata (ad.AnnData): AnnData object created by `scmkl.create_adata()` whose `adata.uns` contains `'train_indices'` and `'test_indices'`.
- n_features (int): Number of random features to use when calculating Z; used for scalability.
- batches (int): The number of batches to use for the distance calculation. The results of `batches` distance calculations over `batch_size` randomly sampled cells are averaged. More batches converge to the population distance values at the cost of scalability.
- batch_size (int): The number of cells to include per batch for the distance calculations. A higher batch size converges to the population distance values at the cost of scalability. If `batches * batch_size > num_training_cells`, `batch_size` is reduced to `int(num_training_cells / batches)`.

Returns
- adata (ad.AnnData): `adata` with Z matrices accessible via `adata.uns['Z_train']` and `adata.uns['Z_test']`.

Examples
>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata.uns.keys()
dict_keys(['Z_train', 'Z_test', 'sigmas', 'train_indices',
'test_indices'])
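Because column order is preserved per group, one group's block of Z can be recovered with `get_z_indices`. An illustrative continuation of the example above; the slicing below is an editor's sketch, not a dedicated scmkl API.

```python
>>> import numpy as np
>>> from scmkl.calculate_z import get_z_indices
>>> cos_idx, sin_idx = get_z_indices(0, adata.uns['D'])
>>> # (n_train_cells, 2*D) block for the first group in group_dict
>>> group_z = adata.uns['Z_train'][:, np.concatenate((cos_idx, sin_idx))]
```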