scmkl.calculate_z
```python
import numpy as np
import scipy
import anndata as ad
from sklearn.decomposition import TruncatedSVD, PCA
from scmkl.tfidf_normalize import _tfidf_train_test


def _sparse_var(X, axis = None):
    '''
    Function to calculate variance on a scipy sparse matrix.

    Parameters
    ----------
    X : A scipy sparse or numpy array
    axis : Determines which axis variance is calculated on. Same usage
        as Numpy.
        axis = 0 => column variances
        axis = 1 => row variances
        axis = None => total variance (calculated on all data)

    Returns
    -------
    var : Variance values calculated over the given axis
    '''

    # Var[X] = E[X^2] - E[X]^2
    if scipy.sparse.issparse(X):
        exp_mean = (X.power(2).mean(axis = axis))
        sq_mean = np.square(X.mean(axis = axis))
        var = np.array(exp_mean - sq_mean)
    else:
        var = np.var(X, axis = axis)

    return var.ravel()


def _process_data(X_train, X_test = None, scale_data = True,
                  return_dense = True):
    '''
    Function to preprocess a data matrix according to the type of data
    (counts, e.g. RNA, or binary, e.g. ATAC). Test data is processed
    using parameters calculated from the training data.

    Parameters
    ----------
    X_train : A scipy sparse or numpy array
    X_test : A scipy sparse or numpy array
    scale_data : If True, log transforms and z-scores the data using
        means and standard deviations from the training data.
    return_dense : If True, sparse arrays are converted to dense numpy
        arrays before being returned.

    Returns
    -------
    X_train, X_test : Numpy arrays with the processed train/test data
        respectively.
    '''
    if X_test is None:
        # Create a dummy test matrix so both code paths can be shared
        # without increasing computation time
        X_test = X_train[:1, :]
        orig_test = None
    else:
        orig_test = 'given'

    # Remove features that have no variance in the training data
    # (they would be uninformative)
    var = _sparse_var(X_train, axis = 0)
    variable_features = np.where(var > 1e-5)[0]

    X_train = X_train[:, variable_features]
    X_test = X_test[:, variable_features]

    # Data processing according to data type
    if scale_data:

        if scipy.sparse.issparse(X_train):
            X_train = X_train.log1p()
            X_test = X_test.log1p()
        else:
            X_train = np.log1p(X_train)
            X_test = np.log1p(X_test)

        # Center and scale count data
        train_means = np.mean(X_train, 0)
        train_sds = np.sqrt(var[variable_features])

        # Transform the test data according to parameters of the
        # training data
        X_train = (X_train - train_means) / train_sds
        X_test = (X_test - train_means) / train_sds

    if return_dense and scipy.sparse.issparse(X_train):
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    if orig_test is None:
        return X_train
    else:
        return X_train, X_test
```
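`_sparse_var` relies on the identity Var[X] = E[X^2] - E[X]^2 so that variances can be computed without densifying X. A quick way to confirm the sparse path agrees with numpy (an illustrative snippet, not part of the module):

```python
import numpy as np
import scipy.sparse

rng = np.random.default_rng(0)
dense = rng.poisson(1.0, size=(100, 20)).astype(float)
sparse = scipy.sparse.csr_matrix(dense)

# Column variances via the E[X^2] - E[X]^2 path used by _sparse_var
sparse_var = np.array(sparse.power(2).mean(axis = 0)
                      - np.square(sparse.mean(axis = 0))).ravel()

assert np.allclose(sparse_var, np.var(dense, axis = 0))
```

The module's main entry point, `calculate_z`, follows.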
```python
def calculate_z(adata, n_features = 5000) -> ad.AnnData:
    '''
    Function to calculate the Z matrices used to approximate each
    group's kernel.

    Parameters
    ----------
    **adata** : *AnnData*
        > created by `create_adata()` with `adata.uns.keys()`
        `'sigma'`, `'train_indices'`, and `'test_indices'`.
        The `'sigma'` key can be added by running `estimate_sigma()`
        on adata.

    **n_features** : *int*
        > Number of random features to use when calculating Z; used
        for scalability.

    Returns
    -------
    **adata** : *AnnData*
        > adata with Z matrices accessible with `adata.uns['Z_train']`
        and `adata.uns['Z_test']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata.uns.keys()
    dict_keys(['Z_train', 'Z_test', 'sigma', 'train_indices',
    'test_indices'])
    '''
    assert np.all(adata.uns['sigma'] > 0), 'Sigma must be positive'

    # Number of groupings taken from group_dict
    n_pathway = len(adata.uns['group_dict'].keys())
    D = adata.uns['D']

    # Capture training and testing indices
    train_idx = np.array(adata.uns['train_indices'], dtype = np.int_)
    test_idx = np.array(adata.uns['test_indices'], dtype = np.int_)

    # Create arrays to store the concatenated group Zs
    # Each group of features has a corresponding entry in each array
    n_cols = 2 * adata.uns['D'] * n_pathway

    Z_train = np.zeros((train_idx.shape[0], n_cols))
    Z_test = np.zeros((test_idx.shape[0], n_cols))

    # Loop over the groups, creating Z for each
    for m, group_features in enumerate(adata.uns['group_dict'].values()):

        # Extract features from the mth group
        num_group_features = len(group_features)

        # Sample up to n_features features- important for scalability
        # when using large groupings
        # All features are used if the grouping contains fewer than
        # n_features
        number_features = np.min([n_features, num_group_features])
        group_array = np.array(list(group_features))
        group_features = adata.uns['seed_obj'].choice(group_array,
                                                      number_features,
                                                      replace = False)

        # Create data arrays containing only features within this group
        X_train = adata[adata.uns['train_indices'], :][:, group_features].X
        X_test = adata[adata.uns['test_indices'], :][:, group_features].X

        if adata.uns['tfidf']:
            X_train, X_test = _tfidf_train_test(X_train, X_test)

        # Data filtering and transformation
        # Low-variance (< 1e-5) features are removed regardless of
        # data type; if scale_data is True, the data is log
        # transformed and z-scored
        X_train, X_test = _process_data(X_train = X_train, X_test = X_test,
                                        scale_data = adata.uns['scale_data'],
                                        return_dense = True)

        if adata.uns['reduction'].lower() == 'svd':

            SVD_func = TruncatedSVD(n_components = np.min([50, X_train.shape[1]]),
                                    random_state = 1)

            # Remove first component as it corresponds with sequencing
            # depth
            X_train = SVD_func.fit_transform(scipy.sparse.csr_array(X_train))[:, 1:]
            X_test = SVD_func.transform(scipy.sparse.csr_array(X_test))[:, 1:]

        elif adata.uns['reduction'].lower() == 'pca':

            PCA_func = PCA(n_components = np.min([50, X_train.shape[1]]),
                           random_state = 1)

            X_train = PCA_func.fit_transform(np.asarray(X_train))
            X_test = PCA_func.transform(np.asarray(X_test))

        elif adata.uns['reduction'] == 'linear':

            # Random projection to 50 dimensions; the same projection
            # matrix must be applied to the train and test data
            proj = adata.uns['seed_obj'].choice([0, 1], p = [0.02, 0.98],
                                                size = X_train.shape[1] * 50)
            proj = proj.reshape((X_train.shape[1], 50))

            X_train = X_train @ proj
            X_test = X_test @ proj

        if scipy.sparse.issparse(X_train):
            X_train = X_train.toarray().astype(np.float16)
            X_test = X_test.toarray().astype(np.float16)

        # Extract the pre-calculated sigma used for approximating the
        # kernel
        adjusted_sigma = adata.uns['sigma'][m]

        # Calculate the approximate kernel according to the chosen
        # kernel function
        # W is sampled from the Fourier transform (spectral density)
        # of the kernel function
        if adata.uns['kernel_type'].lower() == 'gaussian':

            gamma = 1 / (2 * adjusted_sigma ** 2)
            sigma_p = 0.5 * np.sqrt(2 * gamma)

            W = adata.uns['seed_obj'].normal(0, sigma_p, X_train.shape[1] * D)
            W = W.reshape((X_train.shape[1], D))

        elif adata.uns['kernel_type'].lower() == 'laplacian':

            gamma = 1 / (2 * adjusted_sigma)

            W = adata.uns['seed_obj'].standard_cauchy(X_train.shape[1] * D)
            W = gamma * W.reshape((X_train.shape[1], D))

        elif adata.uns['kernel_type'].lower() == 'cauchy':

            gamma = 1 / (2 * adjusted_sigma ** 2)
            b = 0.5 * np.sqrt(gamma)

            W = adata.uns['seed_obj'].laplace(0, b, X_train.shape[1] * D)
            W = W.reshape((X_train.shape[1], D))

        train_projection = np.matmul(X_train, W)
        test_projection = np.matmul(X_test, W)

        # Store the group Z in the whole-Z object
        # Column order is preserved so meaningful groups can be
        # extracted later
        x_idx = np.arange(m * 2 * D, (m + 1) * 2 * D)
        sq_i_d = np.sqrt(1 / D)

        Z_train[:, x_idx] = sq_i_d * np.hstack((np.cos(train_projection),
                                                np.sin(train_projection)))
        Z_test[:, x_idx] = sq_i_d * np.hstack((np.cos(test_projection),
                                               np.sin(test_projection)))

    adata.uns['Z_train'] = Z_train
    adata.uns['Z_test'] = Z_test

    return adata
```
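The `np.cos`/`np.sin` block at the end of `calculate_z` is a random Fourier feature map (Rahimi & Recht): the data is projected through a random matrix W drawn from the kernel's spectral density, so that `Z @ Z.T` approximates the exact kernel matrix. A minimal standalone sketch of that idea, using the textbook Gaussian parameterization `W ~ N(0, 1/sigma**2)` (scmkl's `sigma_p = 0.5 * np.sqrt(2 * gamma)` differs by a constant factor, so this is an illustration rather than a reproduction of the code above):

```python
import numpy as np

rng = np.random.default_rng(0)
n_cells, n_genes, sigma, D = 30, 10, 1.5, 20000

X = rng.normal(size=(n_cells, n_genes))

# Draw W from the spectral density of the Gaussian kernel
# k(x, y) = exp(-||x - y||^2 / (2 * sigma^2))
W = rng.normal(0, 1 / sigma, size=(n_genes, D))
projection = X @ W
Z = np.sqrt(1 / D) * np.hstack((np.cos(projection), np.sin(projection)))

# Exact kernel matrix for comparison
sq_dists = np.square(X[:, None, :] - X[None, :, :]).sum(axis=2)
K_exact = np.exp(-sq_dists / (2 * sigma ** 2))

# Z @ Z.T converges to K_exact as D grows
print(np.abs(Z @ Z.T - K_exact).max())
```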
```python
def calculate_z(adata, n_features=5000) -> anndata._core.anndata.AnnData:
```
Function to calculate the Z matrices used to approximate each group's kernel.

Parameters
----------
**adata** : *AnnData*
> created by `create_adata()` with `adata.uns.keys()` `'sigma'`, `'train_indices'`, and `'test_indices'`. The `'sigma'` key can be added by running `estimate_sigma()` on adata.

**n_features** : *int*
> Number of random features to use when calculating Z; used for scalability.

Returns
-------
**adata** : *AnnData*
> adata with Z matrices accessible with `adata.uns['Z_train']` and `adata.uns['Z_test']`.

Examples
--------
>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata.uns.keys()
dict_keys(['Z_train', 'Z_test', 'sigma', 'train_indices', 'test_indices'])
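Because each group occupies a contiguous block of `2 * D` columns in `Z_train` and `Z_test` (the `x_idx` bookkeeping in the source above), the mapping for a single group can be sliced back out after the fact. A small sketch of that bookkeeping; `group_z_block` is a hypothetical helper, not part of scmkl:

```python
import numpy as np

def group_z_block(adata, group_name):
    # Hypothetical convenience function: recover the 2 * D columns of
    # Z_train belonging to one group, using the column order laid down
    # by calculate_z
    m = list(adata.uns['group_dict'].keys()).index(group_name)
    D = adata.uns['D']
    x_idx = np.arange(m * 2 * D, (m + 1) * 2 * D)
    return adata.uns['Z_train'][:, x_idx]
```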