scmkl.calculate_z

import numpy as np
import scipy.sparse
import anndata as ad
from sklearn.decomposition import TruncatedSVD, PCA
from scmkl.tfidf_normalize import _tfidf_train_test

def _sparse_var(X, axis = None):
    '''
    Function to calculate variance on a scipy sparse matrix.
    
    Parameters
    ----------
    X : A scipy sparse or numpy array
    axis : Determines which axis variance is calculated on. Same usage 
    as Numpy.
        axis = 0 => column variances
        axis = 1 => row variances
        axis = None => total variance (calculated on all data)
    
    Returns
    -------
    var : Variance values calculated over the given axis
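
    Examples
    --------
    A minimal sanity check on hypothetical data: the sparse 
    calculation should agree with `np.var` computed on the dense 
    equivalent.

    >>> X = scipy.sparse.random(100, 20, density = 0.3, 
    ...                         format = 'csr', random_state = 0)
    >>> bool(np.allclose(_sparse_var(X, axis = 0), 
    ...                  np.var(X.toarray(), axis = 0)))
    True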
    '''

    # E[X^2] - E[X]^2
    if scipy.sparse.issparse(X):
        exp_mean = (X.power(2).mean(axis = axis))
        sq_mean = np.square(X.mean(axis = axis))
        var = np.array(exp_mean - sq_mean)
    else:
        var = np.var(X, axis = axis)

    return var.ravel()


def _process_data(X_train, X_test = None, scale_data = True, 
                  return_dense = True):
    '''
    Function to preprocess a data matrix according to the type of 
    data (counts, e.g. RNA, or binary, e.g. ATAC). Test data is 
    processed according to parameters calculated from the training 
    data.
    
    Parameters
    ----------
    X_train : A scipy sparse or numpy array
    X_test : A scipy sparse or numpy array, or None. If None, only 
             the processed training data is returned.
    scale_data : If True, log transforms the data, then centers and 
                 scales it using means and standard deviations from 
                 the training data
    return_dense : If True, sparse inputs are converted to dense 
                   numpy arrays before being returned
    
    Returns
    -------
    X_train, X_test : Numpy arrays with the processed train/test data 
    respectively.
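
    Examples
    --------
    A minimal sketch on hypothetical sparse data:

    >>> X_tr = scipy.sparse.random(50, 10, density = 0.5, 
    ...                            format = 'csr', random_state = 0)
    >>> X_te = scipy.sparse.random(20, 10, density = 0.5, 
    ...                            format = 'csr', random_state = 1)
    >>> X_tr, X_te = _process_data(X_tr, X_te, scale_data = True)
    >>> X_tr.shape[1] == X_te.shape[1]
    True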
    '''
    if X_test is None:
        # Create a dummy test matrix so the calculations below run 
        # without increasing computational time
        X_test = X_train[:1, :]
        orig_test = None
    else:
        orig_test = 'given'

    # Remove features that have no variance in the training data 
    # (they will be uninformative)
    var = _sparse_var(X_train, axis = 0)
    variable_features = np.where(var > 1e-5)[0]

    X_train = X_train[:, variable_features]
    X_test = X_test[:, variable_features]

    # Data processing according to data type
    if scale_data:

        if scipy.sparse.issparse(X_train):
            X_train = X_train.log1p()
            X_test = X_test.log1p()
        else:
            X_train = np.log1p(X_train)
            X_test = np.log1p(X_test)
            
        # Center and scale count data
        train_means = np.mean(X_train, 0)
        train_sds = np.sqrt(var[variable_features])

        # Transform the test data according to parameters of the 
        # training data
        X_train = (X_train - train_means) / train_sds
        X_test = (X_test - train_means) / train_sds

    if return_dense and scipy.sparse.issparse(X_train):
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    if orig_test is None:
        return X_train
    else:
        return X_train, X_test


def calculate_z(adata, n_features = 5000) -> ad.AnnData:
    '''
    Function to calculate the Z matrix.

    Parameters
    ----------
    **adata** : *AnnData*
        > created by `create_adata()` with `'sigma'`, 
        `'train_indices'`, and `'test_indices'` in 
        `adata.uns.keys()`. The `'sigma'` key can be added by 
        running `estimate_sigma()` on adata. 

    **n_features** : *int* 
        > Number of random features to use when calculating Z; used 
        for scalability.

    Returns
    -------
    **adata** : *AnnData*
        > adata with Z matrices accessible with `adata.uns['Z_train']` 
        and `adata.uns['Z_test']`.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata.uns.keys()
    dict_keys(['Z_train', 'Z_test', 'sigma', 'train_indices', 
    'test_indices'])
    '''
    assert np.all(adata.uns['sigma'] > 0), 'Sigma must be positive'

    # Number of groupings taken from group_dict
    n_pathway = len(adata.uns['group_dict'].keys())
    D = adata.uns['D']

    # Capture training and testing indices
    train_idx = np.array(adata.uns['train_indices'], dtype = np.int_)
    test_idx = np.array(adata.uns['test_indices'], dtype = np.int_)

    # Create arrays to store the concatenated group Zs; each group of 
    # features has a corresponding block of columns in each array
    n_cols = 2 * adata.uns['D'] * n_pathway

    Z_train = np.zeros((train_idx.shape[0], n_cols))
    Z_test = np.zeros((test_idx.shape[0], n_cols))

    # Loop over each group, creating Z for each
    for m, group_features in enumerate(adata.uns['group_dict'].values()):
        
        # Extract features from the mth group
        num_group_features = len(group_features)

        # Sample up to n_features features; important for scalability 
        # when using large groupings. All features are used if the 
        # grouping contains fewer than n_features
        number_features = np.min([n_features, num_group_features])
        group_array = np.array(list(group_features))
        group_features = adata.uns['seed_obj'].choice(group_array, 
                                                      number_features, 
                                                      replace = False) 

        # Create data arrays containing only features within this group
        X_train = adata[adata.uns['train_indices'], :][:, group_features].X
        X_test = adata[adata.uns['test_indices'], :][:, group_features].X

        if adata.uns['tfidf']:
            X_train, X_test = _tfidf_train_test(X_train, X_test)

        # Data filtering and transformation. Low variance (< 1e-5) 
        # features are removed regardless of data type. If scale_data 
        # is True, the data is log transformed and z-scored
        X_train, X_test = _process_data(X_train = X_train, X_test = X_test, 
                                        scale_data = adata.uns['scale_data'], 
                                        return_dense = True)

        if adata.uns['reduction'].lower() == 'svd':

            SVD_func = TruncatedSVD(n_components = np.min([50, X_train.shape[1]]), 
                                    random_state = 1)
            
            # Remove the first component as it corresponds with 
            # sequencing depth
            X_train = SVD_func.fit_transform(scipy.sparse.csr_array(X_train))[:, 1:]
            X_test = SVD_func.transform(scipy.sparse.csr_array(X_test))[:, 1:]

        elif adata.uns['reduction'].lower() == 'pca':

            PCA_func = PCA(n_components = np.min([50, X_train.shape[1]]), 
                           random_state = 1)

            X_train = PCA_func.fit_transform(np.asarray(X_train))
            X_test = PCA_func.transform(np.asarray(X_test))

        elif adata.uns['reduction'].lower() == 'linear':

            # Draw a single sparse random projection and apply it to 
            # both train and test so they share the same mapping
            projection = adata.uns['seed_obj'].choice([0, 1], 
                                                      p = [0.02, 0.98], 
                                                      size = X_train.shape[1] * 50)
            projection = projection.reshape((X_train.shape[1], 50))

            X_train = X_train @ projection
            X_test = X_test @ projection

        if scipy.sparse.issparse(X_train):
            X_train = X_train.toarray().astype(np.float16)
            X_test = X_test.toarray().astype(np.float16)

        # Extract the pre-calculated sigma used for approximating 
        # the kernel
        adjusted_sigma = adata.uns['sigma'][m]

        # Sample W for the approximate kernel according to the chosen 
        # kernel function; the sampling distribution comes from the 
        # Fourier transform of the kernel function
        if adata.uns['kernel_type'].lower() == 'gaussian':

            gamma = 1 / (2 * adjusted_sigma ** 2)
            sigma_p = 0.5 * np.sqrt(2 * gamma)

            W = adata.uns['seed_obj'].normal(0, sigma_p, X_train.shape[1] * D)
            W = W.reshape((X_train.shape[1], D))

        elif adata.uns['kernel_type'].lower() == 'laplacian':

            gamma = 1 / (2 * adjusted_sigma)

            W = adata.uns['seed_obj'].standard_cauchy(X_train.shape[1] * D)
            W = gamma * W.reshape((X_train.shape[1], D))

        elif adata.uns['kernel_type'].lower() == 'cauchy':

            gamma = 1 / (2 * adjusted_sigma ** 2)
            b = 0.5 * np.sqrt(gamma)

            W = adata.uns['seed_obj'].laplace(0, b, X_train.shape[1] * D)
            W = W.reshape((X_train.shape[1], D))

        train_projection = np.matmul(X_train, W)
        test_projection = np.matmul(X_test, W)
        
        # Store the group Z in the whole-Z object, preserving order 
        # so that meaningful groups can be extracted later
        x_idx = np.arange(m * 2 * D, (m + 1) * 2 * D)
        sq_i_d = np.sqrt(1 / D)

        Z_train[:, x_idx] = sq_i_d * np.hstack((np.cos(train_projection), 
                                                np.sin(train_projection)))
        Z_test[:, x_idx] = sq_i_d * np.hstack((np.cos(test_projection), 
                                               np.sin(test_projection)))

    adata.uns['Z_train'] = Z_train
    adata.uns['Z_test'] = Z_test

    return adata
def calculate_z(adata, n_features=5000) -> anndata._core.anndata.AnnData:

Function to calculate the Z matrix.

Parameters

adata : AnnData

created by create_adata() with 'sigma', 'train_indices', and 'test_indices' in adata.uns.keys(). The 'sigma' key can be added by running estimate_sigma() on adata.

n_features : int

Number of random features to use when calculating Z; used for scalability.

Returns

adata : AnnData

adata with Z matrices accessible with adata.uns['Z_train'] and adata.uns['Z_test'].

Examples

>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata.uns.keys()
dict_keys(['Z_train', 'Z_test', 'sigma', 'train_indices', 
'test_indices'])
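
The cosine and sine features computed above are random Fourier features: with W drawn from the Fourier transform of a shift-invariant kernel, Z @ Z.T approximates the exact kernel matrix, and the approximation tightens as D grows. The following is a minimal, standalone sketch of that property mirroring the 'gaussian' branch of the source; the data and variable names are hypothetical, and it is not part of the scmkl API.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size = (100, 20))      # hypothetical data
sigma, D = 2.0, 2000

# Sample W as in the 'gaussian' branch of calculate_z()
gamma = 1 / (2 * sigma ** 2)
sigma_p = 0.5 * np.sqrt(2 * gamma)
W = rng.normal(0, sigma_p, size = (20, D))

# Z = sqrt(1/D) * [cos(XW), sin(XW)], as in calculate_z()
projection = X @ W
Z = np.sqrt(1 / D) * np.hstack((np.cos(projection), np.sin(projection)))

# For W ~ Normal(0, sigma_p), Z @ Z.T estimates the shift-invariant
# kernel K(x, y) = exp(-sigma_p**2 * ||x - y||**2 / 2)
sq_dists = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis = -1)
K = np.exp(-sigma_p ** 2 * sq_dists / 2)

max_abs_err = np.abs(Z @ Z.T - K).max()   # shrinks as D grows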