scmkl.create_adata

import numpy as np
import anndata as ad


def _filter_features(X, feature_names, group_dict, remove_features):
    '''
    Function to remove unused features from X matrix. Any features not 
    included in group_dict will be removed from the matrix. Also puts 
    the features in the same relative order (of included features).
    
    Parameters
    ----------
    X : Data array. Can be a Numpy array or Scipy sparse array.
    feature_names : Numpy array of corresponding feature names.
    group_dict : Dictionary containing feature grouping information.
                 Example: {geneset: np.array([gene_1, gene_2, ..., 
                 gene_n])}
    remove_features : If True, subset X and feature_names to only the 
                      features present in group_dict.

    Returns
    -------
    X : Data array containing data only for features in the group_dict.
    feature_names : Numpy array of corresponding feature names from 
                    group_dict.
    group_dict : Dictionary with groupings restricted to features 
                 present in feature_names; groups with fewer than two 
                 features are dropped.
    '''
    assert X.shape[1] == len(feature_names), ("Given features do not "
                                              "correspond with features in X")

    group_features = set()
    feature_set = set(feature_names)

    # Store all features in the dictionary in one set
    for group in group_dict.keys():
        group_features.update(set(group_dict[group]))

        # Finds intersection between group features and features in data
        # Converts to np.ndarray and sorts to preserve order of feature names
        group_feats = list(feature_set.intersection(set(group_dict[group])))
        group_dict[group] = np.sort(np.array(group_feats))

    # Only keeping groupings that have at least two features
    group_dict = {group : group_dict[group] for group in group_dict.keys()
                  if len(group_dict[group]) > 1}

    if remove_features:
        # Find location of desired features in whole feature set
        g_features = np.array(list(group_features))
        group_feature_indices = np.where(np.isin(feature_names, 
                                                 g_features, 
                                                 assume_unique = True))[0]

        # Subset only the desired features and data
        X = X[:, group_feature_indices]
        feature_names = np.array(list(feature_names))[group_feature_indices]

    return X, feature_names, group_dict
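
A minimal doctest-style sketch of the filtering behavior (the toy matrix, 
gene names, and grouping below are hypothetical):

>>> X = np.array([[1, 2, 3], [4, 5, 6]])
>>> feature_names = np.array(['gene_a', 'gene_b', 'gene_c'])
>>> group_dict = {'toy_set': np.array(['gene_a', 'gene_c', 'gene_z'])}
>>> X, feature_names, group_dict = _filter_features(X, feature_names,
...                                                 group_dict, True)
>>> feature_names.tolist()
['gene_a', 'gene_c']
>>> X.shape
(2, 2)

'gene_z' is dropped from the grouping because it is absent from 
feature_names, and 'gene_b' is dropped from X because it appears in no 
grouping.
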
def _multi_class_split(y, train_ratio = 0.8, class_threshold = 'median', 
                       seed_obj = np.random.default_rng(100)):
    '''
    Function for calculating the training and testing cell positions 
    for multiclass data sets.

    Parameters
    ----------
    **y** : *np.ndarray* | *pd.Series* | *list*
        > Should be an iterable object corresponding to samples in 
        `ad.AnnData` object.

    **seed_obj** : *numpy.random._generator.Generator*
        > Seed used to randomly sample and split data.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. 
        Note: if a threshold is applied, the ratio of training samples 
        may decrease depending on class balance and the 
        `class_threshold` parameter.

    **class_threshold** : *str* | *int*
        > If of type `int`, classes with more training samples than 
        class_threshold will be downsampled to class_threshold. If 
        `'median'`, classes will be downsampled to the median number 
        of training samples per class.

    Returns
    -------
    **train_indices** : *np.ndarray*
        > Indices for training samples.

    **test_indices** : *np.ndarray*
        > Indices for testing samples.
    '''
    uniq_labels = np.unique(y)

    # Finding indices for each cell class
    class_positions = {class_ : np.where(y == class_)[0] 
                       for class_ in uniq_labels}

    # Capturing training indices while maintaining original class proportions
    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
                                              int(len(class_positions[class_])
                                                  * train_ratio), 
                                              replace = False)
                     for class_ in class_positions.keys()}

    # Capturing testing indices while maintaining original class proportions
    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
                                          train_samples[class_])
                    for class_ in class_positions.keys()}

    # Applying threshold for samples per class
    if class_threshold == 'median':
        all_train = [idx for class_ in train_samples.keys()
                         for idx in train_samples[class_]]
        _, class_threshold = np.unique(y[all_train], return_counts = True)
        class_threshold = int(np.median(class_threshold))

    for class_ in train_samples.keys():
        if len(train_samples[class_]) > class_threshold:
            # Downsample without replacement to avoid duplicate indices
            train_samples[class_] = seed_obj.choice(train_samples[class_], 
                                                    class_threshold,
                                                    replace = False)

    train_indices = np.array([idx for class_ in train_samples.keys()
                                  for idx in train_samples[class_]])

    test_indices = np.array([idx for class_ in test_samples.keys()
                                 for idx in test_samples[class_]])

    return train_indices, test_indices
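
A hedged doctest-style sketch with toy labels (the counts follow 
directly from the defaults above):

>>> y = np.array(['A'] * 10 + ['B'] * 10)
>>> train_idx, test_idx = _multi_class_split(y)
>>> len(train_idx), len(test_idx)
(16, 4)

With train_ratio = 0.8, int(10 * 0.8) = 8 cells per class are sampled 
without replacement for training; the remaining 2 per class become test 
cells, and the median threshold (8 here) leaves the training set 
unchanged.
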
def _binary_split(y, train_indices = None, train_ratio = 0.8,
                  seed_obj = np.random.default_rng(100)):
    '''
    Function to calculate training and testing indices for a given 
    dataset. If train indices are given, it will calculate the test 
    indices. If train_indices is None, it calculates both sets of 
    indices, preserving the ratio of each label in y.

    Parameters
    ----------
    y : Numpy array of cell labels. Can have any number of classes for 
        this function.
    train_indices : Optional array of pre-determined training indices.
    seed_obj : Numpy random generator used for random processes. Can be 
               specified for reproducibility or set by default.
    train_ratio : Decimal value ratio of samples in the training set to 
                  the entire data set.
    
    Returns
    -------
    train_indices : Array of indices of training cells.
    test_indices : Array of indices of testing cells.
    '''

    # If train indices aren't provided
    if train_indices is None:

        unique_labels = np.unique(y)
        train_indices = []

        for label in unique_labels:

            # Find indices of each unique label
            label_indices = np.where(y == label)[0]

            # Sample these indices according to train ratio
            n = int(len(label_indices) * train_ratio)
            train_label_indices = seed_obj.choice(label_indices, n, 
                                                  replace = False)
            train_indices.extend(train_label_indices)
    else:
        assert len(train_indices) <= len(y), ("More train indices than there "
                                              "are samples")

    train_indices = np.array(train_indices)

    # Test indices are the indices not in train_indices
    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
                                assume_unique = True)

    return train_indices, test_indices
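
The same sketch for the binary splitter (toy labels only):

>>> y = np.array(['case'] * 5 + ['control'] * 5)
>>> train_idx, test_idx = _binary_split(y)
>>> len(train_idx), len(test_idx)
(8, 2)

int(5 * 0.8) = 4 cells per label are sampled for training, and 
np.setdiff1d assigns the remaining 2 cells to the test set.
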
def calculate_d(num_samples : int):
    '''
    This function calculates the optimal number of dimensions for 
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    **num_samples** : *int*
        > The number of samples in the data set including both training
        and testing sets.

    Returns
    -------
    **d** : *int*
        > The optimal number of dimensions to run scMKL with the given 
        data set.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    d = int(np.sqrt(num_samples) * np.log(np.log(num_samples)))
    return d
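
The formula is d = floor(sqrt(N) * ln(ln(N))), which grows slowly with 
the number of samples N. A round-number check (verifiable by hand, since 
np.log is the natural log):

>>> calculate_d(10_000)
222

Here sqrt(10000) = 100 and ln(ln(10000)) ≈ 2.2203, so 
int(100 * 2.2203) = 222.
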
def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray, 
                 group_dict: dict, scale_data: bool = True, 
                 split_data : np.ndarray | None = None, D : int | None = None, 
                 remove_features = True, train_ratio = 0.8,
                 distance_metric = 'euclidean', kernel_type = 'Gaussian', 
                 random_state : int = 1, allow_multiclass : bool = False, 
                 class_threshold : str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    '''
    Function to create an AnnData object to carry all relevant 
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* | 
            *pd.DataFrame*
        > A data matrix of cells by features (sparse array 
        recommended for large datasets).

    **feature_names** : *np.ndarray*
        > Array of feature names corresponding with the features 
        in X.

    **cell_labels** : *np.ndarray*
        > A numpy array of cell phenotypes corresponding with 
        the cells in X.

    **group_dict** : *dict* 
        > Dictionary containing feature grouping information.
            - Example: {geneset: np.array([gene_1, gene_2, ..., 
                        gene_n])}

    **scale_data** : *bool*  
        > If `True`, data matrix is log transformed and standard 
        scaled. 
        
    **split_data** : *None* | *np.ndarray*
        > If *None*, data will be split stratified by cell labels. 
        Otherwise, an array of a precalculated train/test split 
        corresponding to samples. Labels can be given for the entire
        dataset to benchmark performance, or for only the training
        data to classify unknown cell types.
            - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int* 
        > Number of Random Fourier Features used to calculate Z. 
        Should be a positive integer. Higher values of D will 
        increase classification accuracy at the cost of computation 
        time. If set to `None`, will be calculated from the number 
        of samples. 
    
    **remove_features** : *bool* 
        > If `True`, will remove features from X and feature_names
        not in group_dict and remove features from groupings not in
        feature_names.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may 
        decrease depending on class balance and the `class_threshold`
        parameter if `allow_multiclass = True`.

    **distance_metric** : *str* 
        > The pairwise distance metric used to estimate sigma. Must
        be one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
        > The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    **random_state** : *int*
        > Integer random_state used to set the seed for 
        reproducibility.

    **allow_multiclass** : *bool*
        > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
        > Maximum number of training samples allowed per cell class. 
        If `'median'`, the median number of cells per cell class in 
        the training data is used as the threshold.

    **reduction** : *str* | *None*
        > Which dimension reduction technique to perform on features
        within a group. `'svd'` will run 
        sklearn.decomposition.TruncatedSVD, `'linear'` will multiply by 
        an array of 1s down to 50 dimensions.
        
    **tfidf** : *bool*
        > Whether to calculate a TF-IDF transformation on peaks within 
        groupings.
        
    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.
    
    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from 
    `cell_labels`.

    > `adata.uns['train_indices']` : Indices for training data. 

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions for scMKL to use.

    > `adata.uns['scale_data']` : *bool* for whether or not data is log
    transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.
    
    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['reduction']` : Dimension reduction technique as 
    given, or `'None'` if not provided.

    > `adata.uns['tfidf']` : *bool* for whether to calculate TF-IDF per 
    grouping.

    > `adata.uns['labeled_test']` : *bool* for whether labels are 
    provided for the test cells.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.npy', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl', 
    ...                      allow_pickle = True)
    >>> 
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            cell_labels = cell_labels, 
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
    'distance_metric', 'reduction', 'tfidf', 'labeled_test', 
    'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")
    
    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X, 
                                                    feature_names, 
                                                    group_dict, 
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric
    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
    adata.uns['tfidf'] = tfidf

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if not allow_multiclass:
            split = _binary_split(cell_labels, 
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
            train_indices, test_indices = split

        else:
            split = _multi_class_split(cell_labels, 
                                       seed_obj = adata.uns['seed_obj'], 
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
            train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Labels must cover either every cell or only the training cells
        x_eq_labs = len(cell_labels) == X.shape[0]
        train_eq_labs = len(cell_labels) == np.sum(split_data == 'train')
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")
        
        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        if len(cell_labels) == len(train_indices):

            # Pad labels so every cell in X has an entry
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        elif len(cell_labels) == len(split_data):
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled. "
              "To change this behavior, set scale_data to True.")

    return adata
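
A hedged sketch of the precalculated-split path, reusing the variables 
from the Examples section above (cell_labels here covers only the 
training cells, so the test labels get padded):

>>> rng = np.random.default_rng(0)
>>> split = rng.choice(['train', 'test'], size = data_mat.shape[0],
...                    p = [0.8, 0.2])
>>> train_labels = cell_labels[split == 'train']
>>> adata = scmkl.create_adata(X = data_mat, feature_names = gene_names,
...                            cell_labels = train_labels,
...                            group_dict = group_dict, split_data = split)
>>> adata.uns['labeled_test']
False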