scmkl.create_adata

  1import numpy as np
  2import anndata as ad
  3
  4
  5def _filter_features(X, feature_names, group_dict, remove_features):
  6    '''
  7    Function to remove unused features from X matrix. Any features not 
  8    included in group_dict will be removed from the matrix. Also puts 
  9    the features in the same relative order (of included features)
 10    
 11    Parameters
 12    ----------
 13    X : Data array. Can be Numpy array or Scipy Sparse Array
 14    feature_names : Numpy array of corresponding feature names
 15    group_dict : Dictionary containing feature grouping information.
 16                 Example: {geneset: np.array(gene_1, gene_2, ..., 
 17                 gene_n)}
 18    Returns
 19    -------
 20    X : Data array containing data only for features in the group_dict
 21    feature_names : Numpy array of corresponding feature names from 
 22                    group_dict
 23    '''
 24    assert X.shape[1] == len(feature_names), ("Given features do not "
 25                                              "correspond with features in X")  
 26
 27    group_features = set()
 28    feature_set = set(feature_names)
 29
 30    # Store all objects in dictionary in set
 31    for group in group_dict.keys():
 32        group_features.update(set(group_dict[group]))
 33
 34        # Finds intersection between group features and features in data
 35        # Converts to nd.array and sorts to preserve order of feature names
 36        group_feats = list(feature_set.intersection(set(group_dict[group])))
 37        group_dict[group] = np.sort(np.array(group_feats))
 38
 39    # Only keeping groupings that have at least two features
 40    group_dict = {group : group_dict[group] for group in group_dict.keys()
 41                  if len(group_dict[group]) > 1}
 42
 43    if remove_features:
 44        # Find location of desired features in whole feature set
 45        g_features = np.array(list(group_features))
 46        group_feature_indices = np.where(np.in1d(feature_names, 
 47                                                 g_features, 
 48                                                 assume_unique = True))[0]
 49
 50        # Subset only the desired features and data
 51        X = X[:,group_feature_indices]
 52        feature_names = np.array(list(feature_names))[group_feature_indices]
 53
 54    return X, feature_names, group_dict
 55
 56
 57def _multi_class_split(y, train_ratio = 0.8, class_threshold = 'median', 
 58                       seed_obj = np.random.default_rng(100)):
 59    '''
 60    Function for calculating the training and testing cell positions 
 61    for multiclass data sets.
 62
 63    Parameters
 64    ----------
 65    **y** : *np.ndarray* | *pd.Series* | *list*
 66        > Should be an iterable object cooresponding to samples in 
 67        `ad.AnnData` object.
 68
 69    **seed_obj** : *numpy.random._generator.Generator*
 70        > Seed used to randomly sample and split data.
 71
 72    **train_ratio** : *float*
 73        > Ratio of number of training samples to entire data set. 
 74        Note: if a threshold is applied, the ratio training samples 
 75        may decrease depending on class balance and `class_threshold`
 76        parameter.
 77
 78    **class_threshold** : *str* | *int*
 79        > If is type `int`, classes with more samples than 
 80        class_threshold will be sampled. If `'median'`, 
 81        samples will be sampled to the median number of samples per 
 82        class.
 83
 84    Returns
 85    -------
 86    **train_indices** : *np.ndarray*
 87        > Indices for training samples.
 88
 89    **test_indices** : *np.ndarray*
 90        > Indices for testing samples.
 91    '''
 92    uniq_labels = np.unique(y)
 93
 94    # Finding indices for each cell class
 95    class_positions = {class_ : np.where(y == class_)[0] 
 96                       for class_ in uniq_labels}
 97    
 98    # Capturing training indices while maintaining original class proportions
 99    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
100                                              int(len(class_positions[class_])
101                                                  * train_ratio), 
102                                              replace = False)
103                        for class_ in class_positions.keys()}
104    
105    # Capturing testing indices while maintaining original class proportions
106    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
107                                          train_samples[class_])
108                    for class_ in class_positions.keys()}
109    
110    # Applying threshold for samples per class
111    if class_threshold == 'median':
112        all_train = [idx for class_ in train_samples.keys()
113                         for idx in train_samples[class_]]
114        _, class_threshold = np.unique(y[all_train], return_counts = True)
115        class_threshold = int(np.median(class_threshold))
116    
117    for class_ in train_samples.keys():
118        if len(train_samples[class_]) > class_threshold:
119            train_samples[class_] = seed_obj.choice(train_samples[class_], 
120                                                       class_threshold)
121            
122    train_indices = np.array([idx for class_ in train_samples.keys()
123                                  for idx in train_samples[class_]])
124    
125    test_indices = np.array([idx for class_ in test_samples.keys()
126                                 for idx in test_samples[class_]])
127    
128    return train_indices, test_indices
129
130
def _binary_split(y, train_indices = None, train_ratio = 0.8,
                  seed_obj = None):
    '''
    Function to calculate training and testing indices for given
    dataset. If train indices are given, it will calculate the test
    indices. If train_indices == None, then it calculates both indices,
    preserving the ratio of each label in y

    Parameters
    ----------
    y : Array-like of cell labels. Can have any number of classes for
        this function.
    train_indices : Optional array of pre-determined training indices
    train_ratio : decimal value ratio of samples of each label placed
                  in the training set
    seed_obj : Numpy random generator used for random processes. Can be
               specified for reproducibility; if None, a new
               np.random.default_rng(100) is created on every call.

    Returns
    -------
    train_indices : Array of indices of training cells
    test_indices : Array of indices of testing cells
    '''
    # A generator in the signature would be built once at import time
    # and shared between calls, making repeated default calls differ.
    if seed_obj is None:
        seed_obj = np.random.default_rng(100)

    # If train indices aren't provided
    if train_indices is None:

        # Elementwise comparison below requires an array, not a list
        labels = np.asarray(y)
        train_indices = []

        for label in np.unique(labels):

            # Find index of each unique label
            label_indices = np.where(labels == label)[0]

            # Sample these indices according to train ratio
            n_train = int(len(label_indices) * train_ratio)
            train_indices.extend(seed_obj.choice(label_indices, n_train,
                                                 replace = False))
    else:
        assert len(train_indices) <= len(y), ("More train indices than there "
                                              "are samples")

    train_indices = np.array(train_indices)

    # Test indices are the indices not in the train_indices
    test_indices = np.setdiff1d(np.arange(len(y)), train_indices)

    return train_indices, test_indices
182
183
def calculate_d(num_samples : int):
    '''
    Computes the number of dimensions suggested for a data set of the
    given size, as sqrt(n) * log(log(n)) truncated to an integer. See
    https://doi.org/10.48550/arXiv.1806.09178 for more information.

    Parameters
    ----------
    **num_samples** : *int*
        > The number of samples in the data set including both training
        and testing sets.

    Returns
    -------
    **d** : *int*
        > The optimal number of dimensions to run scMKL with the given
        data set.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    root_n = np.sqrt(num_samples)
    log_log_n = np.log(np.log(num_samples))

    # int() truncates the product toward zero
    return int(root_n * log_log_n)
211
212
def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray, 
                 group_dict: dict, scale_data: bool = True, 
                 split_data : np.ndarray | None = None, D : int | None = None, 
                 remove_features = True, train_ratio = 0.8,
                 distance_metric = 'euclidean', kernel_type = 'Gaussian', 
                 random_state : int = 1, allow_multiclass : bool = False, 
                 class_threshold : str | int = 'median'):
    '''
    Function to create an AnnData object to carry all relevant
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* |
            *pd.DataFrame*
        > A data matrix of cells by features (sparse array
        recommended for large datasets).

    **feature_names** : *np.ndarray*
        > array of feature names corresponding with the features
        in X.

    **cell_labels** : *np.ndarray*
        > A numpy array of cell phenotypes corresponding with
        the cells in X. When `split_data` is given, may instead hold
        labels for the training cells only.

    **group_dict** : *dict*
        > Dictionary containing feature grouping information.
            - Example: {geneset: np.array([gene_1, gene_2, ...,
                        gene_n])}

    **scale_data** : *bool*
        > If `True`, data matrix is to be log transformed and standard
        scaled. This function only records the flag in
        `adata.uns['scale_data']`; X itself is not transformed here.

    **split_data** : *None* | *np.ndarray*
        > If *None*, data will be split stratified by cell labels.
        Else, is an array of precalculated train/test split
        corresponding to samples (one entry per cell). Labels can be
        given for the entire dataset to benchmark performance or for
        only training data to classify unknown cell types.
            - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int*
        > Number of Random Fourier Features used to calculate Z.
        Should be a positive integer. Higher values of D will
        increase classification accuracy at the cost of computation
        time. If set to `None`, will be calculated given number of
        samples.

    **remove_features** : *bool*
        > If `True`, will remove features from X and feature_names
        not in group_dict and remove features from groupings not in
        feature_names.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    **distance_metric** : *str*
        > The pairwise distance metric used to estimate sigma. Must
        be one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
        > The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`
        (case-insensitive).

    **random_state** : *int*
        > Integer random_state used to set the seed for
        reproducibility.

    **allow_multiclass** : *bool*
        > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
        > Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number of
        cells per cell class will be the threshold for number of
        samples per class. Only used if `allow_multiclass = True` and
        `split_data` is *None*.

    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.

    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from
    `cell_labels`. If labels were given for training cells only, test
    cells are labeled `'padded_test_label'`.

    > `adata.uns['train_indices']` : Indices for training data.

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions to run scMKL with.

    > `adata.uns['scale_data']` : *bool* for whether or not data is log
    transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.

    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['labeled_test']` : *bool*, `False` only when labels
    were supplied for the training cells alone.

    Raises
    ------
    AssertionError
        > If shapes of the inputs disagree, labels are not binary while
        `allow_multiclass = False`, `D` is not a positive integer, or
        `kernel_type` is not implemented.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl',
    ...                      allow_pickle = True)
    >>>
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
    'distance_metric', 'labeled_test', 'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")

    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X,
                                                    feature_names,
                                                    group_dict,
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels,
                                       seed_obj = adata.uns['seed_obj'],
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels,
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
        train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Elementwise comparison with 'train'/'test' requires an array
        split_data = np.asarray(split_data)
        assert len(split_data) == X.shape[0], ("split_data must have one "
                                               "entry per cell in X")

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells; the
        # second check must compare against the number of training
        # cells, not the number of rows in X
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:
            # Only training labels known: fill test cells with a
            # placeholder so obs['labels'] has one entry per cell
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled "
              "To change this behavior, set scale_data to True")

    return adata
def calculate_d(num_samples: int):
def calculate_d(num_samples : int):
    '''
    Computes the number of dimensions suggested for a data set of the
    given size, as sqrt(n) * log(log(n)) truncated to an integer. See
    https://doi.org/10.48550/arXiv.1806.09178 for more information.

    Parameters
    ----------
    **num_samples** : *int*
        > The number of samples in the data set including both training
        and testing sets.

    Returns
    -------
    **d** : *int*
        > The optimal number of dimensions to run scMKL with the given
        data set.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    root_n = np.sqrt(num_samples)
    log_log_n = np.log(np.log(num_samples))

    # int() truncates the product toward zero
    return int(root_n * log_log_n)

This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.

Parameters

num_samples : int

The number of samples in the data set including both training and testing sets.

Returns

d : int

The optimal number of dimensions to run scMKL with the given data set.

Examples

>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>> d = scmkl.calculate_d(raw_counts.shape[0])
>>> d
161
def create_adata( X, feature_names: numpy.ndarray, cell_labels: numpy.ndarray, group_dict: dict, scale_data: bool = True, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features=True, train_ratio=0.8, distance_metric='euclidean', kernel_type='Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int = 'median'):
def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray, 
                 group_dict: dict, scale_data: bool = True, 
                 split_data : np.ndarray | None = None, D : int | None = None, 
                 remove_features = True, train_ratio = 0.8,
                 distance_metric = 'euclidean', kernel_type = 'Gaussian', 
                 random_state : int = 1, allow_multiclass : bool = False, 
                 class_threshold : str | int = 'median'):
    '''
    Function to create an AnnData object to carry all relevant
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* |
            *pd.DataFrame*
        > A data matrix of cells by features (sparse array
        recommended for large datasets).

    **feature_names** : *np.ndarray*
        > array of feature names corresponding with the features
        in X.

    **cell_labels** : *np.ndarray*
        > A numpy array of cell phenotypes corresponding with
        the cells in X. When `split_data` is given, may instead hold
        labels for the training cells only.

    **group_dict** : *dict*
        > Dictionary containing feature grouping information.
            - Example: {geneset: np.array([gene_1, gene_2, ...,
                        gene_n])}

    **scale_data** : *bool*
        > If `True`, data matrix is to be log transformed and standard
        scaled. This function only records the flag in
        `adata.uns['scale_data']`; X itself is not transformed here.

    **split_data** : *None* | *np.ndarray*
        > If *None*, data will be split stratified by cell labels.
        Else, is an array of precalculated train/test split
        corresponding to samples (one entry per cell). Labels can be
        given for the entire dataset to benchmark performance or for
        only training data to classify unknown cell types.
            - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int*
        > Number of Random Fourier Features used to calculate Z.
        Should be a positive integer. Higher values of D will
        increase classification accuracy at the cost of computation
        time. If set to `None`, will be calculated given number of
        samples.

    **remove_features** : *bool*
        > If `True`, will remove features from X and feature_names
        not in group_dict and remove features from groupings not in
        feature_names.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    **distance_metric** : *str*
        > The pairwise distance metric used to estimate sigma. Must
        be one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
        > The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`
        (case-insensitive).

    **random_state** : *int*
        > Integer random_state used to set the seed for
        reproducibility.

    **allow_multiclass** : *bool*
        > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
        > Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number of
        cells per cell class will be the threshold for number of
        samples per class. Only used if `allow_multiclass = True` and
        `split_data` is *None*.

    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.

    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from
    `cell_labels`. If labels were given for training cells only, test
    cells are labeled `'padded_test_label'`.

    > `adata.uns['train_indices']` : Indices for training data.

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions to run scMKL with.

    > `adata.uns['scale_data']` : *bool* for whether or not data is log
    transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.

    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['labeled_test']` : *bool*, `False` only when labels
    were supplied for the training cells alone.

    Raises
    ------
    AssertionError
        > If shapes of the inputs disagree, labels are not binary while
        `allow_multiclass = False`, `D` is not a positive integer, or
        `kernel_type` is not implemented.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl',
    ...                      allow_pickle = True)
    >>>
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
    'distance_metric', 'labeled_test', 'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")

    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X,
                                                    feature_names,
                                                    group_dict,
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels,
                                       seed_obj = adata.uns['seed_obj'],
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels,
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
        train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Elementwise comparison with 'train'/'test' requires an array
        split_data = np.asarray(split_data)
        assert len(split_data) == X.shape[0], ("split_data must have one "
                                               "entry per cell in X")

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells; the
        # second check must compare against the number of training
        # cells, not the number of rows in X
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:
            # Only training labels known: fill test cells with a
            # placeholder so obs['labels'] has one entry per cell
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled "
              "To change this behavior, set scale_data to True")

    return adata

Function to create an AnnData object to carry all relevant information going forward.

Parameters

X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame

A data matrix of cells by features (sparse array recommended for large datasets).

feature_names : np.ndarray

array of feature names corresponding with the features in X.

cell_labels : np.ndarray

A numpy array of cell phenotypes corresponding with the cells in X.

group_dict : dict

Dictionary containing feature grouping information. - Example: {geneset: np.array([gene_1, gene_2, ..., gene_n])}

scale_data : bool

If True, data matrix is log transformed and standard scaled.

split_data : None | np.ndarray

If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types. - Example: np.array(['train', 'test', ..., 'train'])

D : int

Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.

remove_features : bool

If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.

train_ratio : float

Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio of training samples may decrease depending on class balance and the class_threshold parameter if allow_multiclass = True.

distance_metric : str

The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.

kernel_type : str

The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.

random_state : int

Integer random_state used to set the seed for reproducibility.

allow_multiclass : bool

If False, will ensure that cell labels are binary.

class_threshold : str | int

Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.

Returns

adata : AnnData

AnnData with the following attributes and keys:

adata.X : the data matrix.

adata.var_names : the feature names corresponding to adata.X.

adata.obs['labels'] : cell classes/phenotypes from cell_labels.

adata.uns['train_indices'] : Indices for training data.

adata.uns['test_indices'] : Indices for testing data.

adata.uns['group_dict'] : Grouping information.

adata.uns['seed_obj'] : Seed object with seed equal to 100 * random_state.

adata.uns['D'] : Number of dimensions to run scMKL with.

adata.uns['scale_data'] : bool for whether or not data is log transformed and scaled.

adata.uns['distance_metric'] : Distance metric as given.

adata.uns['kernel_type'] : Kernel function as given.

Examples

>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl', 
...                      allow_pickle = True)
>>> 
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'