scmkl.create_adata

  1import numpy as np
  2import anndata as ad
  3import scipy
  4import pandas as pd
  5import gc
  6import warnings
  7
  8
  9def filter_features(feature_names: np.ndarray, group_dict: dict):
 10    """
 11    Function to remove features only in feature names or group_dict.
 12    Any features not included in group_dict will be removed from the
 13    matrix. Also puts the features in the same relative order (of 
 14    included features)
 15    
 16    Parameters
 17    ----------
 18    feature_names : np.ndarray
 19        Numpy array of corresponding feature names.
 20
 21    group_dict : dict
 22        Dictionary containing feature grouping information.
 23                 Example: {geneset: np.array(gene_1, gene_2, ..., 
 24                 gene_n)}
 25    Returns
 26    -------
 27    feature_names : np.ndarray
 28        Numpy array of corresponding feature names from group_dict.
 29
 30    group_dict : dict
 31        Dictionary containing features overlapping input grouping 
 32        information and full feature names.
 33    """
 34    group_features = set()
 35    feature_set = set(feature_names)
 36
 37    # Store all objects in dictionary in set
 38    for group in group_dict.keys():
 39        group_features.update(set(group_dict[group]))
 40
 41        # Finds intersection between group features and features in data
 42        # Converts to nd.array and sorts to preserve order of feature names
 43        group_feats = list(feature_set.intersection(set(group_dict[group])))
 44        group_dict[group] = np.sort(np.array(group_feats))
 45
 46    # Only keeping groupings that have at least two features
 47    group_dict = {group : group_dict[group] for group in group_dict.keys()
 48                  if len(group_dict[group]) > 1}
 49
 50    group_features = np.array(list(group_features.intersection(feature_set)))
 51
 52    return group_features, group_dict
 53
 54
 55def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 
 56                      class_threshold: str | int | None=None, 
 57                      train_ratio: float=0.8):
 58    """
 59    Function for calculating the training and testing cell positions 
 60    for multiclass data sets.
 61
 62    Parameters
 63    ----------
 64    y : array_like
 65        Should be an iterable object cooresponding to samples in 
 66        `ad.AnnData` object.
 67
 68    seed_obj : np.random._generator.Generator
 69        Seed used to randomly sample and split data.
 70
 71    train_ratio : float
 72        Ratio of number of training samples to entire data set. 
 73        Note: if a threshold is applied, the ratio training samples 
 74        may decrease depending on class balance and `class_threshold`
 75        parameter.
 76
 77    class_threshold : str | int
 78        If is type `int`, classes with more samples than 
 79        class_threshold will be sampled. If `'median'`, 
 80        samples will be sampled to the median number of samples per 
 81        class.
 82
 83    Returns
 84    -------
 85    train_indices : np.ndarray
 86        Indices for training samples.
 87
 88    test_indices : np.ndarray
 89        Indices for testing samples.
 90    """
 91    uniq_labels = np.unique(y)
 92
 93    # Finding indices for each cell class
 94    class_positions = {class_ : np.where(y == class_)[0] 
 95                       for class_ in uniq_labels}
 96    
 97    # Capturing training indices while maintaining original class proportions
 98    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
 99                                              int(len(class_positions[class_])
100                                                  * train_ratio), 
101                                              replace = False)
102                        for class_ in class_positions.keys()}
103    
104    # Capturing testing indices while maintaining original class proportions
105    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
106                                          train_samples[class_])
107                    for class_ in class_positions.keys()}
108    
109    # Applying threshold for samples per class
110    if class_threshold == 'median':
111        cells_per_class = [len(values) for values in train_samples.values()]
112        class_threshold = int(np.median(cells_per_class))
113
114    if isinstance(class_threshold, int):
115        # Down sample to class_threshold
116        for class_ in train_samples.keys():
117            if len(train_samples[class_]) > class_threshold:
118                train_samples[class_] = seed_obj.choice(train_samples[class_], 
119                                                        class_threshold)
120            
121    train_indices = np.array([idx for class_ in train_samples.keys()
122                                  for idx in train_samples[class_]])
123    
124    test_indices = np.array([idx for class_ in test_samples.keys()
125                                 for idx in test_samples[class_]])
126    
127    return train_indices, test_indices
128
129
130def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
131                  train_ratio: float=0.8,
132                  seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
133    """
134    Function to calculate training and testing indices for given 
135    dataset. If train indices are given, it will calculate the test 
136    indices. If train_indices == None, then it calculates both indices, 
137    preserving the ratio of each label in y
138
139    Parameters
140    ----------
141    y : np.ndarray
142        Numpy array of cell labels. Can have any number of classes 
143        for this function.
144
145    train_indices : np.ndarray | None
146        Optional array of pre-determined training indices
147
148    train_ratio : float
149        Decimal value ratio of features in training/testing sets
150
151    seed_obj : np.random._generator.Generator
152        Numpy random state used for random processes. Can be 
153        specified for reproducubility or set by default.
154    
155    
156    Returns
157    -------
158    train_indices : np.ndarray
159        Array of indices of training cells.
160
161    test_indices : np.ndarray:
162        Array of indices of testing cells.
163    """
164    # If train indices aren't provided
165    if train_indices is None:
166
167        unique_labels = np.unique(y)
168        train_indices = []
169
170        for label in unique_labels:
171
172            # Find indices of each unique label
173            label_indices = np.where(y == label)[0]
174
175            # Sample these indices according to train ratio
176            n = int(len(label_indices) * train_ratio)
177            train_label_indices = seed_obj.choice(label_indices, n, 
178                                                  replace = False)
179            train_indices.extend(train_label_indices)
180    else:
181        assert len(train_indices) <= len(y), ("More train indices than there "
182                                              "are samples")
183
184    train_indices = np.array(train_indices)
185
186    # Test indices are the indices not in the train_indices
187    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
188                                assume_unique = True)
189
190    return train_indices, test_indices
191
192
193def calculate_d(num_samples : int):
194    """
195    This function calculates the optimal number of dimensions for 
196    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
197    information.
198
199    Parameters
200    ----------
201    num_samples : int
202        The number of samples in the data set including both training
203        and testing sets.
204
205    Returns
206    -------
207    d : int
208        The optimal number of dimensions to run scMKL with the given 
209        data set.
210
211    Examples
212    --------
213    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
214    >>>
215    >>> num_cells = raw_counts.shape[0]
216    >>> d = scmkl.calculate_d(num_cells)
217    >>> d
218    161
219    """
220    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))
221
222    return int(np.max([d, 100]))
223
224
225def sort_samples(train_indices, test_indices):
226    """
227    Ensures that samples in adata obj are all training, then all 
228    testing.
229
230    Parameters
231    ----------
232    train_indices : np.ndarray
233        Indices in ad.AnnData object for training.
234    
235    test_indices : np.ndarray
236        Indices in ad.AnnData object for testing.
237
238    Returns
239    -------
240    sort_idc : np.ndarray
241        Ordered indices that will sort ad.AnnData object as all 
242        training samples, then all testing.
243
244    train_indices : np.ndarray
245        The new training indices given the new index order, `sort_idc`.
246
247    test_indices : np.ndarray
248        The new testing indices given the new index order, `sort_idc`.
249    """
250    sort_idc = np.concatenate([train_indices, test_indices])
251
252    train_indices = np.arange(0, train_indices.shape[0])
253    test_indices = np.arange(train_indices.shape[0], 
254                             train_indices.shape[0] + test_indices.shape[0])
255    
256    return sort_idc, train_indices, test_indices
257
258
259def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 
260                 feature_names: np.ndarray, cell_labels: np.ndarray, 
261                 group_dict: dict, obs_names: None | np.ndarray=None, 
262                 scale_data: bool=True, transform_data: bool=False, 
263                 split_data: np.ndarray | None=None, D: int | None=None, 
264                 remove_features: bool=True, train_ratio: float=0.8, 
265                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
266                 random_state: int=1, allow_multiclass: bool = False, 
267                 class_threshold: str | int | None = None,
268                 reduction: str | None = None, tfidf: bool = False):
269    """
270    Function to create an AnnData object to carry all relevant 
271    information going forward.
272
273    Parameters
274    ----------
275    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
276        A data matrix of cells by features (sparse array 
277        recommended for large datasets).
278
279    feature_names : np.ndarray
280        Array of feature names corresponding with the features 
281        in `X`.
282
283    cell_labels : np.ndarray
284        A numpy array of cell phenotypes corresponding with 
285        the cells in `X`.
286
287    group_dict : dict 
288        Dictionary containing feature grouping information (i.e. 
289        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
290        np.array([...]), ...}`.
291
292    obs_names : None | np.ndarray
293        The cell names corresponding to `X` to be assigned to output 
294        object `.obs_names` attribute.
295
296    scale_data : bool  
297        If `True`, data matrix is log transformed and standard 
298        scaled. Default is `True`.
299
300    transform_data : bool
301        If `True`, data will be log1p transformed (recommended for 
302        counts data). Default is `False`.   
303    
304    split_data : None | np.ndarray
305        If `None`, data will be split stratified by cell labels. 
306        Else, is an array of precalculated train/test split 
307        corresponding to samples. Can include labels for entire
308        dataset to benchmark performance or for only training
309        data to classify unknown cell types (i.e. `np.array(['train', 
310        'test', ..., 'train'])`.
311
312    D : int 
313        Number of Random Fourier Features used to calculate Z. 
314        Should be a positive integer. Higher values of D will 
315        increase classification accuracy at the cost of computation 
316        time. If set to `None`, will be calculated given number of 
317        samples. 
318    
319    remove_features : bool
320        If `True`, will remove features from `X` and `feature_names` 
321        not in `group_dict` and remove features from groupings not in 
322        `feature_names`.
323
324    train_ratio : float
325        Ratio of number of training samples to entire data set. Note:
326        if a threshold is applied, the ratio training samples may 
327        decrease depending on class balance and `class_threshold`
328        parameter if `allow_multiclass = True`.
329
330    distance_metric : str
331        The pairwise distance metric used to estimate sigma. Must
332        be one of the options used in `scipy.spatial.distance.cdist`.
333
334    kernel_type : str
335        The approximated kernel function used to calculate Zs.
336        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
337
338    random_state : int
339        Integer random_state used to set the seed for 
340        reproducibilty.
341
342    allow_multiclass : bool
343        If `False`, will ensure that cell labels are binary.
344
345    class_threshold : str | int
346        Number of samples allowed in the training data for each cell
347        class in the training data. If `'median'`, the median number 
348        of cells per cell class will be the threshold for number of 
349        samples per class.
350
351    reduction: str | None
352        Choose which dimension reduction technique to perform on 
353        features within a group. 'svd' will run 
354        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
355        by an array of 1s down to 50 dimensions. 'pca' will replace 
356        each group values with 50 PCs from principal component 
357        analysis.
358        
359    tfidf: bool
360        Whether to calculate TFIDF transformation on peaks within 
361        groupings.
362        
363    Returns
364    -------
365    adata : ad.AnnData
366        AnnData with the following attributes and keys:
367
368        `adata.X` (array_like):
369            Data matrix.
370    
371        `adata.var_names` (array_like): 
372            Feature names corresponding to `adata.X`.
373
374        `adata.obs['labels']` (array_like):
375            cell classes/phenotypes from `cell_labels`.
376
377        `adata.uns['train_indices']` (array_like):
378            Indices for training data. 
379
380        `adata.uns['test_indices']` (array_like)
381            Indices for testing data.
382
383        `adata.uns['group_dict']` (dict):
384            Grouping information.
385
386        `adata.uns['seed_obj']` (np.random._generator.Generator): 
387            Seed object with seed equal to 100 * `random_state`.
388
389        `adata.uns['D']` (int):
390            Number of dimensions to scMKL with.
391
392        `adata.uns['scale_data']` (bool):
393            Whether or not data is scaled.
394
395        `adata.uns['transform_data']` (bool):
396            Whether or not data is log1p transformed.
397
398        `adata.uns['distance_metric']` (str): 
399            Distance metric as given.
400    
401        `adata.uns['kernel_type']` (str): 
402            Kernel function as given.
403
404        `adata.uns['svd']` (bool): 
405            Whether to calculate SVD reduction.
406
407        `adata.uns['tfidf']` (bool): 
408            Whether to calculate TF-IDF per grouping.
409
410    Examples
411    --------
412    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
413    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
414    >>> group_dict = np.load('hallmark_genesets.pkl', 
415    >>>                      allow_pickle = True)
416    >>> 
417    >>> adata = scmkl.create_adata(X = data_mat, 
418    ...                            feature_names = gene_names, 
419    ...                            group_dict = group_dict)
420    >>> adata
421    AnnData object with n_obs × n_vars = 1000 × 4341
422    obs: 'labels'
423    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
424    'distance_metric', 'train_indices', 'test_indices'
425    """
426
427    assert X.shape[1] == len(feature_names), ("Different number of features "
428                                              "in X than feature names")
429    
430    if not allow_multiclass:
431        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
432                                                  "2 classes")
433    if D is not None:    
434        assert isinstance(D, int) and D > 0, 'D must be a positive integer'
435
436    kernel_options = ['gaussian', 'laplacian', 'cauchy']
437    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
438                                                   "implemented. Gaussian, "
439                                                   "Laplacian, and Cauchy "
440                                                   "are the acceptable "
441                                                   "types.")
442
443    # Create adata object and add column names
444    adata = ad.AnnData(X)
445    adata.var_names = feature_names
446
447    if isinstance(obs_names, (np.ndarray)):
448        adata.obs_names = obs_names
449
450    filtered_feature_names, group_dict = filter_features(feature_names, 
451                                                          group_dict)
452    
453    # Ensuring that there are common features between feature_names and groups
454    overlap_err = ("No common features between feature names and grouping "
455                   "dict. Check grouping.")
456    assert len(filtered_feature_names) > 0, overlap_err
457
458    if remove_features:
459        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
460        adata = adata[:, filtered_feature_names]
461    
462    gc.collect()
463
464    # Add metadata to adata object
465    adata.uns['group_dict'] = group_dict
466    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
467    adata.uns['scale_data'] = scale_data
468    adata.uns['transform_data'] = transform_data
469    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
470    adata.uns['kernel_type'] = kernel_type
471    adata.uns['distance_metric'] = distance_metric
472    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
473    adata.uns['tfidf'] = tfidf
474
475    if (split_data is None):
476        assert X.shape[0] == len(cell_labels), ("Different number of cells "
477                                                "than labels")
478        adata.obs['labels'] = cell_labels
479
480        if (allow_multiclass == False):
481            split = binary_split(cell_labels, 
482                                  seed_obj = adata.uns['seed_obj'],
483                                  train_ratio = train_ratio)
484            train_indices, test_indices = split
485
486        elif (allow_multiclass == True):
487            split = multi_class_split(cell_labels, 
488                                       seed_obj = adata.uns['seed_obj'], 
489                                       class_threshold = class_threshold,
490                                       train_ratio = train_ratio)
491            train_indices, test_indices = split
492
493        adata.uns['labeled_test'] = True
494
495    else:
496        sd_err_message = "`split_data` argument must be of type np.ndarray"
497        assert isinstance(split_data, np.ndarray), sd_err_message
498        x_eq_labs = X.shape[0] == len(cell_labels)
499        train_eq_labs = X.shape[0] == len(cell_labels)
500        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
501                                            "or only for training cells")
502        
503        train_indices = np.where(split_data == 'train')[0]
504        test_indices = np.where(split_data == 'test')[0]
505
506        if len(cell_labels) == len(train_indices):
507
508            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
509            padded_cell_labels[train_indices] = cell_labels
510            padded_cell_labels[test_indices] = 'padded_test_label'
511
512            adata.obs['labels'] = padded_cell_labels
513            adata.uns['labeled_test'] = False
514
515        elif len(cell_labels) == len(split_data):
516            adata.obs['labels'] = cell_labels
517            adata.uns['labeled_test'] = True
518
519    # Ensuring all train samples are first in adata object followed by test
520    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
521                                                         test_indices)
522
523    adata = adata[sort_idx]
524
525    if not isinstance(obs_names, (np.ndarray)):
526        adata.obs = adata.obs.reset_index(drop=True)
527        adata.obs.index = adata.obs.index.astype('O')
528
529    adata.uns['train_indices'] = train_indices
530    adata.uns['test_indices'] = test_indices
531
532    if not scale_data:
533        print("WARNING: Data will not be scaled. "
534              "To change this behavior, set scale_data to True")
535    if not transform_data:
536        print("WARNING: Data will not be transformed."
537              "To change this behavior, set transform_data to True")
538
539    return adata
540
541
542def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
543                 group_dict: dict | str, use_raw: bool=False, 
544                 scale_data: bool=True, transform_data: bool=False, 
545                 split_data: np.ndarray | None=None, D: int | None=None, 
546                 remove_features: bool=True, train_ratio: float=0.8, 
547                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
548                 random_state: int=1, allow_multiclass: bool = False, 
549                 class_threshold: str | int | None=None, 
550                 reduction: str | None = None, tfidf: bool = False):
551    """
552    Function to format an `ad.AnnData` object to carry all relevant 
553    information going forward. `adata.obs_names` will be retained.
554
555    **NOTE: Information not needed for running `scmkl` will be 
556    removed.**
557
558    Parameters
559    ----------
560    adata : ad.AnnData
561        Object with data for `scmkl` to be applied to. Only requirment 
562        is that `.var_names` is correct and data matrix is in `adata.X` 
563        or `adata.raw.X`. A h5ad file can be provided as a `str` and it 
564        will be read in.
565
566    cell_labels : np.ndarray | str
567        If type `str`, the labels for `scmkl` to learn are captured 
568        from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 
569        phenotypes corresponding with the cells in `adata.X`.
570
571    group_dict : dict | str
572        Dictionary containing feature grouping information (i.e. 
573        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
574        np.array([...]), ...}`. A pickle file can be provided as a `str` 
575        and it will be read in.
576
577    obs_names : None | np.ndarray
578        The cell names corresponding to `X` to be assigned to output 
579        object `.obs_names` attribute.
580
581    use_raw : bool
582        If `False`, will use `adata.X` to create new `adata`. Else, 
583        will use `adata.raw.X`.
584
585    scale_data : bool  
586        If `True`, data matrix is log transformed and standard 
587        scaled. 
588
589    transform_data : bool
590        If `True`, data will be log1p transformed (recommended for 
591        counts data). Default is `False`. 
592        
593    split_data : None | np.ndarray
594        If `None`, data will be split stratified by cell labels. 
595        Else, is an array of precalculated train/test split 
596        corresponding to samples. Can include labels for entire
597        dataset to benchmark performance or for only training
598        data to classify unknown cell types (i.e. `np.array(['train', 
599        'test', ..., 'train'])`.
600
601    D : int 
602        Number of Random Fourier Features used to calculate Z. 
603        Should be a positive integer. Higher values of D will 
604        increase classification accuracy at the cost of computation 
605        time. If set to `None`, will be calculated given number of 
606        samples. 
607    
608    remove_features : bool
609        If `True`, will remove features from `X` and `feature_names` 
610        not in `group_dict` and remove features from groupings not in 
611        `feature_names`.
612
613    train_ratio : float
614        Ratio of number of training samples to entire data set. Note:
615        if a threshold is applied, the ratio training samples may 
616        decrease depending on class balance and `class_threshold`
617        parameter if `allow_multiclass = True`.
618
619    distance_metric : str
620        The pairwise distance metric used to estimate sigma. Must
621        be one of the options used in `scipy.spatial.distance.cdist`.
622
623    kernel_type : str
624        The approximated kernel function used to calculate Zs.
625        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
626
627    random_state : int
628        Integer random_state used to set the seed for 
629        reproducibilty.
630
631    allow_multiclass : bool
632        If `False`, will ensure that cell labels are binary.
633
634    class_threshold : str | int
635        Number of samples allowed in the training data for each cell
636        class in the training data. If `'median'`, the median number 
637        of cells per cell class will be the threshold for number of 
638        samples per class.
639
640    reduction: str | None
641        Choose which dimension reduction technique to perform on 
642        features within a group. 'svd' will run 
643        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
644        by an array of 1s down to 50 dimensions.
645        
646    tfidf: bool
647        Whether to calculate TFIDF transformation on peaks within 
648        groupings.
649        
650    Returns
651    -------
652    adata : ad.AnnData
653        AnnData with the following attributes and keys:
654
655        `adata.X` (array_like):
656            Data matrix.
657    
658        `adata.var_names` (array_like): 
659            Feature names corresponding to `adata.X`.
660
661        `adata.obs['labels']` (array_like):
662            cell classes/phenotypes from `cell_labels`.
663
664        `adata.uns['train_indices']` (array_like):
665            Indices for training data. 
666
667        `adata.uns['test_indices']` (array_like)
668            Indices for testing data.
669
670        `adata.uns['group_dict']` (dict):
671            Grouping information.
672
673        `adata.uns['seed_obj']` (np.random._generator.Generator): 
674            Seed object with seed equal to 100 * `random_state`.
675
676        `adata.uns['D']` (int):
677            Number of dimensions to scMKL with.
678
679        `adata.uns['scale_data']` (bool):
680            Whether or not data is scaled.
681
682        `adata.uns['transform_data']` (bool):
683            Whether or not data is log1p transformed.
684
685        `adata.uns['distance_metric']` (str): 
686            Distance metric as given.
687    
688        `adata.uns['kernel_type']` (str): 
689            Kernel function as given.
690
691        `adata.uns['svd']` (bool): 
692            Whether to calculate SVD reduction.
693
694        `adata.uns['tfidf']` (bool): 
695            Whether to calculate TF-IDF per grouping.
696
697    Examples
698    --------
699    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
700    >>> group_dict = np.load('hallmark_genesets.pkl', 
701    >>>                      allow_pickle = True)
702    >>> 
703    >>> 
704    >>> # The labels in adata.obs we want to learn are 'celltypes'
705    >>> adata = scmkl.format_adata(adata, 'celltypes', 
706    ...                            group_dict)
707    >>> adata
708    AnnData object with n_obs × n_vars = 1000 × 4341
709    obs: 'labels'
710    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
711    'distance_metric', 'train_indices', 'test_indices'
712    """
713    if str == type(adata):
714        adata = ad.read_h5ad(adata)
715
716    if str == type(group_dict):
717        group_dict = np.load(group_dict, allow_pickle=True)
718        
719    if str == type(cell_labels):
720        err_msg = f"{cell_labels} is not in `adata.obs`"
721        assert cell_labels in adata.obs.keys(), err_msg
722        cell_labels = adata.obs[cell_labels].to_numpy()
723    
724    if use_raw:
725        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
726        X = adata.raw.X
727    else:
728        X = adata.X
729
730    adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 
731                         group_dict, adata.obs_names.to_numpy().copy(), 
732                         scale_data, transform_data, split_data, D, remove_features, 
733                         train_ratio, distance_metric, kernel_type, 
734                         random_state, allow_multiclass, class_threshold, 
735                         reduction, tfidf)
736
737    return adata
def filter_features(feature_names: numpy.ndarray, group_dict: dict):
10def filter_features(feature_names: np.ndarray, group_dict: dict):
11    """
12    Function to remove features only in feature names or group_dict.
13    Any features not included in group_dict will be removed from the
14    matrix. Also puts the features in the same relative order (of 
15    included features)
16    
17    Parameters
18    ----------
19    feature_names : np.ndarray
20        Numpy array of corresponding feature names.
21
22    group_dict : dict
23        Dictionary containing feature grouping information.
24                 Example: {geneset: np.array(gene_1, gene_2, ..., 
25                 gene_n)}
26    Returns
27    -------
28    feature_names : np.ndarray
29        Numpy array of corresponding feature names from group_dict.
30
31    group_dict : dict
32        Dictionary containing features overlapping input grouping 
33        information and full feature names.
34    """
35    group_features = set()
36    feature_set = set(feature_names)
37
38    # Store all objects in dictionary in set
39    for group in group_dict.keys():
40        group_features.update(set(group_dict[group]))
41
42        # Finds intersection between group features and features in data
43        # Converts to nd.array and sorts to preserve order of feature names
44        group_feats = list(feature_set.intersection(set(group_dict[group])))
45        group_dict[group] = np.sort(np.array(group_feats))
46
47    # Only keeping groupings that have at least two features
48    group_dict = {group : group_dict[group] for group in group_dict.keys()
49                  if len(group_dict[group]) > 1}
50
51    group_features = np.array(list(group_features.intersection(feature_set)))
52
53    return group_features, group_dict

Function to remove features only in feature names or group_dict. Any features not included in group_dict will be removed from the matrix. Also puts the features in the same relative order (of included features)

Parameters
  • feature_names (np.ndarray): Numpy array of corresponding feature names.
  • group_dict (dict): Dictionary containing feature grouping information. Example: {geneset: np.array(gene_1, gene_2, ..., gene_n)}
Returns
  • feature_names (np.ndarray): Numpy array of corresponding feature names from group_dict.
  • group_dict (dict): Dictionary containing features overlapping input grouping information and full feature names.
def multi_class_split( y: numpy.ndarray, seed_obj: numpy.random._generator.Generator, class_threshold: str | int | None = None, train_ratio: float = 0.8):
 56def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 
 57                      class_threshold: str | int | None=None, 
 58                      train_ratio: float=0.8):
 59    """
 60    Function for calculating the training and testing cell positions 
 61    for multiclass data sets.
 62
 63    Parameters
 64    ----------
 65    y : array_like
 66        Should be an iterable object cooresponding to samples in 
 67        `ad.AnnData` object.
 68
 69    seed_obj : np.random._generator.Generator
 70        Seed used to randomly sample and split data.
 71
 72    train_ratio : float
 73        Ratio of number of training samples to entire data set. 
 74        Note: if a threshold is applied, the ratio training samples 
 75        may decrease depending on class balance and `class_threshold`
 76        parameter.
 77
 78    class_threshold : str | int
 79        If is type `int`, classes with more samples than 
 80        class_threshold will be sampled. If `'median'`, 
 81        samples will be sampled to the median number of samples per 
 82        class.
 83
 84    Returns
 85    -------
 86    train_indices : np.ndarray
 87        Indices for training samples.
 88
 89    test_indices : np.ndarray
 90        Indices for testing samples.
 91    """
 92    uniq_labels = np.unique(y)
 93
 94    # Finding indices for each cell class
 95    class_positions = {class_ : np.where(y == class_)[0] 
 96                       for class_ in uniq_labels}
 97    
 98    # Capturing training indices while maintaining original class proportions
 99    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
100                                              int(len(class_positions[class_])
101                                                  * train_ratio), 
102                                              replace = False)
103                        for class_ in class_positions.keys()}
104    
105    # Capturing testing indices while maintaining original class proportions
106    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
107                                          train_samples[class_])
108                    for class_ in class_positions.keys()}
109    
110    # Applying threshold for samples per class
111    if class_threshold == 'median':
112        cells_per_class = [len(values) for values in train_samples.values()]
113        class_threshold = int(np.median(cells_per_class))
114
115    if isinstance(class_threshold, int):
116        # Down sample to class_threshold
117        for class_ in train_samples.keys():
118            if len(train_samples[class_]) > class_threshold:
119                train_samples[class_] = seed_obj.choice(train_samples[class_], 
120                                                        class_threshold)
121            
122    train_indices = np.array([idx for class_ in train_samples.keys()
123                                  for idx in train_samples[class_]])
124    
125    test_indices = np.array([idx for class_ in test_samples.keys()
126                                 for idx in test_samples[class_]])
127    
128    return train_indices, test_indices

Function for calculating the training and testing cell positions for multiclass data sets.

Parameters
  • y (array_like): Should be an iterable object cooresponding to samples in ad.AnnData object.
  • seed_obj (np.random._generator.Generator): Seed used to randomly sample and split data.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter.
  • class_threshold (str | int): If is type int, classes with more samples than class_threshold will be sampled. If 'median', samples will be sampled to the median number of samples per class.
Returns
  • train_indices (np.ndarray): Indices for training samples.
  • test_indices (np.ndarray): Indices for testing samples.
def binary_split( y: numpy.ndarray, train_indices: numpy.ndarray | None = None, train_ratio: float = 0.8, seed_obj: numpy.random._generator.Generator = Generator(PCG64) at 0x72C2294C4F20):
131def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
132                  train_ratio: float=0.8,
133                  seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
134    """
135    Function to calculate training and testing indices for given 
136    dataset. If train indices are given, it will calculate the test 
137    indices. If train_indices == None, then it calculates both indices, 
138    preserving the ratio of each label in y
139
140    Parameters
141    ----------
142    y : np.ndarray
143        Numpy array of cell labels. Can have any number of classes 
144        for this function.
145
146    train_indices : np.ndarray | None
147        Optional array of pre-determined training indices
148
149    train_ratio : float
150        Decimal value ratio of features in training/testing sets
151
152    seed_obj : np.random._generator.Generator
153        Numpy random state used for random processes. Can be 
154        specified for reproducubility or set by default.
155    
156    
157    Returns
158    -------
159    train_indices : np.ndarray
160        Array of indices of training cells.
161
162    test_indices : np.ndarray:
163        Array of indices of testing cells.
164    """
165    # If train indices aren't provided
166    if train_indices is None:
167
168        unique_labels = np.unique(y)
169        train_indices = []
170
171        for label in unique_labels:
172
173            # Find indices of each unique label
174            label_indices = np.where(y == label)[0]
175
176            # Sample these indices according to train ratio
177            n = int(len(label_indices) * train_ratio)
178            train_label_indices = seed_obj.choice(label_indices, n, 
179                                                  replace = False)
180            train_indices.extend(train_label_indices)
181    else:
182        assert len(train_indices) <= len(y), ("More train indices than there "
183                                              "are samples")
184
185    train_indices = np.array(train_indices)
186
187    # Test indices are the indices not in the train_indices
188    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
189                                assume_unique = True)
190
191    return train_indices, test_indices

Function to calculate training and testing indices for given dataset. If train indices are given, it will calculate the test indices. If train_indices == None, then it calculates both indices, preserving the ratio of each label in y

Parameters
  • y (np.ndarray): Numpy array of cell labels. Can have any number of classes for this function.
  • train_indices (np.ndarray | None): Optional array of pre-determined training indices
  • train_ratio (float): Decimal value ratio of features in training/testing sets
  • seed_obj (np.random._generator.Generator): Numpy random state used for random processes. Can be specified for reproducubility or set by default.
Returns
  • train_indices (np.ndarray): Array of indices of training cells.
  • test_indices (np.ndarray:): Array of indices of testing cells.
def calculate_d(num_samples: int):
194def calculate_d(num_samples : int):
195    """
196    This function calculates the optimal number of dimensions for 
197    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
198    information.
199
200    Parameters
201    ----------
202    num_samples : int
203        The number of samples in the data set including both training
204        and testing sets.
205
206    Returns
207    -------
208    d : int
209        The optimal number of dimensions to run scMKL with the given 
210        data set.
211
212    Examples
213    --------
214    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
215    >>>
216    >>> num_cells = raw_counts.shape[0]
217    >>> d = scmkl.calculate_d(num_cells)
218    >>> d
219    161
220    """
221    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))
222
223    return int(np.max([d, 100]))

This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.

Parameters
  • num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
  • d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
def sort_samples(train_indices, test_indices):
226def sort_samples(train_indices, test_indices):
227    """
228    Ensures that samples in adata obj are all training, then all 
229    testing.
230
231    Parameters
232    ----------
233    train_indices : np.ndarray
234        Indices in ad.AnnData object for training.
235    
236    test_indices : np.ndarray
237        Indices in ad.AnnData object for testing.
238
239    Returns
240    -------
241    sort_idc : np.ndarray
242        Ordered indices that will sort ad.AnnData object as all 
243        training samples, then all testing.
244
245    train_indices : np.ndarray
246        The new training indices given the new index order, `sort_idc`.
247
248    test_indices : np.ndarray
249        The new testing indices given the new index order, `sort_idc`.
250    """
251    sort_idc = np.concatenate([train_indices, test_indices])
252
253    train_indices = np.arange(0, train_indices.shape[0])
254    test_indices = np.arange(train_indices.shape[0], 
255                             train_indices.shape[0] + test_indices.shape[0])
256    
257    return sort_idc, train_indices, test_indices

Ensures that samples in adata obj are all training, then all testing.

Parameters
  • train_indices (np.ndarray): Indices in ad.AnnData object for training.
  • test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
  • sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
  • train_indices (np.ndarray): The new training indices given the new index order, sort_idc.
  • test_indices (np.ndarray): The new testing indices given the new index order, sort_idc.
def create_adata( X: scipy.sparse._csc.csc_matrix | numpy.ndarray | pandas.core.frame.DataFrame, feature_names: numpy.ndarray, cell_labels: numpy.ndarray, group_dict: dict, obs_names: None | numpy.ndarray = None, scale_data: bool = True, transform_data: bool = False, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int | None = None, reduction: str | None = None, tfidf: bool = False):
260def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 
261                 feature_names: np.ndarray, cell_labels: np.ndarray, 
262                 group_dict: dict, obs_names: None | np.ndarray=None, 
263                 scale_data: bool=True, transform_data: bool=False, 
264                 split_data: np.ndarray | None=None, D: int | None=None, 
265                 remove_features: bool=True, train_ratio: float=0.8, 
266                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
267                 random_state: int=1, allow_multiclass: bool = False, 
268                 class_threshold: str | int | None = None,
269                 reduction: str | None = None, tfidf: bool = False):
270    """
271    Function to create an AnnData object to carry all relevant 
272    information going forward.
273
274    Parameters
275    ----------
276    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
277        A data matrix of cells by features (sparse array 
278        recommended for large datasets).
279
280    feature_names : np.ndarray
281        Array of feature names corresponding with the features 
282        in `X`.
283
284    cell_labels : np.ndarray
285        A numpy array of cell phenotypes corresponding with 
286        the cells in `X`.
287
288    group_dict : dict 
289        Dictionary containing feature grouping information (i.e. 
290        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
291        np.array([...]), ...}`.
292
293    obs_names : None | np.ndarray
294        The cell names corresponding to `X` to be assigned to output 
295        object `.obs_names` attribute.
296
297    scale_data : bool  
298        If `True`, data matrix is log transformed and standard 
299        scaled. Default is `True`.
300
301    transform_data : bool
302        If `True`, data will be log1p transformed (recommended for 
303        counts data). Default is `False`.   
304    
305    split_data : None | np.ndarray
306        If `None`, data will be split stratified by cell labels. 
307        Else, is an array of precalculated train/test split 
308        corresponding to samples. Can include labels for entire
309        dataset to benchmark performance or for only training
310        data to classify unknown cell types (i.e. `np.array(['train', 
311        'test', ..., 'train'])`.
312
313    D : int 
314        Number of Random Fourier Features used to calculate Z. 
315        Should be a positive integer. Higher values of D will 
316        increase classification accuracy at the cost of computation 
317        time. If set to `None`, will be calculated given number of 
318        samples. 
319    
320    remove_features : bool
321        If `True`, will remove features from `X` and `feature_names` 
322        not in `group_dict` and remove features from groupings not in 
323        `feature_names`.
324
325    train_ratio : float
326        Ratio of number of training samples to entire data set. Note:
327        if a threshold is applied, the ratio training samples may 
328        decrease depending on class balance and `class_threshold`
329        parameter if `allow_multiclass = True`.
330
331    distance_metric : str
332        The pairwise distance metric used to estimate sigma. Must
333        be one of the options used in `scipy.spatial.distance.cdist`.
334
335    kernel_type : str
336        The approximated kernel function used to calculate Zs.
337        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
338
339    random_state : int
340        Integer random_state used to set the seed for 
341        reproducibilty.
342
343    allow_multiclass : bool
344        If `False`, will ensure that cell labels are binary.
345
346    class_threshold : str | int
347        Number of samples allowed in the training data for each cell
348        class in the training data. If `'median'`, the median number 
349        of cells per cell class will be the threshold for number of 
350        samples per class.
351
352    reduction: str | None
353        Choose which dimension reduction technique to perform on 
354        features within a group. 'svd' will run 
355        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
356        by an array of 1s down to 50 dimensions. 'pca' will replace 
357        each group values with 50 PCs from principal component 
358        analysis.
359        
360    tfidf: bool
361        Whether to calculate TFIDF transformation on peaks within 
362        groupings.
363        
364    Returns
365    -------
366    adata : ad.AnnData
367        AnnData with the following attributes and keys:
368
369        `adata.X` (array_like):
370            Data matrix.
371    
372        `adata.var_names` (array_like): 
373            Feature names corresponding to `adata.X`.
374
375        `adata.obs['labels']` (array_like):
376            cell classes/phenotypes from `cell_labels`.
377
378        `adata.uns['train_indices']` (array_like):
379            Indices for training data. 
380
381        `adata.uns['test_indices']` (array_like)
382            Indices for testing data.
383
384        `adata.uns['group_dict']` (dict):
385            Grouping information.
386
387        `adata.uns['seed_obj']` (np.random._generator.Generator): 
388            Seed object with seed equal to 100 * `random_state`.
389
390        `adata.uns['D']` (int):
391            Number of dimensions to scMKL with.
392
393        `adata.uns['scale_data']` (bool):
394            Whether or not data is scaled.
395
396        `adata.uns['transform_data']` (bool):
397            Whether or not data is log1p transformed.
398
399        `adata.uns['distance_metric']` (str): 
400            Distance metric as given.
401    
402        `adata.uns['kernel_type']` (str): 
403            Kernel function as given.
404
405        `adata.uns['svd']` (bool): 
406            Whether to calculate SVD reduction.
407
408        `adata.uns['tfidf']` (bool): 
409            Whether to calculate TF-IDF per grouping.
410
411    Examples
412    --------
413    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
414    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
415    >>> group_dict = np.load('hallmark_genesets.pkl', 
416    >>>                      allow_pickle = True)
417    >>> 
418    >>> adata = scmkl.create_adata(X = data_mat, 
419    ...                            feature_names = gene_names, 
420    ...                            group_dict = group_dict)
421    >>> adata
422    AnnData object with n_obs × n_vars = 1000 × 4341
423    obs: 'labels'
424    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
425    'distance_metric', 'train_indices', 'test_indices'
426    """
427
428    assert X.shape[1] == len(feature_names), ("Different number of features "
429                                              "in X than feature names")
430    
431    if not allow_multiclass:
432        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
433                                                  "2 classes")
434    if D is not None:    
435        assert isinstance(D, int) and D > 0, 'D must be a positive integer'
436
437    kernel_options = ['gaussian', 'laplacian', 'cauchy']
438    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
439                                                   "implemented. Gaussian, "
440                                                   "Laplacian, and Cauchy "
441                                                   "are the acceptable "
442                                                   "types.")
443
444    # Create adata object and add column names
445    adata = ad.AnnData(X)
446    adata.var_names = feature_names
447
448    if isinstance(obs_names, (np.ndarray)):
449        adata.obs_names = obs_names
450
451    filtered_feature_names, group_dict = filter_features(feature_names, 
452                                                          group_dict)
453    
454    # Ensuring that there are common features between feature_names and groups
455    overlap_err = ("No common features between feature names and grouping "
456                   "dict. Check grouping.")
457    assert len(filtered_feature_names) > 0, overlap_err
458
459    if remove_features:
460        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
461        adata = adata[:, filtered_feature_names]
462    
463    gc.collect()
464
465    # Add metadata to adata object
466    adata.uns['group_dict'] = group_dict
467    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
468    adata.uns['scale_data'] = scale_data
469    adata.uns['transform_data'] = transform_data
470    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
471    adata.uns['kernel_type'] = kernel_type
472    adata.uns['distance_metric'] = distance_metric
473    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
474    adata.uns['tfidf'] = tfidf
475
476    if (split_data is None):
477        assert X.shape[0] == len(cell_labels), ("Different number of cells "
478                                                "than labels")
479        adata.obs['labels'] = cell_labels
480
481        if (allow_multiclass == False):
482            split = binary_split(cell_labels, 
483                                  seed_obj = adata.uns['seed_obj'],
484                                  train_ratio = train_ratio)
485            train_indices, test_indices = split
486
487        elif (allow_multiclass == True):
488            split = multi_class_split(cell_labels, 
489                                       seed_obj = adata.uns['seed_obj'], 
490                                       class_threshold = class_threshold,
491                                       train_ratio = train_ratio)
492            train_indices, test_indices = split
493
494        adata.uns['labeled_test'] = True
495
496    else:
497        sd_err_message = "`split_data` argument must be of type np.ndarray"
498        assert isinstance(split_data, np.ndarray), sd_err_message
499        x_eq_labs = X.shape[0] == len(cell_labels)
500        train_eq_labs = X.shape[0] == len(cell_labels)
501        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
502                                            "or only for training cells")
503        
504        train_indices = np.where(split_data == 'train')[0]
505        test_indices = np.where(split_data == 'test')[0]
506
507        if len(cell_labels) == len(train_indices):
508
509            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
510            padded_cell_labels[train_indices] = cell_labels
511            padded_cell_labels[test_indices] = 'padded_test_label'
512
513            adata.obs['labels'] = padded_cell_labels
514            adata.uns['labeled_test'] = False
515
516        elif len(cell_labels) == len(split_data):
517            adata.obs['labels'] = cell_labels
518            adata.uns['labeled_test'] = True
519
520    # Ensuring all train samples are first in adata object followed by test
521    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
522                                                         test_indices)
523
524    adata = adata[sort_idx]
525
526    if not isinstance(obs_names, (np.ndarray)):
527        adata.obs = adata.obs.reset_index(drop=True)
528        adata.obs.index = adata.obs.index.astype('O')
529
530    adata.uns['train_indices'] = train_indices
531    adata.uns['test_indices'] = test_indices
532
533    if not scale_data:
534        print("WARNING: Data will not be scaled. "
535              "To change this behavior, set scale_data to True")
536    if not transform_data:
537        print("WARNING: Data will not be transformed."
538              "To change this behavior, set transform_data to True")
539
540    return adata

Function to create an AnnData object to carry all relevant information going forward.

Parameters
  • X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
  • feature_names (np.ndarray): Array of feature names corresponding with the features in X.
  • cell_labels (np.ndarray): A numpy array of cell phenotypes corresponding with the cells in X.
  • group_dict (dict): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}.
  • obs_names (None | np.ndarray): The cell names corresponding to X to be assigned to output object .obs_names attribute.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled. Default is True.
  • transform_data (bool): If True, data will be log1p transformed (recommended for counts data). Default is False.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train']).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibilty.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. 'pca' will replace each group values with 50 PCs from principal component analysis.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like) Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is scaled.

    adata.uns['transform_data'] (bool): Whether or not data is log1p transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['svd'] (bool): Whether to calculate SVD reduction.

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl', 
>>>                      allow_pickle = True)
>>> 
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'
def format_adata( adata: anndata._core.anndata.AnnData | str, cell_labels: numpy.ndarray | str, group_dict: dict | str, use_raw: bool = False, scale_data: bool = True, transform_data: bool = False, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int | None = None, reduction: str | None = None, tfidf: bool = False):
543def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
544                 group_dict: dict | str, use_raw: bool=False, 
545                 scale_data: bool=True, transform_data: bool=False, 
546                 split_data: np.ndarray | None=None, D: int | None=None, 
547                 remove_features: bool=True, train_ratio: float=0.8, 
548                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
549                 random_state: int=1, allow_multiclass: bool = False, 
550                 class_threshold: str | int | None=None, 
551                 reduction: str | None = None, tfidf: bool = False):
552    """
553    Function to format an `ad.AnnData` object to carry all relevant 
554    information going forward. `adata.obs_names` will be retained.
555
556    **NOTE: Information not needed for running `scmkl` will be 
557    removed.**
558
559    Parameters
560    ----------
561    adata : ad.AnnData
562        Object with data for `scmkl` to be applied to. Only requirment 
563        is that `.var_names` is correct and data matrix is in `adata.X` 
564        or `adata.raw.X`. A h5ad file can be provided as a `str` and it 
565        will be read in.
566
567    cell_labels : np.ndarray | str
568        If type `str`, the labels for `scmkl` to learn are captured 
569        from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 
570        phenotypes corresponding with the cells in `adata.X`.
571
572    group_dict : dict | str
573        Dictionary containing feature grouping information (i.e. 
574        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
575        np.array([...]), ...}`. A pickle file can be provided as a `str` 
576        and it will be read in.
577
578    obs_names : None | np.ndarray
579        The cell names corresponding to `X` to be assigned to output 
580        object `.obs_names` attribute.
581
582    use_raw : bool
583        If `False`, will use `adata.X` to create new `adata`. Else, 
584        will use `adata.raw.X`.
585
586    scale_data : bool  
587        If `True`, data matrix is log transformed and standard 
588        scaled. 
589
590    transform_data : bool
591        If `True`, data will be log1p transformed (recommended for 
592        counts data). Default is `False`. 
593        
594    split_data : None | np.ndarray
595        If `None`, data will be split stratified by cell labels. 
596        Else, is an array of precalculated train/test split 
597        corresponding to samples. Can include labels for entire
598        dataset to benchmark performance or for only training
599        data to classify unknown cell types (i.e. `np.array(['train', 
600        'test', ..., 'train'])`.
601
602    D : int 
603        Number of Random Fourier Features used to calculate Z. 
604        Should be a positive integer. Higher values of D will 
605        increase classification accuracy at the cost of computation 
606        time. If set to `None`, will be calculated given number of 
607        samples. 
608    
609    remove_features : bool
610        If `True`, will remove features from `X` and `feature_names` 
611        not in `group_dict` and remove features from groupings not in 
612        `feature_names`.
613
614    train_ratio : float
615        Ratio of number of training samples to entire data set. Note:
616        if a threshold is applied, the ratio training samples may 
617        decrease depending on class balance and `class_threshold`
618        parameter if `allow_multiclass = True`.
619
620    distance_metric : str
621        The pairwise distance metric used to estimate sigma. Must
622        be one of the options used in `scipy.spatial.distance.cdist`.
623
624    kernel_type : str
625        The approximated kernel function used to calculate Zs.
626        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
627
628    random_state : int
629        Integer random_state used to set the seed for 
630        reproducibilty.
631
632    allow_multiclass : bool
633        If `False`, will ensure that cell labels are binary.
634
635    class_threshold : str | int
636        Number of samples allowed in the training data for each cell
637        class in the training data. If `'median'`, the median number 
638        of cells per cell class will be the threshold for number of 
639        samples per class.
640
641    reduction: str | None
642        Choose which dimension reduction technique to perform on 
643        features within a group. 'svd' will run 
644        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
645        by an array of 1s down to 50 dimensions.
646        
647    tfidf: bool
648        Whether to calculate TFIDF transformation on peaks within 
649        groupings.
650        
651    Returns
652    -------
653    adata : ad.AnnData
654        AnnData with the following attributes and keys:
655
656        `adata.X` (array_like):
657            Data matrix.
658    
659        `adata.var_names` (array_like): 
660            Feature names corresponding to `adata.X`.
661
662        `adata.obs['labels']` (array_like):
663            cell classes/phenotypes from `cell_labels`.
664
665        `adata.uns['train_indices']` (array_like):
666            Indices for training data. 
667
668        `adata.uns['test_indices']` (array_like)
669            Indices for testing data.
670
671        `adata.uns['group_dict']` (dict):
672            Grouping information.
673
674        `adata.uns['seed_obj']` (np.random._generator.Generator): 
675            Seed object with seed equal to 100 * `random_state`.
676
677        `adata.uns['D']` (int):
678            Number of dimensions to scMKL with.
679
680        `adata.uns['scale_data']` (bool):
681            Whether or not data is scaled.
682
683        `adata.uns['transform_data']` (bool):
684            Whether or not data is log1p transformed.
685
686        `adata.uns['distance_metric']` (str): 
687            Distance metric as given.
688    
689        `adata.uns['kernel_type']` (str): 
690            Kernel function as given.
691
692        `adata.uns['svd']` (bool): 
693            Whether to calculate SVD reduction.
694
695        `adata.uns['tfidf']` (bool): 
696            Whether to calculate TF-IDF per grouping.
697
698    Examples
699    --------
700    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
701    >>> group_dict = np.load('hallmark_genesets.pkl', 
702    >>>                      allow_pickle = True)
703    >>> 
704    >>> 
705    >>> # The labels in adata.obs we want to learn are 'celltypes'
706    >>> adata = scmkl.format_adata(adata, 'celltypes', 
707    ...                            group_dict)
708    >>> adata
709    AnnData object with n_obs × n_vars = 1000 × 4341
710    obs: 'labels'
711    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
712    'distance_metric', 'train_indices', 'test_indices'
713    """
714    if str == type(adata):
715        adata = ad.read_h5ad(adata)
716
717    if str == type(group_dict):
718        group_dict = np.load(group_dict, allow_pickle=True)
719        
720    if str == type(cell_labels):
721        err_msg = f"{cell_labels} is not in `adata.obs`"
722        assert cell_labels in adata.obs.keys(), err_msg
723        cell_labels = adata.obs[cell_labels].to_numpy()
724    
725    if use_raw:
726        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
727        X = adata.raw.X
728    else:
729        X = adata.X
730
731    adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 
732                         group_dict, adata.obs_names.to_numpy().copy(), 
733                         scale_data, transform_data, split_data, D, remove_features, 
734                         train_ratio, distance_metric, kernel_type, 
735                         random_state, allow_multiclass, class_threshold, 
736                         reduction, tfidf)
737
738    return adata

Function to format an ad.AnnData object to carry all relevant information going forward. adata.obs_names will be retained.

NOTE: Information not needed for running scmkl will be removed.

Parameters
  • adata (ad.AnnData): Object with data for scmkl to be applied to. Only requirment is that .var_names is correct and data matrix is in adata.X or adata.raw.X. A h5ad file can be provided as a str and it will be read in.
  • cell_labels (np.ndarray | str): If type str, the labels for scmkl to learn are captured from adata.obs['cell_labels']. Else, a np.ndarray of cell phenotypes corresponding with the cells in adata.X.
  • group_dict (dict | str): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. A pickle file can be provided as a str and it will be read in.
  • obs_names (None | np.ndarray): The cell names corresponding to X to be assigned to output object .obs_names attribute.
  • use_raw (bool): If False, will use adata.X to create new adata. Else, will use adata.raw.X.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled.
  • transform_data (bool): If True, data will be log1p transformed (recommended for counts data). Default is False.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train']).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibilty.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like) Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is scaled.

    adata.uns['transform_data'] (bool): Whether or not data is log1p transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['svd'] (bool): Whether to calculate SVD reduction.

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl', 
>>>                      allow_pickle = True)
>>> 
>>> 
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes', 
...                            group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'