scmkl.create_adata

  1import numpy as np
  2import anndata as ad
  3import scipy
  4import pandas as pd
  5import gc
  6import warnings 
  7
  8
  9def _filter_features(feature_names: np.ndarray, group_dict: dict):
 10    """
 11    Removes features that are present in only one of `feature_names` 
 12    or `group_dict`. Features not included in `group_dict` are dropped, 
 13    and the remaining features are put in a consistent, sorted relative 
 14    order.
 15    
 16    Parameters
 17    ----------
 18    feature_names : np.ndarray
 19        Numpy array of corresponding feature names.
 20
 21    group_dict : dict
 22        Dictionary containing feature grouping information.
 23            Example: {geneset: np.array([gene_1, gene_2, ..., 
 24            gene_n])}
 25    Returns
 26    -------
 27    group_features : np.ndarray
 28        Array of features present in both `feature_names` and `group_dict`.
 29
 30    group_dict : dict
 31        Dictionary with each grouping restricted to features present in 
 32        `feature_names`; groups with fewer than two features are dropped.
 33    """
 34    group_features = set()
 35    feature_set = set(feature_names)
 36
 37    # Store all objects in dictionary in set
 38    for group in group_dict.keys():
 39        group_features.update(set(group_dict[group]))
 40
 41        # Finds intersection between group features and features in data
 42        # Converts to np.ndarray and sorts for a consistent feature order
 43        group_feats = list(feature_set.intersection(set(group_dict[group])))
 44        group_dict[group] = np.sort(np.array(group_feats))
 45
 46    # Only keeping groupings that have at least two features
 47    group_dict = {group : group_dict[group] for group in group_dict.keys()
 48                  if len(group_dict[group]) > 1}
 49
 50    group_features = np.array(list(group_features.intersection(feature_set)))
 51
 52    return group_features, group_dict
 53
 54
 55def _multi_class_split(y: np.ndarray, train_ratio: float=0.8, class_threshold: str | int='median', 
 56                       seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
 57    """
 58    Function for calculating the training and testing cell positions 
 59    for multiclass data sets.
 60
 61    Parameters
 62    ----------
 63    y : array_like
 64        Iterable of class labels corresponding to the samples in the 
 65        `ad.AnnData` object.
 66
 67    seed_obj : np.random._generator.Generator
 68        Seed used to randomly sample and split data.
 69
 70    train_ratio : float
 71        Ratio of number of training samples to entire data set. 
 72        Note: if a threshold is applied, the ratio training samples 
 73        may decrease depending on class balance and `class_threshold`
 74        parameter.
 75
 76    class_threshold : str | int
 77        If type `int`, classes with more training samples than 
 78        `class_threshold` will be down-sampled to `class_threshold` 
 79        samples. If `'median'`, classes will be down-sampled to the 
 80        median number of training samples per class.
 81
 82    Returns
 83    -------
 84    train_indices : np.ndarray
 85        Indices for training samples.
 86
 87    test_indices : np.ndarray
 88        Indices for testing samples.
 89    """
 90    uniq_labels = np.unique(y)
 91
 92    # Finding indices for each cell class
 93    class_positions = {class_ : np.where(y == class_)[0] 
 94                       for class_ in uniq_labels}
 95    
 96    # Capturing training indices while maintaining original class proportions
 97    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
 98                                              int(len(class_positions[class_])
 99                                                  * train_ratio), 
100                                              replace = False)
101                        for class_ in class_positions.keys()}
102    
103    # Capturing testing indices while maintaining original class proportions
104    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
105                                          train_samples[class_])
106                    for class_ in class_positions.keys()}
107    
108    # Applying threshold for samples per class
109    if class_threshold == 'median':
110        # Threshold at the median number of training cells per class
111
112        cells_per_class = [len(values) for values in train_samples.values()]
113        class_threshold = int(np.median(cells_per_class))
114
115        # NOTE: classes with more training cells than class_threshold are
116        # down-sampled below, so the effective train ratio can fall for
117        # imbalanced data sets
118    
119    # Down sample to class_threshold
120    for class_ in train_samples.keys():
121        if len(train_samples[class_]) > class_threshold:
122            train_samples[class_] = seed_obj.choice(train_samples[class_], 
123                                                    class_threshold, replace = False)
124            
125    train_indices = np.array([idx for class_ in train_samples.keys()
126                                  for idx in train_samples[class_]])
127    
128    test_indices = np.array([idx for class_ in test_samples.keys()
129                                 for idx in test_samples[class_]])
130    
131    return train_indices, test_indices
132
133
134def _binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
135                  train_ratio: float=0.8,
136                  seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
137    """
138    Function to calculate training and testing indices for given 
139    dataset. If train indices are given, it will calculate the test 
140    indices. If `train_indices` is `None`, both sets of indices are 
141    calculated, preserving the ratio of each label in `y`.
142
143    Parameters
144    ----------
145    y : np.ndarray
146        Numpy array of cell labels. Can have any number of classes 
147        for this function.
148
149    train_indices : np.ndarray | None
150        Optional array of pre-determined training indices
151
152    train_ratio : float
153        Ratio of samples to place in the training set.
154
155    seed_obj : np.random._generator.Generator
156        Numpy random state used for random processes. Can be 
157        specified for reproducibility or set by default.
158    
159    
160    Returns
161    -------
162    train_indices : np.ndarray
163        Array of indices of training cells.
164
165    test_indices : np.ndarray
166        Array of indices of testing cells.
167    """
168    # If train indices aren't provided
169    if train_indices is None:
170
171        unique_labels = np.unique(y)
172        train_indices = []
173
174        for label in unique_labels:
175
176            # Find indices of each unique label
177            label_indices = np.where(y == label)[0]
178
179            # Sample these indices according to train ratio
180            n = int(len(label_indices) * train_ratio)
181            train_label_indices = seed_obj.choice(label_indices, n, 
182                                                  replace = False)
183            train_indices.extend(train_label_indices)
184    else:
185        assert len(train_indices) <= len(y), ("More train indices than there "
186                                              "are samples")
187
188    train_indices = np.array(train_indices)
189
190    # Test indices are the indices not in the train_indices
191    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
192                                assume_unique = True)
193
194    return train_indices, test_indices
195
196
197def calculate_d(num_samples : int):
198    """
199    This function calculates the optimal number of dimensions for 
200    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
201    information.
202
203    Parameters
204    ----------
205    num_samples : int
206        The number of samples in the data set including both training
207        and testing sets.
208
209    Returns
210    -------
211    d : int
212        The optimal number of dimensions to run scMKL with the given 
213        data set.
214
215    Examples
216    --------
217    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
218    >>>
219    >>> num_cells = raw_counts.shape[0]
220    >>> d = scmkl.calculate_d(num_cells)
221    >>> d
222    161
223    """
224    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))
225
226    return int(np.max([d, 100]))
227
228
229def sort_samples(train_indices, test_indices):
230    """
231    Returns an ordering that places all training samples before all 
232    testing samples in the ad.AnnData object.
233
234    Parameters
235    ----------
236    train_indices : np.ndarray
237        Indices in ad.AnnData object for training.
238    
239    test_indices : np.ndarray
240        Indices in ad.AnnData object for testing.
241
242    Returns
243    -------
244    sort_idc : np.ndarray
245        Ordered indices that will sort ad.AnnData object as all 
246        training samples, then all testing.
247
248    train_indices : np.ndarray
249        The new training indices given the new index order, `sort_idc`.
250
251    test_indices : np.ndarray
252        The new testing indices given the new index order, `sort_idc`.
253    """
254    sort_idc = np.concatenate([train_indices, test_indices])
255
256    train_indices = np.arange(0, train_indices.shape[0])
257    test_indices = np.arange(train_indices.shape[0], 
258                             train_indices.shape[0] + test_indices.shape[0])
259    
260    return sort_idc, train_indices, test_indices
261
262
263def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 
264                 feature_names: np.ndarray, cell_labels: np.ndarray, 
265                 group_dict: dict, obs_names: None | np.ndarray=None, 
266                 scale_data: bool=True, split_data: np.ndarray | None=None, 
267                 D: int | None=None, remove_features: bool=True, 
268                 train_ratio: float=0.8, distance_metric: str='euclidean', 
269                 kernel_type: str='Gaussian', random_state: int=1, 
270                 allow_multiclass: bool = False, 
271                 class_threshold: str | int = 'median',
272                 reduction: str | None = None, tfidf: bool = False):
273    """
274    Function to create an AnnData object to carry all relevant 
275    information going forward.
276
277    Parameters
278    ----------
279    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
280        A data matrix of cells by features (sparse array 
281        recommended for large datasets).
282
283    feature_names : np.ndarray
284        Array of feature names corresponding with the features 
285        in `X`.
286
287    cell_labels : np.ndarray
288        A numpy array of cell phenotypes corresponding with 
289        the cells in `X`.
290
291    group_dict : dict 
292        Dictionary containing feature grouping information (i.e. 
293        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
294        np.array([...]), ...}`).
295
296    obs_names : None | np.ndarray
297        The cell names corresponding to `X` to be assigned to output 
298        object `.obs_names` attribute.
299
300    scale_data : bool  
301        If `True`, data matrix is log transformed and standard 
302        scaled. 
303        
304    split_data : None | np.ndarray
305        If `None`, data will be split stratified by cell labels. 
306        Else, is an array of precalculated train/test split 
307        corresponding to samples. Can include labels for entire
308        dataset to benchmark performance or for only training
309        data to classify unknown cell types (i.e. `np.array(['train', 
310        'test', ..., 'train'])`).
311
312    D : int 
313        Number of Random Fourier Features used to calculate Z. 
314        Should be a positive integer. Higher values of D will 
315        increase classification accuracy at the cost of computation 
316        time. If set to `None`, will be calculated given number of 
317        samples. 
318    
319    remove_features : bool
320        If `True`, will remove features from `X` and `feature_names` 
321        not in `group_dict` and remove features from groupings not in 
322        `feature_names`.
323
324    train_ratio : float
325        Ratio of number of training samples to entire data set. Note:
326        if a threshold is applied, the ratio training samples may 
327        decrease depending on class balance and `class_threshold`
328        parameter if `allow_multiclass = True`.
329
330    distance_metric : str
331        The pairwise distance metric used to estimate sigma. Must
332        be one of the options used in `scipy.spatial.distance.cdist`.
333
334    kernel_type : str
335        The approximated kernel function used to calculate Zs.
336        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
337
338    random_state : int
339        Integer random_state used to set the seed for 
340        reproducibility.
341
342    allow_multiclass : bool
343        If `False`, will ensure that cell labels are binary.
344
345    class_threshold : str | int
346        Number of samples allowed in the training data for each cell
347        class in the training data. If `'median'`, the median number 
348        of cells per cell class will be the threshold for number of 
349        samples per class.
350
351    reduction: str | None
352        Choose which dimension reduction technique to perform on 
353        features within a group. 'svd' will run 
354        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
355        by an array of 1s down to 50 dimensions.
356        
357    tfidf: bool
358        Whether to calculate TFIDF transformation on peaks within 
359        groupings.
360        
361    Returns
362    -------
363    adata : ad.AnnData
364        AnnData with the following attributes and keys:
365
366        `adata.X` (array_like):
367            Data matrix.
368    
369        `adata.var_names` (array_like): 
370            Feature names corresponding to `adata.X`.
371
372        `adata.obs['labels']` (array_like):
373            cell classes/phenotypes from `cell_labels`.
374
375        `adata.uns['train_indices']` (array_like):
376            Indices for training data. 
377
378        `adata.uns['test_indices']` (array_like):
379            Indices for testing data.
380
381        `adata.uns['group_dict']` (dict):
382            Grouping information.
383
384        `adata.uns['seed_obj']` (np.random._generator.Generator): 
385            Seed object with seed equal to 100 * `random_state`.
386
387        `adata.uns['D']` (int):
388            Number of dimensions to scMKL with.
389
390        `adata.uns['scale_data']` (bool):
391            Whether or not data is log and z-score transformed.
392
393        `adata.uns['distance_metric']` (str): 
394            Distance metric as given.
395    
396        `adata.uns['kernel_type']` (str): 
397            Kernel function as given.
398
399        `adata.uns['reduction']` (str): 
400            Dimension reduction method as given ('None' if not set).
401
402        `adata.uns['tfidf']` (bool): 
403            Whether to calculate TF-IDF per grouping.
404
405    Examples
406    --------
407    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
408    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
409    >>> group_dict = np.load('hallmark_genesets.pkl', 
410    ...                      allow_pickle = True)
411    >>> 
412    >>> adata = scmkl.create_adata(X = data_mat, 
413    ...                            feature_names = gene_names, 
414    ...                            group_dict = group_dict)
415    >>> adata
416    AnnData object with n_obs × n_vars = 1000 × 4341
417    obs: 'labels'
418    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
419    'distance_metric', 'train_indices', 'test_indices'
420    """
421
422    assert X.shape[1] == len(feature_names), ("Different number of features "
423                                              "in X than feature names")
424    
425    if not allow_multiclass:
426        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
427                                                  "2 classes")
428    if D is not None:    
429        assert isinstance(D, int) and D > 0, 'D must be a positive integer'
430
431    kernel_options = ['gaussian', 'laplacian', 'cauchy']
432    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
433                                                   "implemented. Gaussian, "
434                                                   "Laplacian, and Cauchy "
435                                                   "are the acceptable "
436                                                   "types.")
437
438    # Create adata object and add column names
439    adata = ad.AnnData(X)
440    adata.var_names = feature_names
441
442    if isinstance(obs_names, (np.ndarray)):
443        adata.obs_names = obs_names
444
445    filtered_feature_names, group_dict = _filter_features(feature_names, 
446                                                          group_dict)
447
448    if remove_features:
449        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
450        adata = adata[:, filtered_feature_names]
451    
452    gc.collect()
453
454    # Add metadata to adata object
455    adata.uns['group_dict'] = group_dict
456    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
457    adata.uns['scale_data'] = scale_data
458    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
459    adata.uns['kernel_type'] = kernel_type
460    adata.uns['distance_metric'] = distance_metric
461    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
462    adata.uns['tfidf'] = tfidf
463
464    if (split_data is None):
465        assert X.shape[0] == len(cell_labels), ("Different number of cells "
466                                                "than labels")
467        adata.obs['labels'] = cell_labels
468
469        if (allow_multiclass == False):
470            split = _binary_split(cell_labels, 
471                                  seed_obj = adata.uns['seed_obj'],
472                                  train_ratio = train_ratio)
473            train_indices, test_indices = split
474
475        elif (allow_multiclass == True):
476            split = _multi_class_split(cell_labels, 
477                                       seed_obj = adata.uns['seed_obj'], 
478                                       class_threshold = class_threshold,
479                                       train_ratio = train_ratio)
480            train_indices, test_indices = split
481
482        adata.uns['labeled_test'] = True
483
484    else:
485        x_eq_labs = X.shape[0] == len(cell_labels)
486        train_eq_labs = np.sum(split_data == 'train') == len(cell_labels)
487        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
488                                            "or only for training cells")
489        
490        train_indices = np.where(split_data == 'train')[0]
491        test_indices = np.where(split_data == 'test')[0]
492
493        if len(cell_labels) == len(train_indices):
494
495            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
496            padded_cell_labels[train_indices] = cell_labels
497            padded_cell_labels[test_indices] = 'padded_test_label'
498
499            adata.obs['labels'] = padded_cell_labels
500            adata.uns['labeled_test'] = False
501
502        elif len(cell_labels) == len(split_data):
503            adata.obs['labels'] = cell_labels
504            adata.uns['labeled_test'] = True
505
506    # Ensuring all train samples are first in adata object followed by test
507    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
508                                                         test_indices)
509
510    adata = adata[sort_idx]
511
512    if not isinstance(obs_names, (np.ndarray)):
513        adata.obs = adata.obs.reset_index(drop=True)
514        adata.obs.index = adata.obs.index.astype('O')
515
516    adata.uns['train_indices'] = train_indices
517    adata.uns['test_indices'] = test_indices
518
519    if not scale_data:
520        print("WARNING: Data will not be log transformed and scaled. "
521              "To change this behavior, set scale_data to True")
522
523    return adata
524
525
526def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
527                 group_dict: dict | str, use_raw: bool=False, 
528                 scale_data: bool=True, split_data: np.ndarray | None=None, 
529                 D: int | None=None, remove_features: bool=True, 
530                 train_ratio: float=0.8, distance_metric: str='euclidean', 
531                 kernel_type: str='Gaussian', random_state: int=1, 
532                 allow_multiclass: bool = False, 
533                 class_threshold: str | int = 'median', 
534                 reduction: str | None = None, tfidf: bool = False):
535    """
536    Function to format an `ad.AnnData` object to carry all relevant 
537    information going forward. `adata.obs_names` will be retained.
538
539    **NOTE: Information not needed for running `scmkl` will be 
540    removed.**
541
542    Parameters
543    ----------
544    adata : ad.AnnData | str
545        Object with data for `scmkl` to be applied to. The only 
546        requirement is that `.var_names` is correct and the data matrix 
547        is in `adata.X` or `adata.raw.X`. An h5ad file path can be 
548        provided as a `str` and will be read in.
549
550    cell_labels : np.ndarray | str
551        If type `str`, the labels for `scmkl` to learn are taken from 
552        the `adata.obs` column of that name. Else, a `np.ndarray` of cell 
553        phenotypes corresponding with the cells in `adata.X`.
554
555    group_dict : dict | str
556        Dictionary containing feature grouping information (i.e. 
557        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
558        np.array([...]), ...}`). A pickle file can be provided as a `str` 
559        and it will be read in.
560
565    use_raw : bool
566        If `False`, will use `adata.X` to create new `adata`. Else, 
567        will use `adata.raw.X`.
568
569    scale_data : bool  
570        If `True`, data matrix is log transformed and standard 
571        scaled. 
572        
573    split_data : None | np.ndarray
574        If `None`, data will be split stratified by cell labels. 
575        Else, is an array of precalculated train/test split 
576        corresponding to samples. Can include labels for entire
577        dataset to benchmark performance or for only training
578        data to classify unknown cell types (i.e. `np.array(['train', 
579        'test', ..., 'train'])`).
580
581    D : int 
582        Number of Random Fourier Features used to calculate Z. 
583        Should be a positive integer. Higher values of D will 
584        increase classification accuracy at the cost of computation 
585        time. If set to `None`, will be calculated given number of 
586        samples. 
587    
588    remove_features : bool
589        If `True`, will remove features from `X` and `feature_names` 
590        not in `group_dict` and remove features from groupings not in 
591        `feature_names`.
592
593    train_ratio : float
594        Ratio of number of training samples to entire data set. Note:
595        if a threshold is applied, the ratio training samples may 
596        decrease depending on class balance and `class_threshold`
597        parameter if `allow_multiclass = True`.
598
599    distance_metric : str
600        The pairwise distance metric used to estimate sigma. Must
601        be one of the options used in `scipy.spatial.distance.cdist`.
602
603    kernel_type : str
604        The approximated kernel function used to calculate Zs.
605        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
606
607    random_state : int
608        Integer random_state used to set the seed for 
609        reproducibility.
610
611    allow_multiclass : bool
612        If `False`, will ensure that cell labels are binary.
613
614    class_threshold : str | int
615        Number of samples allowed in the training data for each cell
616        class in the training data. If `'median'`, the median number 
617        of cells per cell class will be the threshold for number of 
618        samples per class.
619
620    reduction: str | None
621        Choose which dimension reduction technique to perform on 
622        features within a group. 'svd' will run 
623        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
624        by an array of 1s down to 50 dimensions.
625        
626    tfidf: bool
627        Whether to calculate TFIDF transformation on peaks within 
628        groupings.
629        
630    Returns
631    -------
632    adata : ad.AnnData
633        AnnData with the following attributes and keys:
634
635        `adata.X` (array_like):
636            Data matrix.
637    
638        `adata.var_names` (array_like): 
639            Feature names corresponding to `adata.X`.
640
641        `adata.obs['labels']` (array_like):
642            cell classes/phenotypes from `cell_labels`.
643
644        `adata.uns['train_indices']` (array_like):
645            Indices for training data. 
646
647        `adata.uns['test_indices']` (array_like):
648            Indices for testing data.
649
650        `adata.uns['group_dict']` (dict):
651            Grouping information.
652
653        `adata.uns['seed_obj']` (np.random._generator.Generator): 
654            Seed object with seed equal to 100 * `random_state`.
655
656        `adata.uns['D']` (int):
657            Number of dimensions to scMKL with.
658
659        `adata.uns['scale_data']` (bool):
660            Whether or not data is log and z-score transformed.
661
662        `adata.uns['distance_metric']` (str): 
663            Distance metric as given.
664    
665        `adata.uns['kernel_type']` (str): 
666            Kernel function as given.
667
668        `adata.uns['reduction']` (str): 
669            Dimension reduction method as given ('None' if not set).
670
671        `adata.uns['tfidf']` (bool): 
672            Whether to calculate TF-IDF per grouping.
673
674    Examples
675    --------
676    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
677    >>> group_dict = np.load('hallmark_genesets.pkl', 
678    ...                      allow_pickle = True)
679    >>> 
680    >>> 
681    >>> # The labels in adata.obs we want to learn are 'celltypes'
682    >>> adata = scmkl.format_adata(adata, 'celltypes', 
683    ...                            group_dict)
684    >>> adata
685    AnnData object with n_obs × n_vars = 1000 × 4341
686    obs: 'labels'
687    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
688    'distance_metric', 'train_indices', 'test_indices'
689    """
690    if isinstance(adata, str):
691        adata = ad.read_h5ad(adata)
692
693    if isinstance(group_dict, str):
694        group_dict = np.load(group_dict, allow_pickle=True)
695        
696    if isinstance(cell_labels, str):
697        err_msg = f"{cell_labels} is not in `adata.obs`"
698        assert cell_labels in adata.obs.keys(), err_msg
699        cell_labels = adata.obs[cell_labels].to_numpy()
700    
701    if use_raw:
702        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
703        X = adata.raw.X
704    else:
705        X = adata.X
706
707    adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 
708                         group_dict, adata.obs_names.to_numpy().copy(), 
709                         scale_data, split_data, D, remove_features, 
710                         train_ratio, distance_metric, kernel_type, 
711                         random_state, allow_multiclass, class_threshold, 
712                         reduction, tfidf)
713
714    return adata
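
The two private split helpers above stratify by label before splitting. A
minimal sketch of their behaviour, calling the helpers directly from this
module with toy label vectors (values invented purely for illustration):

>>> y = np.array(['A'] * 10 + ['B'] * 10)
>>> rng = np.random.default_rng(100)
>>> train_idx, test_idx = _binary_split(y, train_ratio = 0.8, seed_obj = rng)
>>> len(train_idx), len(test_idx)
(16, 4)
>>>
>>> # With imbalanced classes, _multi_class_split caps every class at the
>>> # median per-class training count, so some cells fall out of training
>>> y = np.array(['A'] * 10 + ['B'] * 20 + ['C'] * 30)
>>> train_idx, test_idx = _multi_class_split(y, train_ratio = 0.8, seed_obj = rng)
>>> len(train_idx), len(test_idx)
(40, 12)
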
def calculate_d(num_samples: int):

This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.

Parameters
  • num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
  • d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
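
The returned value is simply max(100, floor(sqrt(n) * ln(ln(n)))), where n is
the total number of cells. A quick check of the formula with round, made-up
sample counts:
>>> scmkl.calculate_d(1000)
100
>>> scmkl.calculate_d(10000)
222
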
def sort_samples(train_indices, test_indices):

Returns an ordering that places all training samples before all testing samples in the ad.AnnData object.

Parameters
  • train_indices (np.ndarray): Indices in ad.AnnData object for training.
  • test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
  • sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
  • train_indices (np.ndarray): The new training indices given the new index order, sort_idc.
  • test_indices (np.ndarray): The new testing indices given the new index order, sort_idc.
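
A small worked example with toy index arrays (values chosen only for
illustration; the scmkl namespace is assumed to match calculate_d above):
>>> train_idx = np.array([4, 0, 2])
>>> test_idx = np.array([1, 3])
>>> sort_idc, new_train, new_test = scmkl.sort_samples(train_idx, test_idx)
>>> sort_idc
array([4, 0, 2, 1, 3])
>>> new_train
array([0, 1, 2])
>>> new_test
array([3, 4])
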
def create_adata( X: scipy.sparse._csc.csc_matrix | numpy.ndarray | pandas.core.frame.DataFrame, feature_names: numpy.ndarray, cell_labels: numpy.ndarray, group_dict: dict, obs_names: None | numpy.ndarray = None, scale_data: bool = True, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int = 'median', reduction: str | None = None, tfidf: bool = False):

Function to create an AnnData object to carry all relevant information going forward.

Parameters
  • X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
  • feature_names (np.ndarray): Array of feature names corresponding with the features in X.
  • cell_labels (np.ndarray): A numpy array of cell phenotypes corresponding with the cells in X.
  • group_dict (dict): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}).
  • obs_names (None | np.ndarray): The cell names corresponding to X to be assigned to output object .obs_names attribute.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train'])).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibility.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like): Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is log and z-score transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['reduction'] (str): Dimension reduction method as given ('None' if not set).

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl', 
...                      allow_pickle = True)
>>> 
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'
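
A sketch of supplying a precomputed train/test assignment through split_data
instead of letting create_adata split the data itself. Note that cell_labels
is a required argument; the label vector and the 800/200 split below are
hypothetical and reuse the 1,000-cell matrix from the example above:
>>> labels = np.array(['tumor', 'normal'] * 500)
>>> split = np.array(['train'] * 800 + ['test'] * 200)
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            cell_labels = labels, 
...                            group_dict = group_dict, 
...                            split_data = split)
>>> adata.uns['train_indices'].shape[0]
800
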
def format_adata( adata: anndata._core.anndata.AnnData | str, cell_labels: numpy.ndarray | str, group_dict: dict | str, use_raw: bool = False, scale_data: bool = True, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int = 'median', reduction: str | None = None, tfidf: bool = False):

Function to format an ad.AnnData object to carry all relevant information going forward. adata.obs_names will be retained.

NOTE: Information not needed for running scmkl will be removed.

Parameters
  • adata (ad.AnnData | str): Object with data for scmkl to be applied to. The only requirement is that .var_names is correct and the data matrix is in adata.X or adata.raw.X. An h5ad file path can be provided as a str and will be read in.
  • cell_labels (np.ndarray | str): If type str, the labels for scmkl to learn are taken from the adata.obs column of that name. Else, a np.ndarray of cell phenotypes corresponding with the cells in adata.X.
  • group_dict (dict | str): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}). A pickle file can be provided as a str and will be read in.
  • use_raw (bool): If False, will use adata.X to create new adata. Else, will use adata.raw.X.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train']).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibility.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like): Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is log and z-score transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['reduction'] (str): Dimension reduction method as given ('None' if not set).

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl', 
...                      allow_pickle = True)
>>> 
>>> 
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes', 
...                            group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'
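
Since format_adata accepts file paths directly, an equivalent call that reads
the AnnData object from an h5ad file and the grouping dictionary from a
pickle file (paths mirror the example above and are hypothetical) is:
>>> adata = scmkl.format_adata('MCF7_rna.h5ad', 
...                            cell_labels = 'celltypes', 
...                            group_dict = 'hallmark_genesets.pkl')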