scmkl.create_adata

  1import numpy as np
  2import anndata as ad
  3import scipy
  4import pandas as pd
  5import gc
  6import warnings
  7
  8
  9def filter_features(feature_names: np.ndarray, group_dict: dict):
 10    """
 11    Function to remove features only in feature names or group_dict.
 12    Any features not included in group_dict will be removed from the
 13    matrix. Also puts the features in the same relative order (of 
 14    included features)
 15    
 16    Parameters
 17    ----------
 18    feature_names : np.ndarray
 19        Numpy array of corresponding feature names.
 20
 21    group_dict : dict
 22        Dictionary containing feature grouping information.
 23                 Example: {geneset: np.array(gene_1, gene_2, ..., 
 24                 gene_n)}
 25    Returns
 26    -------
 27    feature_names : np.ndarray
 28        Numpy array of corresponding feature names from group_dict.
 29
 30    group_dict : dict
 31        Dictionary containing features overlapping input grouping 
 32        information and full feature names.
 33    """
 34    group_features = set()
 35    feature_set = set(feature_names)
 36
 37    # Store all objects in dictionary in set
 38    for group in group_dict.keys():
 39        group_features.update(set(group_dict[group]))
 40
 41        # Finds intersection between group features and features in data
 42        # Converts to nd.array and sorts to preserve order of feature names
 43        group_feats = list(feature_set.intersection(set(group_dict[group])))
 44        group_dict[group] = np.sort(np.array(group_feats))
 45
 46    # Only keeping groupings that have at least two features
 47    group_dict = {group : group_dict[group] for group in group_dict.keys()
 48                  if len(group_dict[group]) > 1}
 49
 50    group_features = np.array(list(group_features.intersection(feature_set)))
 51
 52    return group_features, group_dict
 53
 54
 55def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 
 56                      class_threshold: str | int | None=None, 
 57                      train_ratio: float=0.8):
 58    """
 59    Function for calculating the training and testing cell positions 
 60    for multiclass data sets.
 61
 62    Parameters
 63    ----------
 64    y : array_like
 65        Should be an iterable object cooresponding to samples in 
 66        `ad.AnnData` object.
 67
 68    seed_obj : np.random._generator.Generator
 69        Seed used to randomly sample and split data.
 70
 71    train_ratio : float
 72        Ratio of number of training samples to entire data set. 
 73        Note: if a threshold is applied, the ratio training samples 
 74        may decrease depending on class balance and `class_threshold`
 75        parameter.
 76
 77    class_threshold : str | int
 78        If is type `int`, classes with more samples than 
 79        class_threshold will be sampled. If `'median'`, 
 80        samples will be sampled to the median number of samples per 
 81        class.
 82
 83    Returns
 84    -------
 85    train_indices : np.ndarray
 86        Indices for training samples.
 87
 88    test_indices : np.ndarray
 89        Indices for testing samples.
 90    """
 91    uniq_labels = np.unique(y)
 92
 93    # Finding indices for each cell class
 94    class_positions = {class_ : np.where(y == class_)[0] 
 95                       for class_ in uniq_labels}
 96    
 97    # Capturing training indices while maintaining original class proportions
 98    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
 99                                              int(len(class_positions[class_])
100                                                  * train_ratio), 
101                                              replace = False)
102                        for class_ in class_positions.keys()}
103    
104    # Capturing testing indices while maintaining original class proportions
105    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
106                                          train_samples[class_])
107                    for class_ in class_positions.keys()}
108    
109    # Applying threshold for samples per class
110    if class_threshold == 'median':
111        cells_per_class = [len(values) for values in train_samples.values()]
112        class_threshold = int(np.median(cells_per_class))
113
114    if isinstance(class_threshold, int):
115        # Down sample to class_threshold
116        for class_ in train_samples.keys():
117            if len(train_samples[class_]) > class_threshold:
118                train_samples[class_] = seed_obj.choice(train_samples[class_], 
119                                                        class_threshold)
120            
121    train_indices = np.array([idx for class_ in train_samples.keys()
122                                  for idx in train_samples[class_]])
123    
124    test_indices = np.array([idx for class_ in test_samples.keys()
125                                 for idx in test_samples[class_]])
126    
127    return train_indices, test_indices
128
129
130def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
131                  train_ratio: float=0.8,
132                  seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
133    """
134    Function to calculate training and testing indices for given 
135    dataset. If train indices are given, it will calculate the test 
136    indices. If train_indices == None, then it calculates both indices, 
137    preserving the ratio of each label in y
138
139    Parameters
140    ----------
141    y : np.ndarray
142        Numpy array of cell labels. Can have any number of classes 
143        for this function.
144
145    train_indices : np.ndarray | None
146        Optional array of pre-determined training indices
147
148    train_ratio : float
149        Decimal value ratio of features in training/testing sets
150
151    seed_obj : np.random._generator.Generator
152        Numpy random state used for random processes. Can be 
153        specified for reproducubility or set by default.
154    
155    
156    Returns
157    -------
158    train_indices : np.ndarray
159        Array of indices of training cells.
160
161    test_indices : np.ndarray:
162        Array of indices of testing cells.
163    """
164    # If train indices aren't provided
165    if train_indices is None:
166
167        unique_labels = np.unique(y)
168        train_indices = []
169
170        for label in unique_labels:
171
172            # Find indices of each unique label
173            label_indices = np.where(y == label)[0]
174
175            # Sample these indices according to train ratio
176            n = int(len(label_indices) * train_ratio)
177            train_label_indices = seed_obj.choice(label_indices, n, 
178                                                  replace = False)
179            train_indices.extend(train_label_indices)
180    else:
181        assert len(train_indices) <= len(y), ("More train indices than there "
182                                              "are samples")
183
184    train_indices = np.array(train_indices)
185
186    # Test indices are the indices not in the train_indices
187    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
188                                assume_unique = True)
189
190    return train_indices, test_indices
191
192
193def get_median_size(adata: ad.AnnData, other_factor: float=1.5):
194    """
195    Returns the median size of training plus testing samples per cell 
196    type. Used to calculate D for multiclass runs.
197
198    Parameters
199    ----------
200    adata : ad.AnnData
201        An ad.AnnData object with `test_indices` in `.uns` keys and 
202        `labels` in `.obs` keys.
203
204    Returns
205    -------
206    median_size : int
207        The median size of training plus testing samples across cell 
208        types. 
209    """
210    n_test = adata.uns['test_indices'].shape[0]
211
212    _, n_cts = np.unique(adata.obs['labels'][adata.uns['train_indices']], 
213                         return_counts=True)
214    sizes = [n_test + (other_factor*count) for count in n_cts]
215
216    return np.median(sizes)
217 
218
219def calculate_d(num_samples : int):
220    """
221    This function calculates the optimal number of dimensions for 
222    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
223    information.
224
225    Parameters
226    ----------
227    num_samples : int
228        The number of samples in the data set including both training
229        and testing sets.
230
231    Returns
232    -------
233    d : int
234        The optimal number of dimensions to run scMKL with the given 
235        data set.
236
237    Examples
238    --------
239    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
240    >>>
241    >>> num_cells = raw_counts.shape[0]
242    >>> d = scmkl.calculate_d(num_cells)
243    >>> d
244    161
245    """
246    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))
247
248    return int(np.max([d, 100]))
249
250
251def get_optimal_d(adata: ad.AnnData, D: int | None, allow_multiclass: bool, 
252                  other_factor: float=1.5):
253    """
254    Takes the ad.AnnData object and input D. If D is type `int`, D will 
255    be return. If D is `None` and `allow_multiclass is False`, 
256    `scmkl.calculate_d(adata.shape[0])` will be returned. Else, median 
257    size of training and testing will be calculated and 
258    `scmkl.calculate_d(median_size)` will be returned.
259
260    Parameters
261    ----------
262    adata : ad.AnnData
263        An ad.AnnData object with `test_indices` in `.uns` keys and 
264        `labels` in `.obs` keys.
265    
266    D : int | None
267        The D provided as an `int` or `None` if optimal D should be 
268        calculated.
269
270    allow_multiclass : bool
271        Should be `False` if labels are binary. Else, should be `True` 
272        indicating there are more than two classes.
273
274    Returns
275    -------
276    d : int
277        Either the input or calculated optimal d for the experiment. 
278    """
279    if D is not None:    
280        assert isinstance(D, int) and D > 0, 'D must be a positive integer.'
281        return D
282    
283    if allow_multiclass:
284        size = get_median_size(adata, other_factor)
285    else:
286        size = adata.shape[0]
287
288    return calculate_d(size)
289        
290
291def sort_samples(train_indices, test_indices):
292    """
293    Ensures that samples in adata obj are all training, then all 
294    testing.
295
296    Parameters
297    ----------
298    train_indices : np.ndarray
299        Indices in ad.AnnData object for training.
300    
301    test_indices : np.ndarray
302        Indices in ad.AnnData object for testing.
303
304    Returns
305    -------
306    sort_idc : np.ndarray
307        Ordered indices that will sort ad.AnnData object as all 
308        training samples, then all testing.
309
310    train_indices : np.ndarray
311        The new training indices given the new index order, `sort_idc`.
312
313    test_indices : np.ndarray
314        The new testing indices given the new index order, `sort_idc`.
315    """
316    sort_idc = np.concatenate([train_indices, test_indices])
317
318    train_indices = np.arange(0, train_indices.shape[0])
319    test_indices = np.arange(train_indices.shape[0], 
320                             train_indices.shape[0] + test_indices.shape[0])
321    
322    return sort_idc, train_indices, test_indices
323
324
325def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 
326                 feature_names: np.ndarray, cell_labels: np.ndarray, 
327                 group_dict: dict, obs_names: None | np.ndarray=None, 
328                 scale_data: bool=True, transform_data: bool=False, 
329                 split_data: np.ndarray | None=None, D: int | None=None, 
330                 remove_features: bool=True, train_ratio: float=0.8, 
331                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
332                 random_state: int=1, allow_multiclass: bool = False, 
333                 class_threshold: str | int | None = None,
334                 reduction: str | None = None, tfidf: bool = False, 
335                 other_factor: float=1.5):
336    """
337    Function to create an AnnData object to carry all relevant 
338    information going forward.
339
340    Parameters
341    ----------
342    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
343        A data matrix of cells by features (sparse array 
344        recommended for large datasets).
345
346    feature_names : np.ndarray
347        Array of feature names corresponding with the features 
348        in `X`.
349
350    cell_labels : np.ndarray
351        A numpy array of cell phenotypes corresponding with 
352        the cells in `X`.
353
354    group_dict : dict 
355        Dictionary containing feature grouping information (i.e. 
356        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
357        np.array([...]), ...}`.
358
359    obs_names : None | np.ndarray
360        The cell names corresponding to `X` to be assigned to output 
361        object `.obs_names` attribute.
362
363    scale_data : bool  
364        If `True`, data matrix is log transformed and standard 
365        scaled. Default is `True`.
366
367    transform_data : bool
368        If `True`, data will be log1p transformed (recommended for 
369        counts data). Default is `False`.   
370    
371    split_data : None | np.ndarray
372        If `None`, data will be split stratified by cell labels. 
373        Else, is an array of precalculated train/test split 
374        corresponding to samples. Can include labels for entire
375        dataset to benchmark performance or for only training
376        data to classify unknown cell types (i.e. `np.array(['train', 
377        'test', ..., 'train'])`.
378
379    D : int 
380        Number of Random Fourier Features used to calculate Z. 
381        Should be a positive integer. Higher values of D will 
382        increase classification accuracy at the cost of computation 
383        time. If set to `None`, will be calculated given number of 
384        samples. 
385    
386    remove_features : bool
387        If `True`, will remove features from `X` and `feature_names` 
388        not in `group_dict` and remove features from groupings not in 
389        `feature_names`.
390
391    train_ratio : float
392        Ratio of number of training samples to entire data set. Note:
393        if a threshold is applied, the ratio training samples may 
394        decrease depending on class balance and `class_threshold`
395        parameter if `allow_multiclass = True`.
396
397    distance_metric : str
398        The pairwise distance metric used to estimate sigma. Must
399        be one of the options used in `scipy.spatial.distance.cdist`.
400
401    kernel_type : str
402        The approximated kernel function used to calculate Zs.
403        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
404
405    random_state : int
406        Integer random_state used to set the seed for 
407        reproducibilty.
408
409    allow_multiclass : bool
410        If `False`, will ensure that cell labels are binary.
411
412    class_threshold : str | int
413        Number of samples allowed in the training data for each cell
414        class in the training data. If `'median'`, the median number 
415        of cells per cell class will be the threshold for number of 
416        samples per class.
417
418    reduction: str | None
419        Choose which dimension reduction technique to perform on 
420        features within a group. 'svd' will run 
421        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
422        by an array of 1s down to 50 dimensions. 'pca' will replace 
423        each group values with 50 PCs from principal component 
424        analysis.
425        
426    tfidf: bool
427        Whether to calculate TFIDF transformation on peaks within 
428        groupings.
429        
430    Returns
431    -------
432    adata : ad.AnnData
433        AnnData with the following attributes and keys:
434
435        `adata.X` (array_like):
436            Data matrix.
437    
438        `adata.var_names` (array_like): 
439            Feature names corresponding to `adata.X`.
440
441        `adata.obs['labels']` (array_like):
442            cell classes/phenotypes from `cell_labels`.
443
444        `adata.uns['train_indices']` (array_like):
445            Indices for training data. 
446
447        `adata.uns['test_indices']` (array_like)
448            Indices for testing data.
449
450        `adata.uns['group_dict']` (dict):
451            Grouping information.
452
453        `adata.uns['seed_obj']` (np.random._generator.Generator): 
454            Seed object with seed equal to 100 * `random_state`.
455
456        `adata.uns['D']` (int):
457            Number of dimensions to scMKL with.
458
459        `adata.uns['scale_data']` (bool):
460            Whether or not data is scaled.
461
462        `adata.uns['transform_data']` (bool):
463            Whether or not data is log1p transformed.
464
465        `adata.uns['distance_metric']` (str): 
466            Distance metric as given.
467    
468        `adata.uns['kernel_type']` (str): 
469            Kernel function as given.
470
471        `adata.uns['svd']` (bool): 
472            Whether to calculate SVD reduction.
473
474        `adata.uns['tfidf']` (bool): 
475            Whether to calculate TF-IDF per grouping.
476
477    Examples
478    --------
479    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
480    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
481    >>> group_dict = np.load('hallmark_genesets.pkl', 
482    >>>                      allow_pickle = True)
483    >>> 
484    >>> adata = scmkl.create_adata(X = data_mat, 
485    ...                            feature_names = gene_names, 
486    ...                            group_dict = group_dict)
487    >>> adata
488    AnnData object with n_obs × n_vars = 1000 × 4341
489    obs: 'labels'
490    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
491    'distance_metric', 'train_indices', 'test_indices'
492    """
493
494    assert X.shape[1] == len(feature_names), ("Different number of features "
495                                              "in X than feature names.")
496    
497    if not allow_multiclass:
498        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
499                                                  "2 classes.")
500
501    kernel_options = ['gaussian', 'laplacian', 'cauchy']
502    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
503                                                   "implemented. Gaussian, "
504                                                   "Laplacian, and Cauchy "
505                                                   "are the acceptable "
506                                                   "types.")
507
508    # Create adata object and add column names
509    adata = ad.AnnData(X)
510    adata.var_names = feature_names
511
512    if isinstance(obs_names, (np.ndarray)):
513        adata.obs_names = obs_names
514
515    filtered_feature_names, group_dict = filter_features(feature_names, 
516                                                          group_dict)
517    
518    # Ensuring that there are common features between feature_names and groups
519    overlap_err = ("No common features between feature names and grouping "
520                   "dict. Check grouping.")
521    assert len(filtered_feature_names) > 0, overlap_err
522
523    if remove_features:
524        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
525        adata = adata[:, filtered_feature_names]
526    
527    gc.collect()
528
529    # Add metadata to adata object
530    adata.uns['group_dict'] = group_dict
531    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
532    adata.uns['scale_data'] = scale_data
533    adata.uns['transform_data'] = transform_data
534    adata.uns['kernel_type'] = kernel_type
535    adata.uns['distance_metric'] = distance_metric
536    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
537    adata.uns['tfidf'] = tfidf
538
539    if (split_data is None):
540        assert X.shape[0] == len(cell_labels), ("Different number of cells "
541                                                "than labels")
542        adata.obs['labels'] = cell_labels
543
544        if (allow_multiclass == False):
545            split = binary_split(cell_labels, 
546                                  seed_obj = adata.uns['seed_obj'],
547                                  train_ratio = train_ratio)
548            train_indices, test_indices = split
549
550        elif (allow_multiclass == True):
551            split = multi_class_split(cell_labels, 
552                                       seed_obj = adata.uns['seed_obj'], 
553                                       class_threshold = class_threshold,
554                                       train_ratio = train_ratio)
555            train_indices, test_indices = split
556
557        adata.uns['labeled_test'] = True
558
559    else:
560        sd_err_message = "`split_data` argument must be of type np.ndarray"
561        assert isinstance(split_data, np.ndarray), sd_err_message
562        x_eq_labs = X.shape[0] == len(cell_labels)
563        train_eq_labs = X.shape[0] == len(cell_labels)
564        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
565                                            "or only for training cells")
566        
567        train_indices = np.where(split_data == 'train')[0]
568        test_indices = np.where(split_data == 'test')[0]
569
570        if len(cell_labels) == len(train_indices):
571
572            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
573            padded_cell_labels[train_indices] = cell_labels
574            padded_cell_labels[test_indices] = 'padded_test_label'
575
576            adata.obs['labels'] = padded_cell_labels
577            adata.uns['labeled_test'] = False
578
579        elif len(cell_labels) == len(split_data):
580            adata.obs['labels'] = cell_labels
581            adata.uns['labeled_test'] = True
582
583    # Ensuring all train samples are first in adata object followed by test
584    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
585                                                         test_indices)
586
587    adata = adata[sort_idx]
588
589    if not isinstance(obs_names, (np.ndarray)):
590        adata.obs = adata.obs.reset_index(drop=True)
591        adata.obs.index = adata.obs.index.astype('O')
592
593    adata.uns['train_indices'] = train_indices
594    adata.uns['test_indices'] = test_indices
595
596    adata.uns['D'] = get_optimal_d(adata, D, allow_multiclass, other_factor)
597
598    if not scale_data:
599        print("WARNING: Data will not be scaled. "
600              "To change this behavior, set scale_data to True")
601    if not transform_data:
602        print("WARNING: Data will not be transformed."
603              "To change this behavior, set transform_data to True")
604
605    return adata
606
607
608def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
609                 group_dict: dict | str, use_raw: bool=False, 
610                 scale_data: bool=True, transform_data: bool=False, 
611                 split_data: np.ndarray | None=None, D: int | None=None, 
612                 remove_features: bool=True, train_ratio: float=0.8, 
613                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
614                 random_state: int=1, allow_multiclass: bool = False, 
615                 class_threshold: str | int | None=None, 
616                 reduction: str | None = None, tfidf: bool = False):
617    """
618    Function to format an `ad.AnnData` object to carry all relevant 
619    information going forward. `adata.obs_names` will be retained.
620
621    **NOTE: Information not needed for running `scmkl` will be 
622    removed.**
623
624    Parameters
625    ----------
626    adata : ad.AnnData
627        Object with data for `scmkl` to be applied to. Only requirment 
628        is that `.var_names` is correct and data matrix is in `adata.X` 
629        or `adata.raw.X`. A h5ad file can be provided as a `str` and it 
630        will be read in.
631
632    cell_labels : np.ndarray | str
633        If type `str`, the labels for `scmkl` to learn are captured 
634        from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 
635        phenotypes corresponding with the cells in `adata.X`.
636
637    group_dict : dict | str
638        Dictionary containing feature grouping information (i.e. 
639        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
640        np.array([...]), ...}`. A pickle file can be provided as a `str` 
641        and it will be read in.
642
643    obs_names : None | np.ndarray
644        The cell names corresponding to `X` to be assigned to output 
645        object `.obs_names` attribute.
646
647    use_raw : bool
648        If `False`, will use `adata.X` to create new `adata`. Else, 
649        will use `adata.raw.X`.
650
651    scale_data : bool  
652        If `True`, data matrix is log transformed and standard 
653        scaled. 
654
655    transform_data : bool
656        If `True`, data will be log1p transformed (recommended for 
657        counts data). Default is `False`. 
658        
659    split_data : None | np.ndarray
660        If `None`, data will be split stratified by cell labels. 
661        Else, is an array of precalculated train/test split 
662        corresponding to samples. Can include labels for entire
663        dataset to benchmark performance or for only training
664        data to classify unknown cell types (i.e. `np.array(['train', 
665        'test', ..., 'train'])`.
666
667    D : int 
668        Number of Random Fourier Features used to calculate Z. 
669        Should be a positive integer. Higher values of D will 
670        increase classification accuracy at the cost of computation 
671        time. If set to `None`, will be calculated given number of 
672        samples. 
673    
674    remove_features : bool
675        If `True`, will remove features from `X` and `feature_names` 
676        not in `group_dict` and remove features from groupings not in 
677        `feature_names`.
678
679    train_ratio : float
680        Ratio of number of training samples to entire data set. Note:
681        if a threshold is applied, the ratio training samples may 
682        decrease depending on class balance and `class_threshold`
683        parameter if `allow_multiclass = True`.
684
685    distance_metric : str
686        The pairwise distance metric used to estimate sigma. Must
687        be one of the options used in `scipy.spatial.distance.cdist`.
688
689    kernel_type : str
690        The approximated kernel function used to calculate Zs.
691        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
692
693    random_state : int
694        Integer random_state used to set the seed for 
695        reproducibilty.
696
697    allow_multiclass : bool
698        If `False`, will ensure that cell labels are binary.
699
700    class_threshold : str | int
701        Number of samples allowed in the training data for each cell
702        class in the training data. If `'median'`, the median number 
703        of cells per cell class will be the threshold for number of 
704        samples per class.
705
706    reduction: str | None
707        Choose which dimension reduction technique to perform on 
708        features within a group. 'svd' will run 
709        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
710        by an array of 1s down to 50 dimensions.
711        
712    tfidf: bool
713        Whether to calculate TFIDF transformation on peaks within 
714        groupings.
715        
716    Returns
717    -------
718    adata : ad.AnnData
719        AnnData with the following attributes and keys:
720
721        `adata.X` (array_like):
722            Data matrix.
723    
724        `adata.var_names` (array_like): 
725            Feature names corresponding to `adata.X`.
726
727        `adata.obs['labels']` (array_like):
728            cell classes/phenotypes from `cell_labels`.
729
730        `adata.uns['train_indices']` (array_like):
731            Indices for training data. 
732
733        `adata.uns['test_indices']` (array_like)
734            Indices for testing data.
735
736        `adata.uns['group_dict']` (dict):
737            Grouping information.
738
739        `adata.uns['seed_obj']` (np.random._generator.Generator): 
740            Seed object with seed equal to 100 * `random_state`.
741
742        `adata.uns['D']` (int):
743            Number of dimensions to scMKL with.
744
745        `adata.uns['scale_data']` (bool):
746            Whether or not data is scaled.
747
748        `adata.uns['transform_data']` (bool):
749            Whether or not data is log1p transformed.
750
751        `adata.uns['distance_metric']` (str): 
752            Distance metric as given.
753    
754        `adata.uns['kernel_type']` (str): 
755            Kernel function as given.
756
757        `adata.uns['svd']` (bool): 
758            Whether to calculate SVD reduction.
759
760        `adata.uns['tfidf']` (bool): 
761            Whether to calculate TF-IDF per grouping.
762
763    Examples
764    --------
765    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
766    >>> group_dict = np.load('hallmark_genesets.pkl', 
767    >>>                      allow_pickle = True)
768    >>> 
769    >>> 
770    >>> # The labels in adata.obs we want to learn are 'celltypes'
771    >>> adata = scmkl.format_adata(adata, 'celltypes', 
772    ...                            group_dict)
773    >>> adata
774    AnnData object with n_obs × n_vars = 1000 × 4341
775    obs: 'labels'
776    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
777    'distance_metric', 'train_indices', 'test_indices'
778    """
779    if str == type(adata):
780        adata = ad.read_h5ad(adata)
781
782    if str == type(group_dict):
783        group_dict = np.load(group_dict, allow_pickle=True)
784        
785    if str == type(cell_labels):
786        err_msg = f"{cell_labels} is not in `adata.obs`"
787        assert cell_labels in adata.obs.keys(), err_msg
788        cell_labels = adata.obs[cell_labels].to_numpy()
789    
790    if use_raw:
791        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
792        X = adata.raw.X
793    else:
794        X = adata.X
795
796    adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 
797                         group_dict, adata.obs_names.to_numpy().copy(), 
798                         scale_data, transform_data, split_data, D, remove_features, 
799                         train_ratio, distance_metric, kernel_type, 
800                         random_state, allow_multiclass, class_threshold, 
801                         reduction, tfidf)
802
803    return adata
def filter_features(feature_names: numpy.ndarray, group_dict: dict):
10def filter_features(feature_names: np.ndarray, group_dict: dict):
11    """
12    Function to remove features only in feature names or group_dict.
13    Any features not included in group_dict will be removed from the
14    matrix. Also puts the features in the same relative order (of 
15    included features)
16    
17    Parameters
18    ----------
19    feature_names : np.ndarray
20        Numpy array of corresponding feature names.
21
22    group_dict : dict
23        Dictionary containing feature grouping information.
24                 Example: {geneset: np.array(gene_1, gene_2, ..., 
25                 gene_n)}
26    Returns
27    -------
28    feature_names : np.ndarray
29        Numpy array of corresponding feature names from group_dict.
30
31    group_dict : dict
32        Dictionary containing features overlapping input grouping 
33        information and full feature names.
34    """
35    group_features = set()
36    feature_set = set(feature_names)
37
38    # Store all objects in dictionary in set
39    for group in group_dict.keys():
40        group_features.update(set(group_dict[group]))
41
42        # Finds intersection between group features and features in data
43        # Converts to nd.array and sorts to preserve order of feature names
44        group_feats = list(feature_set.intersection(set(group_dict[group])))
45        group_dict[group] = np.sort(np.array(group_feats))
46
47    # Only keeping groupings that have at least two features
48    group_dict = {group : group_dict[group] for group in group_dict.keys()
49                  if len(group_dict[group]) > 1}
50
51    group_features = np.array(list(group_features.intersection(feature_set)))
52
53    return group_features, group_dict

Function to remove features only in feature names or group_dict. Any features not included in group_dict will be removed from the matrix. Also puts the features in the same relative order (of included features)

Parameters
  • feature_names (np.ndarray): Numpy array of corresponding feature names.
  • group_dict (dict): Dictionary containing feature grouping information. Example: {geneset: np.array(gene_1, gene_2, ..., gene_n)}
Returns
  • feature_names (np.ndarray): Numpy array of corresponding feature names from group_dict.
  • group_dict (dict): Dictionary containing features overlapping input grouping information and full feature names.
def multi_class_split( y: numpy.ndarray, seed_obj: numpy.random._generator.Generator, class_threshold: str | int | None = None, train_ratio: float = 0.8):
 56def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 
 57                      class_threshold: str | int | None=None, 
 58                      train_ratio: float=0.8):
 59    """
 60    Function for calculating the training and testing cell positions 
 61    for multiclass data sets.
 62
 63    Parameters
 64    ----------
 65    y : array_like
 66        Should be an iterable object cooresponding to samples in 
 67        `ad.AnnData` object.
 68
 69    seed_obj : np.random._generator.Generator
 70        Seed used to randomly sample and split data.
 71
 72    train_ratio : float
 73        Ratio of number of training samples to entire data set. 
 74        Note: if a threshold is applied, the ratio training samples 
 75        may decrease depending on class balance and `class_threshold`
 76        parameter.
 77
 78    class_threshold : str | int
 79        If is type `int`, classes with more samples than 
 80        class_threshold will be sampled. If `'median'`, 
 81        samples will be sampled to the median number of samples per 
 82        class.
 83
 84    Returns
 85    -------
 86    train_indices : np.ndarray
 87        Indices for training samples.
 88
 89    test_indices : np.ndarray
 90        Indices for testing samples.
 91    """
 92    uniq_labels = np.unique(y)
 93
 94    # Finding indices for each cell class
 95    class_positions = {class_ : np.where(y == class_)[0] 
 96                       for class_ in uniq_labels}
 97    
 98    # Capturing training indices while maintaining original class proportions
 99    train_samples = {class_ : seed_obj.choice(class_positions[class_], 
100                                              int(len(class_positions[class_])
101                                                  * train_ratio), 
102                                              replace = False)
103                        for class_ in class_positions.keys()}
104    
105    # Capturing testing indices while maintaining original class proportions
106    test_samples = {class_ : np.setdiff1d(class_positions[class_], 
107                                          train_samples[class_])
108                    for class_ in class_positions.keys()}
109    
110    # Applying threshold for samples per class
111    if class_threshold == 'median':
112        cells_per_class = [len(values) for values in train_samples.values()]
113        class_threshold = int(np.median(cells_per_class))
114
115    if isinstance(class_threshold, int):
116        # Down sample to class_threshold
117        for class_ in train_samples.keys():
118            if len(train_samples[class_]) > class_threshold:
119                train_samples[class_] = seed_obj.choice(train_samples[class_], 
120                                                        class_threshold)
121            
122    train_indices = np.array([idx for class_ in train_samples.keys()
123                                  for idx in train_samples[class_]])
124    
125    test_indices = np.array([idx for class_ in test_samples.keys()
126                                 for idx in test_samples[class_]])
127    
128    return train_indices, test_indices

Function for calculating the training and testing cell positions for multiclass data sets.

Parameters
  • y (array_like): Should be an iterable object cooresponding to samples in ad.AnnData object.
  • seed_obj (np.random._generator.Generator): Seed used to randomly sample and split data.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter.
  • class_threshold (str | int): If is type int, classes with more samples than class_threshold will be sampled. If 'median', samples will be sampled to the median number of samples per class.
Returns
  • train_indices (np.ndarray): Indices for training samples.
  • test_indices (np.ndarray): Indices for testing samples.
def binary_split( y: numpy.ndarray, train_indices: numpy.ndarray | None = None, train_ratio: float = 0.8, seed_obj: numpy.random._generator.Generator = Generator(PCG64) at 0x7726C990D460):
131def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
132                  train_ratio: float=0.8,
133                  seed_obj: np.random._generator.Generator=np.random.default_rng(100)):
134    """
135    Function to calculate training and testing indices for given 
136    dataset. If train indices are given, it will calculate the test 
137    indices. If train_indices == None, then it calculates both indices, 
138    preserving the ratio of each label in y
139
140    Parameters
141    ----------
142    y : np.ndarray
143        Numpy array of cell labels. Can have any number of classes 
144        for this function.
145
146    train_indices : np.ndarray | None
147        Optional array of pre-determined training indices
148
149    train_ratio : float
150        Decimal value ratio of features in training/testing sets
151
152    seed_obj : np.random._generator.Generator
153        Numpy random state used for random processes. Can be 
154        specified for reproducubility or set by default.
155    
156    
157    Returns
158    -------
159    train_indices : np.ndarray
160        Array of indices of training cells.
161
162    test_indices : np.ndarray:
163        Array of indices of testing cells.
164    """
165    # If train indices aren't provided
166    if train_indices is None:
167
168        unique_labels = np.unique(y)
169        train_indices = []
170
171        for label in unique_labels:
172
173            # Find indices of each unique label
174            label_indices = np.where(y == label)[0]
175
176            # Sample these indices according to train ratio
177            n = int(len(label_indices) * train_ratio)
178            train_label_indices = seed_obj.choice(label_indices, n, 
179                                                  replace = False)
180            train_indices.extend(train_label_indices)
181    else:
182        assert len(train_indices) <= len(y), ("More train indices than there "
183                                              "are samples")
184
185    train_indices = np.array(train_indices)
186
187    # Test indices are the indices not in the train_indices
188    test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 
189                                assume_unique = True)
190
191    return train_indices, test_indices

Function to calculate training and testing indices for given dataset. If train indices are given, it will calculate the test indices. If train_indices == None, then it calculates both indices, preserving the ratio of each label in y

Parameters
  • y (np.ndarray): Numpy array of cell labels. Can have any number of classes for this function.
  • train_indices (np.ndarray | None): Optional array of pre-determined training indices
  • train_ratio (float): Decimal value ratio of features in training/testing sets
  • seed_obj (np.random._generator.Generator): Numpy random state used for random processes. Can be specified for reproducubility or set by default.
Returns
  • train_indices (np.ndarray): Array of indices of training cells.
  • test_indices (np.ndarray:): Array of indices of testing cells.
def get_median_size(adata: anndata._core.anndata.AnnData, other_factor: float = 1.5):
194def get_median_size(adata: ad.AnnData, other_factor: float=1.5):
195    """
196    Returns the median size of training plus testing samples per cell 
197    type. Used to calculate D for multiclass runs.
198
199    Parameters
200    ----------
201    adata : ad.AnnData
202        An ad.AnnData object with `test_indices` in `.uns` keys and 
203        `labels` in `.obs` keys.
204
205    Returns
206    -------
207    median_size : int
208        The median size of training plus testing samples across cell 
209        types. 
210    """
211    n_test = adata.uns['test_indices'].shape[0]
212
213    _, n_cts = np.unique(adata.obs['labels'][adata.uns['train_indices']], 
214                         return_counts=True)
215    sizes = [n_test + (other_factor*count) for count in n_cts]
216
217    return np.median(sizes)

Returns the median size of training plus testing samples per cell type. Used to calculate D for multiclass runs.

Parameters
  • adata (ad.AnnData): An ad.AnnData object with test_indices in .uns keys and labels in .obs keys.
Returns
  • median_size (int): The median size of training plus testing samples across cell types.
def calculate_d(num_samples: int):
220def calculate_d(num_samples : int):
221    """
222    This function calculates the optimal number of dimensions for 
223    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
224    information.
225
226    Parameters
227    ----------
228    num_samples : int
229        The number of samples in the data set including both training
230        and testing sets.
231
232    Returns
233    -------
234    d : int
235        The optimal number of dimensions to run scMKL with the given 
236        data set.
237
238    Examples
239    --------
240    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
241    >>>
242    >>> num_cells = raw_counts.shape[0]
243    >>> d = scmkl.calculate_d(num_cells)
244    >>> d
245    161
246    """
247    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))
248
249    return int(np.max([d, 100]))

This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.

Parameters
  • num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
  • d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
def get_optimal_d( adata: anndata._core.anndata.AnnData, D: int | None, allow_multiclass: bool, other_factor: float = 1.5):
252def get_optimal_d(adata: ad.AnnData, D: int | None, allow_multiclass: bool, 
253                  other_factor: float=1.5):
254    """
255    Takes the ad.AnnData object and input D. If D is type `int`, D will 
256    be return. If D is `None` and `allow_multiclass is False`, 
257    `scmkl.calculate_d(adata.shape[0])` will be returned. Else, median 
258    size of training and testing will be calculated and 
259    `scmkl.calculate_d(median_size)` will be returned.
260
261    Parameters
262    ----------
263    adata : ad.AnnData
264        An ad.AnnData object with `test_indices` in `.uns` keys and 
265        `labels` in `.obs` keys.
266    
267    D : int | None
268        The D provided as an `int` or `None` if optimal D should be 
269        calculated.
270
271    allow_multiclass : bool
272        Should be `False` if labels are binary. Else, should be `True` 
273        indicating there are more than two classes.
274
275    Returns
276    -------
277    d : int
278        Either the input or calculated optimal d for the experiment. 
279    """
280    if D is not None:    
281        assert isinstance(D, int) and D > 0, 'D must be a positive integer.'
282        return D
283    
284    if allow_multiclass:
285        size = get_median_size(adata, other_factor)
286    else:
287        size = adata.shape[0]
288
289    return calculate_d(size)

Takes the ad.AnnData object and input D. If D is type int, D will be return. If D is None and allow_multiclass is False, scmkl.calculate_d(adata.shape[0]) will be returned. Else, median size of training and testing will be calculated and scmkl.calculate_d(median_size) will be returned.

Parameters
  • adata (ad.AnnData): An ad.AnnData object with test_indices in .uns keys and labels in .obs keys.
  • D (int | None): The D provided as an int or None if optimal D should be calculated.
  • allow_multiclass (bool): Should be False if labels are binary. Else, should be True indicating there are more than two classes.
Returns
  • d (int): Either the input or calculated optimal d for the experiment.
def sort_samples(train_indices, test_indices):
292def sort_samples(train_indices, test_indices):
293    """
294    Ensures that samples in adata obj are all training, then all 
295    testing.
296
297    Parameters
298    ----------
299    train_indices : np.ndarray
300        Indices in ad.AnnData object for training.
301    
302    test_indices : np.ndarray
303        Indices in ad.AnnData object for testing.
304
305    Returns
306    -------
307    sort_idc : np.ndarray
308        Ordered indices that will sort ad.AnnData object as all 
309        training samples, then all testing.
310
311    train_indices : np.ndarray
312        The new training indices given the new index order, `sort_idc`.
313
314    test_indices : np.ndarray
315        The new testing indices given the new index order, `sort_idc`.
316    """
317    sort_idc = np.concatenate([train_indices, test_indices])
318
319    train_indices = np.arange(0, train_indices.shape[0])
320    test_indices = np.arange(train_indices.shape[0], 
321                             train_indices.shape[0] + test_indices.shape[0])
322    
323    return sort_idc, train_indices, test_indices

Ensures that samples in adata obj are all training, then all testing.

Parameters
  • train_indices (np.ndarray): Indices in ad.AnnData object for training.
  • test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
  • sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
  • train_indices (np.ndarray): The new training indices given the new index order, sort_idc.
  • test_indices (np.ndarray): The new testing indices given the new index order, sort_idc.
def create_adata( X: scipy.sparse._csc.csc_matrix | numpy.ndarray | pandas.core.frame.DataFrame, feature_names: numpy.ndarray, cell_labels: numpy.ndarray, group_dict: dict, obs_names: None | numpy.ndarray = None, scale_data: bool = True, transform_data: bool = False, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int | None = None, reduction: str | None = None, tfidf: bool = False, other_factor: float = 1.5):
326def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 
327                 feature_names: np.ndarray, cell_labels: np.ndarray, 
328                 group_dict: dict, obs_names: None | np.ndarray=None, 
329                 scale_data: bool=True, transform_data: bool=False, 
330                 split_data: np.ndarray | None=None, D: int | None=None, 
331                 remove_features: bool=True, train_ratio: float=0.8, 
332                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
333                 random_state: int=1, allow_multiclass: bool = False, 
334                 class_threshold: str | int | None = None,
335                 reduction: str | None = None, tfidf: bool = False, 
336                 other_factor: float=1.5):
337    """
338    Function to create an AnnData object to carry all relevant 
339    information going forward.
340
341    Parameters
342    ----------
343    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
344        A data matrix of cells by features (sparse array 
345        recommended for large datasets).
346
347    feature_names : np.ndarray
348        Array of feature names corresponding with the features 
349        in `X`.
350
351    cell_labels : np.ndarray
352        A numpy array of cell phenotypes corresponding with 
353        the cells in `X`.
354
355    group_dict : dict 
356        Dictionary containing feature grouping information (i.e. 
357        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
358        np.array([...]), ...}`.
359
360    obs_names : None | np.ndarray
361        The cell names corresponding to `X` to be assigned to output 
362        object `.obs_names` attribute.
363
364    scale_data : bool  
365        If `True`, data matrix is log transformed and standard 
366        scaled. Default is `True`.
367
368    transform_data : bool
369        If `True`, data will be log1p transformed (recommended for 
370        counts data). Default is `False`.   
371    
372    split_data : None | np.ndarray
373        If `None`, data will be split stratified by cell labels. 
374        Else, is an array of precalculated train/test split 
375        corresponding to samples. Can include labels for entire
376        dataset to benchmark performance or for only training
377        data to classify unknown cell types (i.e. `np.array(['train', 
378        'test', ..., 'train'])`.
379
380    D : int 
381        Number of Random Fourier Features used to calculate Z. 
382        Should be a positive integer. Higher values of D will 
383        increase classification accuracy at the cost of computation 
384        time. If set to `None`, will be calculated given number of 
385        samples. 
386    
387    remove_features : bool
388        If `True`, will remove features from `X` and `feature_names` 
389        not in `group_dict` and remove features from groupings not in 
390        `feature_names`.
391
392    train_ratio : float
393        Ratio of number of training samples to entire data set. Note:
394        if a threshold is applied, the ratio training samples may 
395        decrease depending on class balance and `class_threshold`
396        parameter if `allow_multiclass = True`.
397
398    distance_metric : str
399        The pairwise distance metric used to estimate sigma. Must
400        be one of the options used in `scipy.spatial.distance.cdist`.
401
402    kernel_type : str
403        The approximated kernel function used to calculate Zs.
404        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
405
406    random_state : int
407        Integer random_state used to set the seed for 
408        reproducibilty.
409
410    allow_multiclass : bool
411        If `False`, will ensure that cell labels are binary.
412
413    class_threshold : str | int
414        Number of samples allowed in the training data for each cell
415        class in the training data. If `'median'`, the median number 
416        of cells per cell class will be the threshold for number of 
417        samples per class.
418
419    reduction: str | None
420        Choose which dimension reduction technique to perform on 
421        features within a group. 'svd' will run 
422        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
423        by an array of 1s down to 50 dimensions. 'pca' will replace 
424        each group values with 50 PCs from principal component 
425        analysis.
426        
427    tfidf: bool
428        Whether to calculate TFIDF transformation on peaks within 
429        groupings.
430        
431    Returns
432    -------
433    adata : ad.AnnData
434        AnnData with the following attributes and keys:
435
436        `adata.X` (array_like):
437            Data matrix.
438    
439        `adata.var_names` (array_like): 
440            Feature names corresponding to `adata.X`.
441
442        `adata.obs['labels']` (array_like):
443            cell classes/phenotypes from `cell_labels`.
444
445        `adata.uns['train_indices']` (array_like):
446            Indices for training data. 
447
448        `adata.uns['test_indices']` (array_like)
449            Indices for testing data.
450
451        `adata.uns['group_dict']` (dict):
452            Grouping information.
453
454        `adata.uns['seed_obj']` (np.random._generator.Generator): 
455            Seed object with seed equal to 100 * `random_state`.
456
457        `adata.uns['D']` (int):
458            Number of dimensions to scMKL with.
459
460        `adata.uns['scale_data']` (bool):
461            Whether or not data is scaled.
462
463        `adata.uns['transform_data']` (bool):
464            Whether or not data is log1p transformed.
465
466        `adata.uns['distance_metric']` (str): 
467            Distance metric as given.
468    
469        `adata.uns['kernel_type']` (str): 
470            Kernel function as given.
471
472        `adata.uns['svd']` (bool): 
473            Whether to calculate SVD reduction.
474
475        `adata.uns['tfidf']` (bool): 
476            Whether to calculate TF-IDF per grouping.
477
478    Examples
479    --------
480    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
481    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
482    >>> group_dict = np.load('hallmark_genesets.pkl', 
483    >>>                      allow_pickle = True)
484    >>> 
485    >>> adata = scmkl.create_adata(X = data_mat, 
486    ...                            feature_names = gene_names, 
487    ...                            group_dict = group_dict)
488    >>> adata
489    AnnData object with n_obs × n_vars = 1000 × 4341
490    obs: 'labels'
491    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
492    'distance_metric', 'train_indices', 'test_indices'
493    """
494
495    assert X.shape[1] == len(feature_names), ("Different number of features "
496                                              "in X than feature names.")
497    
498    if not allow_multiclass:
499        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
500                                                  "2 classes.")
501
502    kernel_options = ['gaussian', 'laplacian', 'cauchy']
503    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
504                                                   "implemented. Gaussian, "
505                                                   "Laplacian, and Cauchy "
506                                                   "are the acceptable "
507                                                   "types.")
508
509    # Create adata object and add column names
510    adata = ad.AnnData(X)
511    adata.var_names = feature_names
512
513    if isinstance(obs_names, (np.ndarray)):
514        adata.obs_names = obs_names
515
516    filtered_feature_names, group_dict = filter_features(feature_names, 
517                                                          group_dict)
518    
519    # Ensuring that there are common features between feature_names and groups
520    overlap_err = ("No common features between feature names and grouping "
521                   "dict. Check grouping.")
522    assert len(filtered_feature_names) > 0, overlap_err
523
524    if remove_features:
525        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
526        adata = adata[:, filtered_feature_names]
527    
528    gc.collect()
529
530    # Add metadata to adata object
531    adata.uns['group_dict'] = group_dict
532    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
533    adata.uns['scale_data'] = scale_data
534    adata.uns['transform_data'] = transform_data
535    adata.uns['kernel_type'] = kernel_type
536    adata.uns['distance_metric'] = distance_metric
537    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
538    adata.uns['tfidf'] = tfidf
539
540    if (split_data is None):
541        assert X.shape[0] == len(cell_labels), ("Different number of cells "
542                                                "than labels")
543        adata.obs['labels'] = cell_labels
544
545        if (allow_multiclass == False):
546            split = binary_split(cell_labels, 
547                                  seed_obj = adata.uns['seed_obj'],
548                                  train_ratio = train_ratio)
549            train_indices, test_indices = split
550
551        elif (allow_multiclass == True):
552            split = multi_class_split(cell_labels, 
553                                       seed_obj = adata.uns['seed_obj'], 
554                                       class_threshold = class_threshold,
555                                       train_ratio = train_ratio)
556            train_indices, test_indices = split
557
558        adata.uns['labeled_test'] = True
559
560    else:
561        sd_err_message = "`split_data` argument must be of type np.ndarray"
562        assert isinstance(split_data, np.ndarray), sd_err_message
563        x_eq_labs = X.shape[0] == len(cell_labels)
564        train_eq_labs = X.shape[0] == len(cell_labels)
565        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
566                                            "or only for training cells")
567        
568        train_indices = np.where(split_data == 'train')[0]
569        test_indices = np.where(split_data == 'test')[0]
570
571        if len(cell_labels) == len(train_indices):
572
573            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
574            padded_cell_labels[train_indices] = cell_labels
575            padded_cell_labels[test_indices] = 'padded_test_label'
576
577            adata.obs['labels'] = padded_cell_labels
578            adata.uns['labeled_test'] = False
579
580        elif len(cell_labels) == len(split_data):
581            adata.obs['labels'] = cell_labels
582            adata.uns['labeled_test'] = True
583
584    # Ensuring all train samples are first in adata object followed by test
585    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
586                                                         test_indices)
587
588    adata = adata[sort_idx]
589
590    if not isinstance(obs_names, (np.ndarray)):
591        adata.obs = adata.obs.reset_index(drop=True)
592        adata.obs.index = adata.obs.index.astype('O')
593
594    adata.uns['train_indices'] = train_indices
595    adata.uns['test_indices'] = test_indices
596
597    adata.uns['D'] = get_optimal_d(adata, D, allow_multiclass, other_factor)
598
599    if not scale_data:
600        print("WARNING: Data will not be scaled. "
601              "To change this behavior, set scale_data to True")
602    if not transform_data:
603        print("WARNING: Data will not be transformed."
604              "To change this behavior, set transform_data to True")
605
606    return adata

Function to create an AnnData object to carry all relevant information going forward.

Parameters
  • X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
  • feature_names (np.ndarray): Array of feature names corresponding with the features in X.
  • cell_labels (np.ndarray): A numpy array of cell phenotypes corresponding with the cells in X.
  • group_dict (dict): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}.
  • obs_names (None | np.ndarray): The cell names corresponding to X to be assigned to output object .obs_names attribute.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled. Default is True.
  • transform_data (bool): If True, data will be log1p transformed (recommended for counts data). Default is False.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train']).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibilty.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. 'pca' will replace each group values with 50 PCs from principal component analysis.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like) Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is scaled.

    adata.uns['transform_data'] (bool): Whether or not data is log1p transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['svd'] (bool): Whether to calculate SVD reduction.

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl', 
>>>                      allow_pickle = True)
>>> 
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'
def format_adata( adata: anndata._core.anndata.AnnData | str, cell_labels: numpy.ndarray | str, group_dict: dict | str, use_raw: bool = False, scale_data: bool = True, transform_data: bool = False, split_data: numpy.ndarray | None = None, D: int | None = None, remove_features: bool = True, train_ratio: float = 0.8, distance_metric: str = 'euclidean', kernel_type: str = 'Gaussian', random_state: int = 1, allow_multiclass: bool = False, class_threshold: str | int | None = None, reduction: str | None = None, tfidf: bool = False):
609def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
610                 group_dict: dict | str, use_raw: bool=False, 
611                 scale_data: bool=True, transform_data: bool=False, 
612                 split_data: np.ndarray | None=None, D: int | None=None, 
613                 remove_features: bool=True, train_ratio: float=0.8, 
614                 distance_metric: str='euclidean', kernel_type: str='Gaussian', 
615                 random_state: int=1, allow_multiclass: bool = False, 
616                 class_threshold: str | int | None=None, 
617                 reduction: str | None = None, tfidf: bool = False):
618    """
619    Function to format an `ad.AnnData` object to carry all relevant 
620    information going forward. `adata.obs_names` will be retained.
621
622    **NOTE: Information not needed for running `scmkl` will be 
623    removed.**
624
625    Parameters
626    ----------
627    adata : ad.AnnData
628        Object with data for `scmkl` to be applied to. Only requirment 
629        is that `.var_names` is correct and data matrix is in `adata.X` 
630        or `adata.raw.X`. A h5ad file can be provided as a `str` and it 
631        will be read in.
632
633    cell_labels : np.ndarray | str
634        If type `str`, the labels for `scmkl` to learn are captured 
635        from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 
636        phenotypes corresponding with the cells in `adata.X`.
637
638    group_dict : dict | str
639        Dictionary containing feature grouping information (i.e. 
640        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
641        np.array([...]), ...}`. A pickle file can be provided as a `str` 
642        and it will be read in.
643
644    obs_names : None | np.ndarray
645        The cell names corresponding to `X` to be assigned to output 
646        object `.obs_names` attribute.
647
648    use_raw : bool
649        If `False`, will use `adata.X` to create new `adata`. Else, 
650        will use `adata.raw.X`.
651
652    scale_data : bool  
653        If `True`, data matrix is log transformed and standard 
654        scaled. 
655
656    transform_data : bool
657        If `True`, data will be log1p transformed (recommended for 
658        counts data). Default is `False`. 
659        
660    split_data : None | np.ndarray
661        If `None`, data will be split stratified by cell labels. 
662        Else, is an array of precalculated train/test split 
663        corresponding to samples. Can include labels for entire
664        dataset to benchmark performance or for only training
665        data to classify unknown cell types (i.e. `np.array(['train', 
666        'test', ..., 'train'])`.
667
668    D : int 
669        Number of Random Fourier Features used to calculate Z. 
670        Should be a positive integer. Higher values of D will 
671        increase classification accuracy at the cost of computation 
672        time. If set to `None`, will be calculated given number of 
673        samples. 
674    
675    remove_features : bool
676        If `True`, will remove features from `X` and `feature_names` 
677        not in `group_dict` and remove features from groupings not in 
678        `feature_names`.
679
680    train_ratio : float
681        Ratio of number of training samples to entire data set. Note:
682        if a threshold is applied, the ratio training samples may 
683        decrease depending on class balance and `class_threshold`
684        parameter if `allow_multiclass = True`.
685
686    distance_metric : str
687        The pairwise distance metric used to estimate sigma. Must
688        be one of the options used in `scipy.spatial.distance.cdist`.
689
690    kernel_type : str
691        The approximated kernel function used to calculate Zs.
692        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.
693
694    random_state : int
695        Integer random_state used to set the seed for 
696        reproducibilty.
697
698    allow_multiclass : bool
699        If `False`, will ensure that cell labels are binary.
700
701    class_threshold : str | int
702        Number of samples allowed in the training data for each cell
703        class in the training data. If `'median'`, the median number 
704        of cells per cell class will be the threshold for number of 
705        samples per class.
706
707    reduction: str | None
708        Choose which dimension reduction technique to perform on 
709        features within a group. 'svd' will run 
710        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
711        by an array of 1s down to 50 dimensions.
712        
713    tfidf: bool
714        Whether to calculate TFIDF transformation on peaks within 
715        groupings.
716        
717    Returns
718    -------
719    adata : ad.AnnData
720        AnnData with the following attributes and keys:
721
722        `adata.X` (array_like):
723            Data matrix.
724    
725        `adata.var_names` (array_like): 
726            Feature names corresponding to `adata.X`.
727
728        `adata.obs['labels']` (array_like):
729            cell classes/phenotypes from `cell_labels`.
730
731        `adata.uns['train_indices']` (array_like):
732            Indices for training data. 
733
734        `adata.uns['test_indices']` (array_like)
735            Indices for testing data.
736
737        `adata.uns['group_dict']` (dict):
738            Grouping information.
739
740        `adata.uns['seed_obj']` (np.random._generator.Generator): 
741            Seed object with seed equal to 100 * `random_state`.
742
743        `adata.uns['D']` (int):
744            Number of dimensions to scMKL with.
745
746        `adata.uns['scale_data']` (bool):
747            Whether or not data is scaled.
748
749        `adata.uns['transform_data']` (bool):
750            Whether or not data is log1p transformed.
751
752        `adata.uns['distance_metric']` (str): 
753            Distance metric as given.
754    
755        `adata.uns['kernel_type']` (str): 
756            Kernel function as given.
757
758        `adata.uns['svd']` (bool): 
759            Whether to calculate SVD reduction.
760
761        `adata.uns['tfidf']` (bool): 
762            Whether to calculate TF-IDF per grouping.
763
764    Examples
765    --------
766    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
767    >>> group_dict = np.load('hallmark_genesets.pkl', 
768    >>>                      allow_pickle = True)
769    >>> 
770    >>> 
771    >>> # The labels in adata.obs we want to learn are 'celltypes'
772    >>> adata = scmkl.format_adata(adata, 'celltypes', 
773    ...                            group_dict)
774    >>> adata
775    AnnData object with n_obs × n_vars = 1000 × 4341
776    obs: 'labels'
777    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
778    'distance_metric', 'train_indices', 'test_indices'
779    """
780    if str == type(adata):
781        adata = ad.read_h5ad(adata)
782
783    if str == type(group_dict):
784        group_dict = np.load(group_dict, allow_pickle=True)
785        
786    if str == type(cell_labels):
787        err_msg = f"{cell_labels} is not in `adata.obs`"
788        assert cell_labels in adata.obs.keys(), err_msg
789        cell_labels = adata.obs[cell_labels].to_numpy()
790    
791    if use_raw:
792        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
793        X = adata.raw.X
794    else:
795        X = adata.X
796
797    adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 
798                         group_dict, adata.obs_names.to_numpy().copy(), 
799                         scale_data, transform_data, split_data, D, remove_features, 
800                         train_ratio, distance_metric, kernel_type, 
801                         random_state, allow_multiclass, class_threshold, 
802                         reduction, tfidf)
803
804    return adata

Function to format an ad.AnnData object to carry all relevant information going forward. adata.obs_names will be retained.

NOTE: Information not needed for running scmkl will be removed.

Parameters
  • adata (ad.AnnData): Object with data for scmkl to be applied to. Only requirment is that .var_names is correct and data matrix is in adata.X or adata.raw.X. A h5ad file can be provided as a str and it will be read in.
  • cell_labels (np.ndarray | str): If type str, the labels for scmkl to learn are captured from adata.obs['cell_labels']. Else, a np.ndarray of cell phenotypes corresponding with the cells in adata.X.
  • group_dict (dict | str): Dictionary containing feature grouping information (i.e. {geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. A pickle file can be provided as a str and it will be read in.
  • obs_names (None | np.ndarray): The cell names corresponding to X to be assigned to output object .obs_names attribute.
  • use_raw (bool): If False, will use adata.X to create new adata. Else, will use adata.raw.X.
  • scale_data (bool): If True, data matrix is log transformed and standard scaled.
  • transform_data (bool): If True, data will be log1p transformed (recommended for counts data). Default is False.
  • split_data (None | np.ndarray): If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train']).
  • D (int): Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to None, will be calculated given number of samples.
  • remove_features (bool): If True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
  • train_ratio (float): Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio training samples may decrease depending on class balance and class_threshold parameter if allow_multiclass = True.
  • distance_metric (str): The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
  • kernel_type (str): The approximated kernel function used to calculate Zs. Must be one of 'Gaussian', 'Laplacian', or 'Cauchy'.
  • random_state (int): Integer random_state used to set the seed for reproducibilty.
  • allow_multiclass (bool): If False, will ensure that cell labels are binary.
  • class_threshold (str | int): Number of samples allowed in the training data for each cell class in the training data. If 'median', the median number of cells per cell class will be the threshold for number of samples per class.
  • reduction (str | None): Choose which dimension reduction technique to perform on features within a group. 'svd' will run sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions.
  • tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
  • adata (ad.AnnData): AnnData with the following attributes and keys:

    adata.X (array_like): Data matrix.

    adata.var_names (array_like): Feature names corresponding to adata.X.

    adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] (array_like): Indices for training data.

    adata.uns['test_indices'] (array_like) Indices for testing data.

    adata.uns['group_dict'] (dict): Grouping information.

    adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state.

    adata.uns['D'] (int): Number of dimensions to scMKL with.

    adata.uns['scale_data'] (bool): Whether or not data is scaled.

    adata.uns['transform_data'] (bool): Whether or not data is log1p transformed.

    adata.uns['distance_metric'] (str): Distance metric as given.

    adata.uns['kernel_type'] (str): Kernel function as given.

    adata.uns['svd'] (bool): Whether to calculate SVD reduction.

    adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.

Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl', 
>>>                      allow_pickle = True)
>>> 
>>> 
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes', 
...                            group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
'distance_metric', 'train_indices', 'test_indices'