scmkl.create_adata
1import numpy as np 2import anndata as ad 3import scipy 4import pandas as pd 5import gc 6import warnings 7 8 9def filter_features(feature_names: np.ndarray, group_dict: dict): 10 """ 11 Function to remove features only in feature names or group_dict. 12 Any features not included in group_dict will be removed from the 13 matrix. Also puts the features in the same relative order (of 14 included features) 15 16 Parameters 17 ---------- 18 feature_names : np.ndarray 19 Numpy array of corresponding feature names. 20 21 group_dict : dict 22 Dictionary containing feature grouping information. 23 Example: {geneset: np.array(gene_1, gene_2, ..., 24 gene_n)} 25 Returns 26 ------- 27 feature_names : np.ndarray 28 Numpy array of corresponding feature names from group_dict. 29 30 group_dict : dict 31 Dictionary containing features overlapping input grouping 32 information and full feature names. 33 """ 34 group_features = set() 35 feature_set = set(feature_names) 36 37 # Store all objects in dictionary in set 38 for group in group_dict.keys(): 39 group_features.update(set(group_dict[group])) 40 41 # Finds intersection between group features and features in data 42 # Converts to nd.array and sorts to preserve order of feature names 43 group_feats = list(feature_set.intersection(set(group_dict[group]))) 44 group_dict[group] = np.sort(np.array(group_feats)) 45 46 # Only keeping groupings that have at least two features 47 group_dict = {group : group_dict[group] for group in group_dict.keys() 48 if len(group_dict[group]) > 1} 49 50 group_features = np.array(list(group_features.intersection(feature_set))) 51 52 return group_features, group_dict 53 54 55def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 56 class_threshold: str | int | None=None, 57 train_ratio: float=0.8): 58 """ 59 Function for calculating the training and testing cell positions 60 for multiclass data sets. 61 62 Parameters 63 ---------- 64 y : array_like 65 Should be an iterable object cooresponding to samples in 66 `ad.AnnData` object. 67 68 seed_obj : np.random._generator.Generator 69 Seed used to randomly sample and split data. 70 71 train_ratio : float 72 Ratio of number of training samples to entire data set. 73 Note: if a threshold is applied, the ratio training samples 74 may decrease depending on class balance and `class_threshold` 75 parameter. 76 77 class_threshold : str | int 78 If is type `int`, classes with more samples than 79 class_threshold will be sampled. If `'median'`, 80 samples will be sampled to the median number of samples per 81 class. 82 83 Returns 84 ------- 85 train_indices : np.ndarray 86 Indices for training samples. 87 88 test_indices : np.ndarray 89 Indices for testing samples. 90 """ 91 uniq_labels = np.unique(y) 92 93 # Finding indices for each cell class 94 class_positions = {class_ : np.where(y == class_)[0] 95 for class_ in uniq_labels} 96 97 # Capturing training indices while maintaining original class proportions 98 train_samples = {class_ : seed_obj.choice(class_positions[class_], 99 int(len(class_positions[class_]) 100 * train_ratio), 101 replace = False) 102 for class_ in class_positions.keys()} 103 104 # Capturing testing indices while maintaining original class proportions 105 test_samples = {class_ : np.setdiff1d(class_positions[class_], 106 train_samples[class_]) 107 for class_ in class_positions.keys()} 108 109 # Applying threshold for samples per class 110 if class_threshold == 'median': 111 cells_per_class = [len(values) for values in train_samples.values()] 112 class_threshold = int(np.median(cells_per_class)) 113 114 if isinstance(class_threshold, int): 115 # Down sample to class_threshold 116 for class_ in train_samples.keys(): 117 if len(train_samples[class_]) > class_threshold: 118 train_samples[class_] = seed_obj.choice(train_samples[class_], 119 class_threshold) 120 121 train_indices = np.array([idx for class_ in train_samples.keys() 122 for idx in train_samples[class_]]) 123 124 test_indices = np.array([idx for class_ in test_samples.keys() 125 for idx in test_samples[class_]]) 126 127 return train_indices, test_indices 128 129 130def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 131 train_ratio: float=0.8, 132 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 133 """ 134 Function to calculate training and testing indices for given 135 dataset. If train indices are given, it will calculate the test 136 indices. If train_indices == None, then it calculates both indices, 137 preserving the ratio of each label in y 138 139 Parameters 140 ---------- 141 y : np.ndarray 142 Numpy array of cell labels. Can have any number of classes 143 for this function. 144 145 train_indices : np.ndarray | None 146 Optional array of pre-determined training indices 147 148 train_ratio : float 149 Decimal value ratio of features in training/testing sets 150 151 seed_obj : np.random._generator.Generator 152 Numpy random state used for random processes. Can be 153 specified for reproducubility or set by default. 154 155 156 Returns 157 ------- 158 train_indices : np.ndarray 159 Array of indices of training cells. 160 161 test_indices : np.ndarray: 162 Array of indices of testing cells. 163 """ 164 # If train indices aren't provided 165 if train_indices is None: 166 167 unique_labels = np.unique(y) 168 train_indices = [] 169 170 for label in unique_labels: 171 172 # Find indices of each unique label 173 label_indices = np.where(y == label)[0] 174 175 # Sample these indices according to train ratio 176 n = int(len(label_indices) * train_ratio) 177 train_label_indices = seed_obj.choice(label_indices, n, 178 replace = False) 179 train_indices.extend(train_label_indices) 180 else: 181 assert len(train_indices) <= len(y), ("More train indices than there " 182 "are samples") 183 184 train_indices = np.array(train_indices) 185 186 # Test indices are the indices not in the train_indices 187 test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 188 assume_unique = True) 189 190 return train_indices, test_indices 191 192 193def get_median_size(adata: ad.AnnData, other_factor: float=1.5): 194 """ 195 Returns the median size of training plus testing samples per cell 196 type. Used to calculate D for multiclass runs. 197 198 Parameters 199 ---------- 200 adata : ad.AnnData 201 An ad.AnnData object with `test_indices` in `.uns` keys and 202 `labels` in `.obs` keys. 203 204 Returns 205 ------- 206 median_size : int 207 The median size of training plus testing samples across cell 208 types. 209 """ 210 n_test = adata.uns['test_indices'].shape[0] 211 212 _, n_cts = np.unique(adata.obs['labels'][adata.uns['train_indices']], 213 return_counts=True) 214 sizes = [n_test + (other_factor*count) for count in n_cts] 215 216 return np.median(sizes) 217 218 219def calculate_d(num_samples : int): 220 """ 221 This function calculates the optimal number of dimensions for 222 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 223 information. 224 225 Parameters 226 ---------- 227 num_samples : int 228 The number of samples in the data set including both training 229 and testing sets. 230 231 Returns 232 ------- 233 d : int 234 The optimal number of dimensions to run scMKL with the given 235 data set. 236 237 Examples 238 -------- 239 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 240 >>> 241 >>> num_cells = raw_counts.shape[0] 242 >>> d = scmkl.calculate_d(num_cells) 243 >>> d 244 161 245 """ 246 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 247 248 return int(np.max([d, 100])) 249 250 251def get_optimal_d(adata: ad.AnnData, D: int | None, allow_multiclass: bool, 252 other_factor: float=1.5): 253 """ 254 Takes the ad.AnnData object and input D. If D is type `int`, D will 255 be return. If D is `None` and `allow_multiclass is False`, 256 `scmkl.calculate_d(adata.shape[0])` will be returned. Else, median 257 size of training and testing will be calculated and 258 `scmkl.calculate_d(median_size)` will be returned. 259 260 Parameters 261 ---------- 262 adata : ad.AnnData 263 An ad.AnnData object with `test_indices` in `.uns` keys and 264 `labels` in `.obs` keys. 265 266 D : int | None 267 The D provided as an `int` or `None` if optimal D should be 268 calculated. 269 270 allow_multiclass : bool 271 Should be `False` if labels are binary. Else, should be `True` 272 indicating there are more than two classes. 273 274 Returns 275 ------- 276 d : int 277 Either the input or calculated optimal d for the experiment. 278 """ 279 if D is not None: 280 assert isinstance(D, int) and D > 0, 'D must be a positive integer.' 281 return D 282 283 if allow_multiclass: 284 size = get_median_size(adata, other_factor) 285 else: 286 size = adata.shape[0] 287 288 return calculate_d(size) 289 290 291def sort_samples(train_indices, test_indices): 292 """ 293 Ensures that samples in adata obj are all training, then all 294 testing. 295 296 Parameters 297 ---------- 298 train_indices : np.ndarray 299 Indices in ad.AnnData object for training. 300 301 test_indices : np.ndarray 302 Indices in ad.AnnData object for testing. 303 304 Returns 305 ------- 306 sort_idc : np.ndarray 307 Ordered indices that will sort ad.AnnData object as all 308 training samples, then all testing. 309 310 train_indices : np.ndarray 311 The new training indices given the new index order, `sort_idc`. 312 313 test_indices : np.ndarray 314 The new testing indices given the new index order, `sort_idc`. 315 """ 316 sort_idc = np.concatenate([train_indices, test_indices]) 317 318 train_indices = np.arange(0, train_indices.shape[0]) 319 test_indices = np.arange(train_indices.shape[0], 320 train_indices.shape[0] + test_indices.shape[0]) 321 322 return sort_idc, train_indices, test_indices 323 324 325def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 326 feature_names: np.ndarray, cell_labels: np.ndarray, 327 group_dict: dict, obs_names: None | np.ndarray=None, 328 scale_data: bool=True, transform_data: bool=False, 329 split_data: np.ndarray | None=None, D: int | None=None, 330 remove_features: bool=True, train_ratio: float=0.8, 331 distance_metric: str='euclidean', kernel_type: str='Gaussian', 332 random_state: int=1, allow_multiclass: bool = False, 333 class_threshold: str | int | None = None, 334 reduction: str | None = None, tfidf: bool = False, 335 other_factor: float=1.5): 336 """ 337 Function to create an AnnData object to carry all relevant 338 information going forward. 339 340 Parameters 341 ---------- 342 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 343 A data matrix of cells by features (sparse array 344 recommended for large datasets). 345 346 feature_names : np.ndarray 347 Array of feature names corresponding with the features 348 in `X`. 349 350 cell_labels : np.ndarray 351 A numpy array of cell phenotypes corresponding with 352 the cells in `X`. 353 354 group_dict : dict 355 Dictionary containing feature grouping information (i.e. 356 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 357 np.array([...]), ...}`. 358 359 obs_names : None | np.ndarray 360 The cell names corresponding to `X` to be assigned to output 361 object `.obs_names` attribute. 362 363 scale_data : bool 364 If `True`, data matrix is log transformed and standard 365 scaled. Default is `True`. 366 367 transform_data : bool 368 If `True`, data will be log1p transformed (recommended for 369 counts data). Default is `False`. 370 371 split_data : None | np.ndarray 372 If `None`, data will be split stratified by cell labels. 373 Else, is an array of precalculated train/test split 374 corresponding to samples. Can include labels for entire 375 dataset to benchmark performance or for only training 376 data to classify unknown cell types (i.e. `np.array(['train', 377 'test', ..., 'train'])`. 378 379 D : int 380 Number of Random Fourier Features used to calculate Z. 381 Should be a positive integer. Higher values of D will 382 increase classification accuracy at the cost of computation 383 time. If set to `None`, will be calculated given number of 384 samples. 385 386 remove_features : bool 387 If `True`, will remove features from `X` and `feature_names` 388 not in `group_dict` and remove features from groupings not in 389 `feature_names`. 390 391 train_ratio : float 392 Ratio of number of training samples to entire data set. Note: 393 if a threshold is applied, the ratio training samples may 394 decrease depending on class balance and `class_threshold` 395 parameter if `allow_multiclass = True`. 396 397 distance_metric : str 398 The pairwise distance metric used to estimate sigma. Must 399 be one of the options used in `scipy.spatial.distance.cdist`. 400 401 kernel_type : str 402 The approximated kernel function used to calculate Zs. 403 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 404 405 random_state : int 406 Integer random_state used to set the seed for 407 reproducibilty. 408 409 allow_multiclass : bool 410 If `False`, will ensure that cell labels are binary. 411 412 class_threshold : str | int 413 Number of samples allowed in the training data for each cell 414 class in the training data. If `'median'`, the median number 415 of cells per cell class will be the threshold for number of 416 samples per class. 417 418 reduction: str | None 419 Choose which dimension reduction technique to perform on 420 features within a group. 'svd' will run 421 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 422 by an array of 1s down to 50 dimensions. 'pca' will replace 423 each group values with 50 PCs from principal component 424 analysis. 425 426 tfidf: bool 427 Whether to calculate TFIDF transformation on peaks within 428 groupings. 429 430 Returns 431 ------- 432 adata : ad.AnnData 433 AnnData with the following attributes and keys: 434 435 `adata.X` (array_like): 436 Data matrix. 437 438 `adata.var_names` (array_like): 439 Feature names corresponding to `adata.X`. 440 441 `adata.obs['labels']` (array_like): 442 cell classes/phenotypes from `cell_labels`. 443 444 `adata.uns['train_indices']` (array_like): 445 Indices for training data. 446 447 `adata.uns['test_indices']` (array_like) 448 Indices for testing data. 449 450 `adata.uns['group_dict']` (dict): 451 Grouping information. 452 453 `adata.uns['seed_obj']` (np.random._generator.Generator): 454 Seed object with seed equal to 100 * `random_state`. 455 456 `adata.uns['D']` (int): 457 Number of dimensions to scMKL with. 458 459 `adata.uns['scale_data']` (bool): 460 Whether or not data is scaled. 461 462 `adata.uns['transform_data']` (bool): 463 Whether or not data is log1p transformed. 464 465 `adata.uns['distance_metric']` (str): 466 Distance metric as given. 467 468 `adata.uns['kernel_type']` (str): 469 Kernel function as given. 470 471 `adata.uns['svd']` (bool): 472 Whether to calculate SVD reduction. 473 474 `adata.uns['tfidf']` (bool): 475 Whether to calculate TF-IDF per grouping. 476 477 Examples 478 -------- 479 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 480 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 481 >>> group_dict = np.load('hallmark_genesets.pkl', 482 >>> allow_pickle = True) 483 >>> 484 >>> adata = scmkl.create_adata(X = data_mat, 485 ... feature_names = gene_names, 486 ... group_dict = group_dict) 487 >>> adata 488 AnnData object with n_obs × n_vars = 1000 × 4341 489 obs: 'labels' 490 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 491 'distance_metric', 'train_indices', 'test_indices' 492 """ 493 494 assert X.shape[1] == len(feature_names), ("Different number of features " 495 "in X than feature names.") 496 497 if not allow_multiclass: 498 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 499 "2 classes.") 500 501 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 502 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 503 "implemented. Gaussian, " 504 "Laplacian, and Cauchy " 505 "are the acceptable " 506 "types.") 507 508 # Create adata object and add column names 509 adata = ad.AnnData(X) 510 adata.var_names = feature_names 511 512 if isinstance(obs_names, (np.ndarray)): 513 adata.obs_names = obs_names 514 515 filtered_feature_names, group_dict = filter_features(feature_names, 516 group_dict) 517 518 # Ensuring that there are common features between feature_names and groups 519 overlap_err = ("No common features between feature names and grouping " 520 "dict. Check grouping.") 521 assert len(filtered_feature_names) > 0, overlap_err 522 523 if remove_features: 524 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 525 adata = adata[:, filtered_feature_names] 526 527 gc.collect() 528 529 # Add metadata to adata object 530 adata.uns['group_dict'] = group_dict 531 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 532 adata.uns['scale_data'] = scale_data 533 adata.uns['transform_data'] = transform_data 534 adata.uns['kernel_type'] = kernel_type 535 adata.uns['distance_metric'] = distance_metric 536 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 537 adata.uns['tfidf'] = tfidf 538 539 if (split_data is None): 540 assert X.shape[0] == len(cell_labels), ("Different number of cells " 541 "than labels") 542 adata.obs['labels'] = cell_labels 543 544 if (allow_multiclass == False): 545 split = binary_split(cell_labels, 546 seed_obj = adata.uns['seed_obj'], 547 train_ratio = train_ratio) 548 train_indices, test_indices = split 549 550 elif (allow_multiclass == True): 551 split = multi_class_split(cell_labels, 552 seed_obj = adata.uns['seed_obj'], 553 class_threshold = class_threshold, 554 train_ratio = train_ratio) 555 train_indices, test_indices = split 556 557 adata.uns['labeled_test'] = True 558 559 else: 560 sd_err_message = "`split_data` argument must be of type np.ndarray" 561 assert isinstance(split_data, np.ndarray), sd_err_message 562 x_eq_labs = X.shape[0] == len(cell_labels) 563 train_eq_labs = X.shape[0] == len(cell_labels) 564 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 565 "or only for training cells") 566 567 train_indices = np.where(split_data == 'train')[0] 568 test_indices = np.where(split_data == 'test')[0] 569 570 if len(cell_labels) == len(train_indices): 571 572 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 573 padded_cell_labels[train_indices] = cell_labels 574 padded_cell_labels[test_indices] = 'padded_test_label' 575 576 adata.obs['labels'] = padded_cell_labels 577 adata.uns['labeled_test'] = False 578 579 elif len(cell_labels) == len(split_data): 580 adata.obs['labels'] = cell_labels 581 adata.uns['labeled_test'] = True 582 583 # Ensuring all train samples are first in adata object followed by test 584 sort_idx, train_indices, test_indices = sort_samples(train_indices, 585 test_indices) 586 587 adata = adata[sort_idx] 588 589 if not isinstance(obs_names, (np.ndarray)): 590 adata.obs = adata.obs.reset_index(drop=True) 591 adata.obs.index = adata.obs.index.astype('O') 592 593 adata.uns['train_indices'] = train_indices 594 adata.uns['test_indices'] = test_indices 595 596 adata.uns['D'] = get_optimal_d(adata, D, allow_multiclass, other_factor) 597 598 if not scale_data: 599 print("WARNING: Data will not be scaled. " 600 "To change this behavior, set scale_data to True") 601 if not transform_data: 602 print("WARNING: Data will not be transformed." 603 "To change this behavior, set transform_data to True") 604 605 return adata 606 607 608def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 609 group_dict: dict | str, use_raw: bool=False, 610 scale_data: bool=True, transform_data: bool=False, 611 split_data: np.ndarray | None=None, D: int | None=None, 612 remove_features: bool=True, train_ratio: float=0.8, 613 distance_metric: str='euclidean', kernel_type: str='Gaussian', 614 random_state: int=1, allow_multiclass: bool = False, 615 class_threshold: str | int | None=None, 616 reduction: str | None = None, tfidf: bool = False): 617 """ 618 Function to format an `ad.AnnData` object to carry all relevant 619 information going forward. `adata.obs_names` will be retained. 620 621 **NOTE: Information not needed for running `scmkl` will be 622 removed.** 623 624 Parameters 625 ---------- 626 adata : ad.AnnData 627 Object with data for `scmkl` to be applied to. Only requirment 628 is that `.var_names` is correct and data matrix is in `adata.X` 629 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 630 will be read in. 631 632 cell_labels : np.ndarray | str 633 If type `str`, the labels for `scmkl` to learn are captured 634 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 635 phenotypes corresponding with the cells in `adata.X`. 636 637 group_dict : dict | str 638 Dictionary containing feature grouping information (i.e. 639 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 640 np.array([...]), ...}`. A pickle file can be provided as a `str` 641 and it will be read in. 642 643 obs_names : None | np.ndarray 644 The cell names corresponding to `X` to be assigned to output 645 object `.obs_names` attribute. 646 647 use_raw : bool 648 If `False`, will use `adata.X` to create new `adata`. Else, 649 will use `adata.raw.X`. 650 651 scale_data : bool 652 If `True`, data matrix is log transformed and standard 653 scaled. 654 655 transform_data : bool 656 If `True`, data will be log1p transformed (recommended for 657 counts data). Default is `False`. 658 659 split_data : None | np.ndarray 660 If `None`, data will be split stratified by cell labels. 661 Else, is an array of precalculated train/test split 662 corresponding to samples. Can include labels for entire 663 dataset to benchmark performance or for only training 664 data to classify unknown cell types (i.e. `np.array(['train', 665 'test', ..., 'train'])`. 666 667 D : int 668 Number of Random Fourier Features used to calculate Z. 669 Should be a positive integer. Higher values of D will 670 increase classification accuracy at the cost of computation 671 time. If set to `None`, will be calculated given number of 672 samples. 673 674 remove_features : bool 675 If `True`, will remove features from `X` and `feature_names` 676 not in `group_dict` and remove features from groupings not in 677 `feature_names`. 678 679 train_ratio : float 680 Ratio of number of training samples to entire data set. Note: 681 if a threshold is applied, the ratio training samples may 682 decrease depending on class balance and `class_threshold` 683 parameter if `allow_multiclass = True`. 684 685 distance_metric : str 686 The pairwise distance metric used to estimate sigma. Must 687 be one of the options used in `scipy.spatial.distance.cdist`. 688 689 kernel_type : str 690 The approximated kernel function used to calculate Zs. 691 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 692 693 random_state : int 694 Integer random_state used to set the seed for 695 reproducibilty. 696 697 allow_multiclass : bool 698 If `False`, will ensure that cell labels are binary. 699 700 class_threshold : str | int 701 Number of samples allowed in the training data for each cell 702 class in the training data. If `'median'`, the median number 703 of cells per cell class will be the threshold for number of 704 samples per class. 705 706 reduction: str | None 707 Choose which dimension reduction technique to perform on 708 features within a group. 'svd' will run 709 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 710 by an array of 1s down to 50 dimensions. 711 712 tfidf: bool 713 Whether to calculate TFIDF transformation on peaks within 714 groupings. 715 716 Returns 717 ------- 718 adata : ad.AnnData 719 AnnData with the following attributes and keys: 720 721 `adata.X` (array_like): 722 Data matrix. 723 724 `adata.var_names` (array_like): 725 Feature names corresponding to `adata.X`. 726 727 `adata.obs['labels']` (array_like): 728 cell classes/phenotypes from `cell_labels`. 729 730 `adata.uns['train_indices']` (array_like): 731 Indices for training data. 732 733 `adata.uns['test_indices']` (array_like) 734 Indices for testing data. 735 736 `adata.uns['group_dict']` (dict): 737 Grouping information. 738 739 `adata.uns['seed_obj']` (np.random._generator.Generator): 740 Seed object with seed equal to 100 * `random_state`. 741 742 `adata.uns['D']` (int): 743 Number of dimensions to scMKL with. 744 745 `adata.uns['scale_data']` (bool): 746 Whether or not data is scaled. 747 748 `adata.uns['transform_data']` (bool): 749 Whether or not data is log1p transformed. 750 751 `adata.uns['distance_metric']` (str): 752 Distance metric as given. 753 754 `adata.uns['kernel_type']` (str): 755 Kernel function as given. 756 757 `adata.uns['svd']` (bool): 758 Whether to calculate SVD reduction. 759 760 `adata.uns['tfidf']` (bool): 761 Whether to calculate TF-IDF per grouping. 762 763 Examples 764 -------- 765 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 766 >>> group_dict = np.load('hallmark_genesets.pkl', 767 >>> allow_pickle = True) 768 >>> 769 >>> 770 >>> # The labels in adata.obs we want to learn are 'celltypes' 771 >>> adata = scmkl.format_adata(adata, 'celltypes', 772 ... group_dict) 773 >>> adata 774 AnnData object with n_obs × n_vars = 1000 × 4341 775 obs: 'labels' 776 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 777 'distance_metric', 'train_indices', 'test_indices' 778 """ 779 if str == type(adata): 780 adata = ad.read_h5ad(adata) 781 782 if str == type(group_dict): 783 group_dict = np.load(group_dict, allow_pickle=True) 784 785 if str == type(cell_labels): 786 err_msg = f"{cell_labels} is not in `adata.obs`" 787 assert cell_labels in adata.obs.keys(), err_msg 788 cell_labels = adata.obs[cell_labels].to_numpy() 789 790 if use_raw: 791 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 792 X = adata.raw.X 793 else: 794 X = adata.X 795 796 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 797 group_dict, adata.obs_names.to_numpy().copy(), 798 scale_data, transform_data, split_data, D, remove_features, 799 train_ratio, distance_metric, kernel_type, 800 random_state, allow_multiclass, class_threshold, 801 reduction, tfidf) 802 803 return adata
10def filter_features(feature_names: np.ndarray, group_dict: dict): 11 """ 12 Function to remove features only in feature names or group_dict. 13 Any features not included in group_dict will be removed from the 14 matrix. Also puts the features in the same relative order (of 15 included features) 16 17 Parameters 18 ---------- 19 feature_names : np.ndarray 20 Numpy array of corresponding feature names. 21 22 group_dict : dict 23 Dictionary containing feature grouping information. 24 Example: {geneset: np.array(gene_1, gene_2, ..., 25 gene_n)} 26 Returns 27 ------- 28 feature_names : np.ndarray 29 Numpy array of corresponding feature names from group_dict. 30 31 group_dict : dict 32 Dictionary containing features overlapping input grouping 33 information and full feature names. 34 """ 35 group_features = set() 36 feature_set = set(feature_names) 37 38 # Store all objects in dictionary in set 39 for group in group_dict.keys(): 40 group_features.update(set(group_dict[group])) 41 42 # Finds intersection between group features and features in data 43 # Converts to nd.array and sorts to preserve order of feature names 44 group_feats = list(feature_set.intersection(set(group_dict[group]))) 45 group_dict[group] = np.sort(np.array(group_feats)) 46 47 # Only keeping groupings that have at least two features 48 group_dict = {group : group_dict[group] for group in group_dict.keys() 49 if len(group_dict[group]) > 1} 50 51 group_features = np.array(list(group_features.intersection(feature_set))) 52 53 return group_features, group_dict
Function to remove features only in feature names or group_dict. Any features not included in group_dict will be removed from the matrix. Also puts the features in the same relative order (of included features)
Parameters
- feature_names (np.ndarray): Numpy array of corresponding feature names.
- group_dict (dict): Dictionary containing feature grouping information. Example: {geneset: np.array(gene_1, gene_2, ..., gene_n)}
Returns
- feature_names (np.ndarray): Numpy array of corresponding feature names from group_dict.
- group_dict (dict): Dictionary containing features overlapping input grouping information and full feature names.
56def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 57 class_threshold: str | int | None=None, 58 train_ratio: float=0.8): 59 """ 60 Function for calculating the training and testing cell positions 61 for multiclass data sets. 62 63 Parameters 64 ---------- 65 y : array_like 66 Should be an iterable object cooresponding to samples in 67 `ad.AnnData` object. 68 69 seed_obj : np.random._generator.Generator 70 Seed used to randomly sample and split data. 71 72 train_ratio : float 73 Ratio of number of training samples to entire data set. 74 Note: if a threshold is applied, the ratio training samples 75 may decrease depending on class balance and `class_threshold` 76 parameter. 77 78 class_threshold : str | int 79 If is type `int`, classes with more samples than 80 class_threshold will be sampled. If `'median'`, 81 samples will be sampled to the median number of samples per 82 class. 83 84 Returns 85 ------- 86 train_indices : np.ndarray 87 Indices for training samples. 88 89 test_indices : np.ndarray 90 Indices for testing samples. 91 """ 92 uniq_labels = np.unique(y) 93 94 # Finding indices for each cell class 95 class_positions = {class_ : np.where(y == class_)[0] 96 for class_ in uniq_labels} 97 98 # Capturing training indices while maintaining original class proportions 99 train_samples = {class_ : seed_obj.choice(class_positions[class_], 100 int(len(class_positions[class_]) 101 * train_ratio), 102 replace = False) 103 for class_ in class_positions.keys()} 104 105 # Capturing testing indices while maintaining original class proportions 106 test_samples = {class_ : np.setdiff1d(class_positions[class_], 107 train_samples[class_]) 108 for class_ in class_positions.keys()} 109 110 # Applying threshold for samples per class 111 if class_threshold == 'median': 112 cells_per_class = [len(values) for values in train_samples.values()] 113 class_threshold = int(np.median(cells_per_class)) 114 115 if isinstance(class_threshold, int): 116 # Down sample to class_threshold 117 for class_ in train_samples.keys(): 118 if len(train_samples[class_]) > class_threshold: 119 train_samples[class_] = seed_obj.choice(train_samples[class_], 120 class_threshold) 121 122 train_indices = np.array([idx for class_ in train_samples.keys() 123 for idx in train_samples[class_]]) 124 125 test_indices = np.array([idx for class_ in test_samples.keys() 126 for idx in test_samples[class_]]) 127 128 return train_indices, test_indices
Function for calculating the training and testing cell positions for multiclass data sets.
Parameters
- y (array_like):
Should be an iterable object cooresponding to samples in
ad.AnnDataobject. - seed_obj (np.random._generator.Generator): Seed used to randomly sample and split data.
- train_ratio (float):
Ratio of number of training samples to entire data set.
Note: if a threshold is applied, the ratio training samples
may decrease depending on class balance and
class_thresholdparameter. - class_threshold (str | int):
If is type
int, classes with more samples than class_threshold will be sampled. If'median', samples will be sampled to the median number of samples per class.
Returns
- train_indices (np.ndarray): Indices for training samples.
- test_indices (np.ndarray): Indices for testing samples.
131def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 132 train_ratio: float=0.8, 133 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 134 """ 135 Function to calculate training and testing indices for given 136 dataset. If train indices are given, it will calculate the test 137 indices. If train_indices == None, then it calculates both indices, 138 preserving the ratio of each label in y 139 140 Parameters 141 ---------- 142 y : np.ndarray 143 Numpy array of cell labels. Can have any number of classes 144 for this function. 145 146 train_indices : np.ndarray | None 147 Optional array of pre-determined training indices 148 149 train_ratio : float 150 Decimal value ratio of features in training/testing sets 151 152 seed_obj : np.random._generator.Generator 153 Numpy random state used for random processes. Can be 154 specified for reproducubility or set by default. 155 156 157 Returns 158 ------- 159 train_indices : np.ndarray 160 Array of indices of training cells. 161 162 test_indices : np.ndarray: 163 Array of indices of testing cells. 164 """ 165 # If train indices aren't provided 166 if train_indices is None: 167 168 unique_labels = np.unique(y) 169 train_indices = [] 170 171 for label in unique_labels: 172 173 # Find indices of each unique label 174 label_indices = np.where(y == label)[0] 175 176 # Sample these indices according to train ratio 177 n = int(len(label_indices) * train_ratio) 178 train_label_indices = seed_obj.choice(label_indices, n, 179 replace = False) 180 train_indices.extend(train_label_indices) 181 else: 182 assert len(train_indices) <= len(y), ("More train indices than there " 183 "are samples") 184 185 train_indices = np.array(train_indices) 186 187 # Test indices are the indices not in the train_indices 188 test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 189 assume_unique = True) 190 191 return train_indices, test_indices
Function to calculate training and testing indices for given dataset. If train indices are given, it will calculate the test indices. If train_indices == None, then it calculates both indices, preserving the ratio of each label in y
Parameters
- y (np.ndarray): Numpy array of cell labels. Can have any number of classes for this function.
- train_indices (np.ndarray | None): Optional array of pre-determined training indices
- train_ratio (float): Decimal value ratio of features in training/testing sets
- seed_obj (np.random._generator.Generator): Numpy random state used for random processes. Can be specified for reproducubility or set by default.
Returns
- train_indices (np.ndarray): Array of indices of training cells.
- test_indices (np.ndarray:): Array of indices of testing cells.
194def get_median_size(adata: ad.AnnData, other_factor: float=1.5): 195 """ 196 Returns the median size of training plus testing samples per cell 197 type. Used to calculate D for multiclass runs. 198 199 Parameters 200 ---------- 201 adata : ad.AnnData 202 An ad.AnnData object with `test_indices` in `.uns` keys and 203 `labels` in `.obs` keys. 204 205 Returns 206 ------- 207 median_size : int 208 The median size of training plus testing samples across cell 209 types. 210 """ 211 n_test = adata.uns['test_indices'].shape[0] 212 213 _, n_cts = np.unique(adata.obs['labels'][adata.uns['train_indices']], 214 return_counts=True) 215 sizes = [n_test + (other_factor*count) for count in n_cts] 216 217 return np.median(sizes)
Returns the median size of training plus testing samples per cell type. Used to calculate D for multiclass runs.
Parameters
- adata (ad.AnnData):
An ad.AnnData object with
test_indicesin.unskeys andlabelsin.obskeys.
Returns
- median_size (int): The median size of training plus testing samples across cell types.
220def calculate_d(num_samples : int): 221 """ 222 This function calculates the optimal number of dimensions for 223 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 224 information. 225 226 Parameters 227 ---------- 228 num_samples : int 229 The number of samples in the data set including both training 230 and testing sets. 231 232 Returns 233 ------- 234 d : int 235 The optimal number of dimensions to run scMKL with the given 236 data set. 237 238 Examples 239 -------- 240 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 241 >>> 242 >>> num_cells = raw_counts.shape[0] 243 >>> d = scmkl.calculate_d(num_cells) 244 >>> d 245 161 246 """ 247 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 248 249 return int(np.max([d, 100]))
This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.
Parameters
- num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
- d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
252def get_optimal_d(adata: ad.AnnData, D: int | None, allow_multiclass: bool, 253 other_factor: float=1.5): 254 """ 255 Takes the ad.AnnData object and input D. If D is type `int`, D will 256 be return. If D is `None` and `allow_multiclass is False`, 257 `scmkl.calculate_d(adata.shape[0])` will be returned. Else, median 258 size of training and testing will be calculated and 259 `scmkl.calculate_d(median_size)` will be returned. 260 261 Parameters 262 ---------- 263 adata : ad.AnnData 264 An ad.AnnData object with `test_indices` in `.uns` keys and 265 `labels` in `.obs` keys. 266 267 D : int | None 268 The D provided as an `int` or `None` if optimal D should be 269 calculated. 270 271 allow_multiclass : bool 272 Should be `False` if labels are binary. Else, should be `True` 273 indicating there are more than two classes. 274 275 Returns 276 ------- 277 d : int 278 Either the input or calculated optimal d for the experiment. 279 """ 280 if D is not None: 281 assert isinstance(D, int) and D > 0, 'D must be a positive integer.' 282 return D 283 284 if allow_multiclass: 285 size = get_median_size(adata, other_factor) 286 else: 287 size = adata.shape[0] 288 289 return calculate_d(size)
Takes the ad.AnnData object and input D. If D is type int, D will
be return. If D is None and allow_multiclass is False,
scmkl.calculate_d(adata.shape[0]) will be returned. Else, median
size of training and testing will be calculated and
scmkl.calculate_d(median_size) will be returned.
Parameters
- adata (ad.AnnData):
An ad.AnnData object with
test_indicesin.unskeys andlabelsin.obskeys. - D (int | None):
The D provided as an
intorNoneif optimal D should be calculated. - allow_multiclass (bool):
Should be
Falseif labels are binary. Else, should beTrueindicating there are more than two classes.
Returns
- d (int): Either the input or calculated optimal d for the experiment.
292def sort_samples(train_indices, test_indices): 293 """ 294 Ensures that samples in adata obj are all training, then all 295 testing. 296 297 Parameters 298 ---------- 299 train_indices : np.ndarray 300 Indices in ad.AnnData object for training. 301 302 test_indices : np.ndarray 303 Indices in ad.AnnData object for testing. 304 305 Returns 306 ------- 307 sort_idc : np.ndarray 308 Ordered indices that will sort ad.AnnData object as all 309 training samples, then all testing. 310 311 train_indices : np.ndarray 312 The new training indices given the new index order, `sort_idc`. 313 314 test_indices : np.ndarray 315 The new testing indices given the new index order, `sort_idc`. 316 """ 317 sort_idc = np.concatenate([train_indices, test_indices]) 318 319 train_indices = np.arange(0, train_indices.shape[0]) 320 test_indices = np.arange(train_indices.shape[0], 321 train_indices.shape[0] + test_indices.shape[0]) 322 323 return sort_idc, train_indices, test_indices
Ensures that samples in adata obj are all training, then all testing.
Parameters
- train_indices (np.ndarray): Indices in ad.AnnData object for training.
- test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
- sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
- train_indices (np.ndarray):
The new training indices given the new index order,
sort_idc. - test_indices (np.ndarray):
The new testing indices given the new index order,
sort_idc.
326def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 327 feature_names: np.ndarray, cell_labels: np.ndarray, 328 group_dict: dict, obs_names: None | np.ndarray=None, 329 scale_data: bool=True, transform_data: bool=False, 330 split_data: np.ndarray | None=None, D: int | None=None, 331 remove_features: bool=True, train_ratio: float=0.8, 332 distance_metric: str='euclidean', kernel_type: str='Gaussian', 333 random_state: int=1, allow_multiclass: bool = False, 334 class_threshold: str | int | None = None, 335 reduction: str | None = None, tfidf: bool = False, 336 other_factor: float=1.5): 337 """ 338 Function to create an AnnData object to carry all relevant 339 information going forward. 340 341 Parameters 342 ---------- 343 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 344 A data matrix of cells by features (sparse array 345 recommended for large datasets). 346 347 feature_names : np.ndarray 348 Array of feature names corresponding with the features 349 in `X`. 350 351 cell_labels : np.ndarray 352 A numpy array of cell phenotypes corresponding with 353 the cells in `X`. 354 355 group_dict : dict 356 Dictionary containing feature grouping information (i.e. 357 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 358 np.array([...]), ...}`. 359 360 obs_names : None | np.ndarray 361 The cell names corresponding to `X` to be assigned to output 362 object `.obs_names` attribute. 363 364 scale_data : bool 365 If `True`, data matrix is log transformed and standard 366 scaled. Default is `True`. 367 368 transform_data : bool 369 If `True`, data will be log1p transformed (recommended for 370 counts data). Default is `False`. 371 372 split_data : None | np.ndarray 373 If `None`, data will be split stratified by cell labels. 374 Else, is an array of precalculated train/test split 375 corresponding to samples. Can include labels for entire 376 dataset to benchmark performance or for only training 377 data to classify unknown cell types (i.e. `np.array(['train', 378 'test', ..., 'train'])`. 379 380 D : int 381 Number of Random Fourier Features used to calculate Z. 382 Should be a positive integer. Higher values of D will 383 increase classification accuracy at the cost of computation 384 time. If set to `None`, will be calculated given number of 385 samples. 386 387 remove_features : bool 388 If `True`, will remove features from `X` and `feature_names` 389 not in `group_dict` and remove features from groupings not in 390 `feature_names`. 391 392 train_ratio : float 393 Ratio of number of training samples to entire data set. Note: 394 if a threshold is applied, the ratio training samples may 395 decrease depending on class balance and `class_threshold` 396 parameter if `allow_multiclass = True`. 397 398 distance_metric : str 399 The pairwise distance metric used to estimate sigma. Must 400 be one of the options used in `scipy.spatial.distance.cdist`. 401 402 kernel_type : str 403 The approximated kernel function used to calculate Zs. 404 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 405 406 random_state : int 407 Integer random_state used to set the seed for 408 reproducibilty. 409 410 allow_multiclass : bool 411 If `False`, will ensure that cell labels are binary. 412 413 class_threshold : str | int 414 Number of samples allowed in the training data for each cell 415 class in the training data. If `'median'`, the median number 416 of cells per cell class will be the threshold for number of 417 samples per class. 418 419 reduction: str | None 420 Choose which dimension reduction technique to perform on 421 features within a group. 'svd' will run 422 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 423 by an array of 1s down to 50 dimensions. 'pca' will replace 424 each group values with 50 PCs from principal component 425 analysis. 426 427 tfidf: bool 428 Whether to calculate TFIDF transformation on peaks within 429 groupings. 430 431 Returns 432 ------- 433 adata : ad.AnnData 434 AnnData with the following attributes and keys: 435 436 `adata.X` (array_like): 437 Data matrix. 438 439 `adata.var_names` (array_like): 440 Feature names corresponding to `adata.X`. 441 442 `adata.obs['labels']` (array_like): 443 cell classes/phenotypes from `cell_labels`. 444 445 `adata.uns['train_indices']` (array_like): 446 Indices for training data. 447 448 `adata.uns['test_indices']` (array_like) 449 Indices for testing data. 450 451 `adata.uns['group_dict']` (dict): 452 Grouping information. 453 454 `adata.uns['seed_obj']` (np.random._generator.Generator): 455 Seed object with seed equal to 100 * `random_state`. 456 457 `adata.uns['D']` (int): 458 Number of dimensions to scMKL with. 459 460 `adata.uns['scale_data']` (bool): 461 Whether or not data is scaled. 462 463 `adata.uns['transform_data']` (bool): 464 Whether or not data is log1p transformed. 465 466 `adata.uns['distance_metric']` (str): 467 Distance metric as given. 468 469 `adata.uns['kernel_type']` (str): 470 Kernel function as given. 471 472 `adata.uns['svd']` (bool): 473 Whether to calculate SVD reduction. 474 475 `adata.uns['tfidf']` (bool): 476 Whether to calculate TF-IDF per grouping. 477 478 Examples 479 -------- 480 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 481 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 482 >>> group_dict = np.load('hallmark_genesets.pkl', 483 >>> allow_pickle = True) 484 >>> 485 >>> adata = scmkl.create_adata(X = data_mat, 486 ... feature_names = gene_names, 487 ... group_dict = group_dict) 488 >>> adata 489 AnnData object with n_obs × n_vars = 1000 × 4341 490 obs: 'labels' 491 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 492 'distance_metric', 'train_indices', 'test_indices' 493 """ 494 495 assert X.shape[1] == len(feature_names), ("Different number of features " 496 "in X than feature names.") 497 498 if not allow_multiclass: 499 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 500 "2 classes.") 501 502 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 503 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 504 "implemented. Gaussian, " 505 "Laplacian, and Cauchy " 506 "are the acceptable " 507 "types.") 508 509 # Create adata object and add column names 510 adata = ad.AnnData(X) 511 adata.var_names = feature_names 512 513 if isinstance(obs_names, (np.ndarray)): 514 adata.obs_names = obs_names 515 516 filtered_feature_names, group_dict = filter_features(feature_names, 517 group_dict) 518 519 # Ensuring that there are common features between feature_names and groups 520 overlap_err = ("No common features between feature names and grouping " 521 "dict. Check grouping.") 522 assert len(filtered_feature_names) > 0, overlap_err 523 524 if remove_features: 525 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 526 adata = adata[:, filtered_feature_names] 527 528 gc.collect() 529 530 # Add metadata to adata object 531 adata.uns['group_dict'] = group_dict 532 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 533 adata.uns['scale_data'] = scale_data 534 adata.uns['transform_data'] = transform_data 535 adata.uns['kernel_type'] = kernel_type 536 adata.uns['distance_metric'] = distance_metric 537 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 538 adata.uns['tfidf'] = tfidf 539 540 if (split_data is None): 541 assert X.shape[0] == len(cell_labels), ("Different number of cells " 542 "than labels") 543 adata.obs['labels'] = cell_labels 544 545 if (allow_multiclass == False): 546 split = binary_split(cell_labels, 547 seed_obj = adata.uns['seed_obj'], 548 train_ratio = train_ratio) 549 train_indices, test_indices = split 550 551 elif (allow_multiclass == True): 552 split = multi_class_split(cell_labels, 553 seed_obj = adata.uns['seed_obj'], 554 class_threshold = class_threshold, 555 train_ratio = train_ratio) 556 train_indices, test_indices = split 557 558 adata.uns['labeled_test'] = True 559 560 else: 561 sd_err_message = "`split_data` argument must be of type np.ndarray" 562 assert isinstance(split_data, np.ndarray), sd_err_message 563 x_eq_labs = X.shape[0] == len(cell_labels) 564 train_eq_labs = X.shape[0] == len(cell_labels) 565 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 566 "or only for training cells") 567 568 train_indices = np.where(split_data == 'train')[0] 569 test_indices = np.where(split_data == 'test')[0] 570 571 if len(cell_labels) == len(train_indices): 572 573 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 574 padded_cell_labels[train_indices] = cell_labels 575 padded_cell_labels[test_indices] = 'padded_test_label' 576 577 adata.obs['labels'] = padded_cell_labels 578 adata.uns['labeled_test'] = False 579 580 elif len(cell_labels) == len(split_data): 581 adata.obs['labels'] = cell_labels 582 adata.uns['labeled_test'] = True 583 584 # Ensuring all train samples are first in adata object followed by test 585 sort_idx, train_indices, test_indices = sort_samples(train_indices, 586 test_indices) 587 588 adata = adata[sort_idx] 589 590 if not isinstance(obs_names, (np.ndarray)): 591 adata.obs = adata.obs.reset_index(drop=True) 592 adata.obs.index = adata.obs.index.astype('O') 593 594 adata.uns['train_indices'] = train_indices 595 adata.uns['test_indices'] = test_indices 596 597 adata.uns['D'] = get_optimal_d(adata, D, allow_multiclass, other_factor) 598 599 if not scale_data: 600 print("WARNING: Data will not be scaled. " 601 "To change this behavior, set scale_data to True") 602 if not transform_data: 603 print("WARNING: Data will not be transformed." 604 "To change this behavior, set transform_data to True") 605 606 return adata
Function to create an AnnData object to carry all relevant information going forward.
Parameters
- X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
- feature_names (np.ndarray):
Array of feature names corresponding with the features
in
X. - cell_labels (np.ndarray):
A numpy array of cell phenotypes corresponding with
the cells in
X. - group_dict (dict):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. - obs_names (None | np.ndarray):
The cell names corresponding to
Xto be assigned to output object.obs_namesattribute. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. Default isTrue. - transform_data (bool):
If
True, data will be log1p transformed (recommended for counts data). Default isFalse. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train']). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features fromXandfeature_namesnot ingroup_dictand remove features from groupings not infeature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_thresholdparameter ifallow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian','Laplacian', or'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. 'pca' will replace each group values with 50 PCs from principal component analysis. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X(array_like): Data matrix.adata.var_names(array_like): Feature names corresponding toadata.X.adata.obs['labels'](array_like): cell classes/phenotypes fromcell_labels.adata.uns['train_indices'](array_like): Indices for training data.adata.uns['test_indices'](array_like) Indices for testing data.adata.uns['group_dict'](dict): Grouping information.adata.uns['seed_obj'](np.random._generator.Generator): Seed object with seed equal to 100 *random_state.adata.uns['D'](int): Number of dimensions to scMKL with.adata.uns['scale_data'](bool): Whether or not data is scaled.adata.uns['transform_data'](bool): Whether or not data is log1p transformed.adata.uns['distance_metric'](str): Distance metric as given.adata.uns['kernel_type'](str): Kernel function as given.adata.uns['svd'](bool): Whether to calculate SVD reduction.adata.uns['tfidf'](bool): Whether to calculate TF-IDF per grouping.
Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'
609def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 610 group_dict: dict | str, use_raw: bool=False, 611 scale_data: bool=True, transform_data: bool=False, 612 split_data: np.ndarray | None=None, D: int | None=None, 613 remove_features: bool=True, train_ratio: float=0.8, 614 distance_metric: str='euclidean', kernel_type: str='Gaussian', 615 random_state: int=1, allow_multiclass: bool = False, 616 class_threshold: str | int | None=None, 617 reduction: str | None = None, tfidf: bool = False): 618 """ 619 Function to format an `ad.AnnData` object to carry all relevant 620 information going forward. `adata.obs_names` will be retained. 621 622 **NOTE: Information not needed for running `scmkl` will be 623 removed.** 624 625 Parameters 626 ---------- 627 adata : ad.AnnData 628 Object with data for `scmkl` to be applied to. Only requirment 629 is that `.var_names` is correct and data matrix is in `adata.X` 630 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 631 will be read in. 632 633 cell_labels : np.ndarray | str 634 If type `str`, the labels for `scmkl` to learn are captured 635 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 636 phenotypes corresponding with the cells in `adata.X`. 637 638 group_dict : dict | str 639 Dictionary containing feature grouping information (i.e. 640 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 641 np.array([...]), ...}`. A pickle file can be provided as a `str` 642 and it will be read in. 643 644 obs_names : None | np.ndarray 645 The cell names corresponding to `X` to be assigned to output 646 object `.obs_names` attribute. 647 648 use_raw : bool 649 If `False`, will use `adata.X` to create new `adata`. Else, 650 will use `adata.raw.X`. 651 652 scale_data : bool 653 If `True`, data matrix is log transformed and standard 654 scaled. 655 656 transform_data : bool 657 If `True`, data will be log1p transformed (recommended for 658 counts data). Default is `False`. 659 660 split_data : None | np.ndarray 661 If `None`, data will be split stratified by cell labels. 662 Else, is an array of precalculated train/test split 663 corresponding to samples. Can include labels for entire 664 dataset to benchmark performance or for only training 665 data to classify unknown cell types (i.e. `np.array(['train', 666 'test', ..., 'train'])`. 667 668 D : int 669 Number of Random Fourier Features used to calculate Z. 670 Should be a positive integer. Higher values of D will 671 increase classification accuracy at the cost of computation 672 time. If set to `None`, will be calculated given number of 673 samples. 674 675 remove_features : bool 676 If `True`, will remove features from `X` and `feature_names` 677 not in `group_dict` and remove features from groupings not in 678 `feature_names`. 679 680 train_ratio : float 681 Ratio of number of training samples to entire data set. Note: 682 if a threshold is applied, the ratio training samples may 683 decrease depending on class balance and `class_threshold` 684 parameter if `allow_multiclass = True`. 685 686 distance_metric : str 687 The pairwise distance metric used to estimate sigma. Must 688 be one of the options used in `scipy.spatial.distance.cdist`. 689 690 kernel_type : str 691 The approximated kernel function used to calculate Zs. 692 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 693 694 random_state : int 695 Integer random_state used to set the seed for 696 reproducibilty. 697 698 allow_multiclass : bool 699 If `False`, will ensure that cell labels are binary. 700 701 class_threshold : str | int 702 Number of samples allowed in the training data for each cell 703 class in the training data. If `'median'`, the median number 704 of cells per cell class will be the threshold for number of 705 samples per class. 706 707 reduction: str | None 708 Choose which dimension reduction technique to perform on 709 features within a group. 'svd' will run 710 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 711 by an array of 1s down to 50 dimensions. 712 713 tfidf: bool 714 Whether to calculate TFIDF transformation on peaks within 715 groupings. 716 717 Returns 718 ------- 719 adata : ad.AnnData 720 AnnData with the following attributes and keys: 721 722 `adata.X` (array_like): 723 Data matrix. 724 725 `adata.var_names` (array_like): 726 Feature names corresponding to `adata.X`. 727 728 `adata.obs['labels']` (array_like): 729 cell classes/phenotypes from `cell_labels`. 730 731 `adata.uns['train_indices']` (array_like): 732 Indices for training data. 733 734 `adata.uns['test_indices']` (array_like) 735 Indices for testing data. 736 737 `adata.uns['group_dict']` (dict): 738 Grouping information. 739 740 `adata.uns['seed_obj']` (np.random._generator.Generator): 741 Seed object with seed equal to 100 * `random_state`. 742 743 `adata.uns['D']` (int): 744 Number of dimensions to scMKL with. 745 746 `adata.uns['scale_data']` (bool): 747 Whether or not data is scaled. 748 749 `adata.uns['transform_data']` (bool): 750 Whether or not data is log1p transformed. 751 752 `adata.uns['distance_metric']` (str): 753 Distance metric as given. 754 755 `adata.uns['kernel_type']` (str): 756 Kernel function as given. 757 758 `adata.uns['svd']` (bool): 759 Whether to calculate SVD reduction. 760 761 `adata.uns['tfidf']` (bool): 762 Whether to calculate TF-IDF per grouping. 763 764 Examples 765 -------- 766 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 767 >>> group_dict = np.load('hallmark_genesets.pkl', 768 >>> allow_pickle = True) 769 >>> 770 >>> 771 >>> # The labels in adata.obs we want to learn are 'celltypes' 772 >>> adata = scmkl.format_adata(adata, 'celltypes', 773 ... group_dict) 774 >>> adata 775 AnnData object with n_obs × n_vars = 1000 × 4341 776 obs: 'labels' 777 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 778 'distance_metric', 'train_indices', 'test_indices' 779 """ 780 if str == type(adata): 781 adata = ad.read_h5ad(adata) 782 783 if str == type(group_dict): 784 group_dict = np.load(group_dict, allow_pickle=True) 785 786 if str == type(cell_labels): 787 err_msg = f"{cell_labels} is not in `adata.obs`" 788 assert cell_labels in adata.obs.keys(), err_msg 789 cell_labels = adata.obs[cell_labels].to_numpy() 790 791 if use_raw: 792 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 793 X = adata.raw.X 794 else: 795 X = adata.X 796 797 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 798 group_dict, adata.obs_names.to_numpy().copy(), 799 scale_data, transform_data, split_data, D, remove_features, 800 train_ratio, distance_metric, kernel_type, 801 random_state, allow_multiclass, class_threshold, 802 reduction, tfidf) 803 804 return adata
Function to format an ad.AnnData object to carry all relevant
information going forward. adata.obs_names will be retained.
NOTE: Information not needed for running scmkl will be
removed.
Parameters
- adata (ad.AnnData):
Object with data for
scmklto be applied to. Only requirment is that.var_namesis correct and data matrix is inadata.Xoradata.raw.X. A h5ad file can be provided as astrand it will be read in. - cell_labels (np.ndarray | str):
If type
str, the labels forscmklto learn are captured fromadata.obs['cell_labels']. Else, anp.ndarrayof cell phenotypes corresponding with the cells inadata.X. - group_dict (dict | str):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. A pickle file can be provided as astrand it will be read in. - obs_names (None | np.ndarray):
The cell names corresponding to
Xto be assigned to output object.obs_namesattribute. - use_raw (bool):
If
False, will useadata.Xto create newadata. Else, will useadata.raw.X. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. - transform_data (bool):
If
True, data will be log1p transformed (recommended for counts data). Default isFalse. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train']). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features fromXandfeature_namesnot ingroup_dictand remove features from groupings not infeature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_thresholdparameter ifallow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian','Laplacian', or'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X(array_like): Data matrix.adata.var_names(array_like): Feature names corresponding toadata.X.adata.obs['labels'](array_like): cell classes/phenotypes fromcell_labels.adata.uns['train_indices'](array_like): Indices for training data.adata.uns['test_indices'](array_like) Indices for testing data.adata.uns['group_dict'](dict): Grouping information.adata.uns['seed_obj'](np.random._generator.Generator): Seed object with seed equal to 100 *random_state.adata.uns['D'](int): Number of dimensions to scMKL with.adata.uns['scale_data'](bool): Whether or not data is scaled.adata.uns['transform_data'](bool): Whether or not data is log1p transformed.adata.uns['distance_metric'](str): Distance metric as given.adata.uns['kernel_type'](str): Kernel function as given.adata.uns['svd'](bool): Whether to calculate SVD reduction.adata.uns['tfidf'](bool): Whether to calculate TF-IDF per grouping.
Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>>
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes',
... group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'