scmkl.multimodal_processing
import numpy as np
import anndata as ad
import gc

from scmkl.tfidf_normalize import tfidf_normalize
from scmkl.estimate_sigma import estimate_sigma
from scmkl.calculate_z import calculate_z, _sparse_var


def _combine_modalities(adatas : list, names : list,
                        combination = 'concatenate'):
    '''
    Combines data sets for multimodal classification. Combined group
    names are `<name>-<group_name>`.

    Parameters
    ----------
    adatas : a list of AnnData objects where each object is a different
             modality. Every object must already hold `Z_train`,
             `Z_test` and `group_dict` in `.uns`.

    names : a list of distinct string names for each modality
            respective to each object in adatas

    combination : How to combine the Z matrices, either 'sum' or
                  'concatenate' (case-insensitive)

    Returns
    -------
    combined_adata : AnnData object with the combined Z matrices and
                     annotations. Annotations must match.

    Raises
    ------
    AssertionError : if the objects have different numbers of rows,
                     names are not distinct, `combination` is not
                     recognised, Z matrices are missing, the train/test
                     split differs between objects, or (for 'sum') the
                     Z matrices have different shapes.
    '''
    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
                                                            "have the same "
                                                            "number of rows")
    assert len(np.unique(names)) == len(names), 'Assay names must be distinct'

    # Normalise once so that validation and the branches below agree;
    # previously 'Concatenate' passed validation but matched no branch.
    combination = combination.lower()
    assert combination in ['sum', 'concatenate'], ("combination must be "
                                                   "'sum' or 'concatenate'")

    z_train = all('Z_train' in adata.uns.keys() for adata in adatas)
    z_test = all('Z_test' in adata.uns.keys() for adata in adatas)

    assert all([z_train, z_test]), "Z not calculated for one or more adatas"

    # Combining modalities feature-wise. uns_merge = 'same' keeps only the
    # .uns entries that are identical in every object, which is why a
    # missing 'train_indices' below signals mismatching splits.
    combined_adata = ad.concat(adatas, uns_merge = 'same',
                               axis = 1, label = 'labels')

    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
                                                          "test splits "
                                                          "between AnnData "
                                                          "objects")

    # Conserving labels from adatas
    combined_adata.obs = adatas[0].obs

    # Creating a single dictionary with all of the groups across modalities
    group_dict = {}
    for name, adata in zip(names, adatas):
        for group_name, features in adata.uns['group_dict'].items():
            group_dict[f'{name}-{group_name}'] = features

    if combination == 'concatenate':
        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train']
                                                   for adata in adatas])
        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test']
                                                  for adata in adatas])

    elif combination == 'sum':

        # Check that the dimensions of all Z's are the same
        for key in ('Z_train', 'Z_test'):
            dims = [adata.uns[key].shape for adata in adatas]
            same_dims = all(dim == dims[0] for dim in dims)
            assert same_dims, ('Cannot sum Z matrices with different '
                               'dimensions')

        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train']
                                                for adata in adatas],
                                               axis = 0)
        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test']
                                               for adata in adatas],
                                              axis = 0)

    combined_adata.uns['group_dict'] = group_dict

    if 'seed_obj' in adatas[0].uns_keys():
        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
    else:
        print("No random seed present in adata. "
              "Recommended for reproducibility.")

    # Drops only this function's reference to the list; the caller's
    # objects stay alive as long as the caller references them.
    del adatas
    gc.collect()

    return combined_adata


def multimodal_processing(adatas : list, names : list, tfidf: list):
    '''
    Combines and processes a list of adata objects.

    Cells that are flagged as empty in any modality are removed from
    every modality, the train/test indices are recomputed for the
    remaining cells, and Z matrices are calculated for objects that do
    not already have them before the modalities are concatenated.

    Parameters
    ----------
    **adatas** : *list[AnnData]*
        > List of AnnData objects where each object is a different
        modality for the same cells. Each object must hold
        `train_indices` and `test_indices` in `.uns`. The list is
        updated in place with the processed objects.

    **names** : *list[str]*
        > List of string names for each modality respective to each
        object in `adatas`.

    **tfidf** : *list[bool]*
        > List where if element i is `True`, adatas[i] will be TFIDF
        normalized.

    Returns
    -------
    **adata** : *AnnData*
        > Concatenation of the objects in `adatas` with Z matrices
        calculated.

    Raises
    ------
    **AssertionError**
        > If the objects have different numbers of cells, different
        train/test indices, or different cell labels.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names = gene_names,
    ...                                scale_data = True,
    ...                                cell_labels = cell_labels,
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
    ...                                 feature_names = peak_names,
    ...                                 scale_data = False,
    ...                                 cell_labels = cell_labels,
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    '''
    import warnings
    # NOTE(review): this installs a process-wide 'ignore' filter that stays
    # active after the function returns; kept for backward compatibility.
    warnings.filterwarnings('ignore')

    # The previous check only tested that every count was non-zero, so
    # objects with different numbers of cells slipped through.
    n_cells = {adata.shape[0] for adata in adatas}
    assert len(n_cells) == 1, ("Different number of "
                               "cells present in "
                               "each object")

    # True if all train indices match
    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'],
                                        adatas[i].uns['train_indices'])
                         for i in range(1, len(adatas))])

    # True if all test indices match
    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'],
                                       adatas[i].uns['test_indices'])
                        for i in range(1, len(adatas))])

    assert same_train, 'Different train indices'
    assert same_test, 'Different test indices'

    # One boolean array per modality marking cells whose
    # `_sparse_var(X, axis = 1)` value is non-zero
    # (presumably the per-cell variance -- confirm against calculate_z).
    non_empty_rows = [np.array(_sparse_var(adata.X, axis = 1) != 0).ravel()
                      for adata in adatas]

    # Keep a cell only if it is non-empty in every modality. `.reduce`
    # handles any number of modalities; unpacking into np.logical_and only
    # worked for exactly two (a third array would be taken as `out`).
    non_empty_rows = np.logical_and.reduce(non_empty_rows)

    # Initializing final train test split array
    train_test = np.repeat('train', adatas[0].shape[0])
    train_test[adatas[0].uns['test_indices']] = 'test'

    # Capturing train test split with empty rows filtered out
    train_test = train_test[non_empty_rows]
    train_indices = np.where(train_test == 'train')[0]
    test_indices = np.where(train_test == 'test')[0]

    # Adding train test split arrays to AnnData objects
    # and filtering out empty samples
    for i, adata in enumerate(adatas):
        # Work on the filtered object from here on. The previous code kept
        # passing the unfiltered loop variable to the helpers, which threw
        # the row filtering away while keeping the filtered indices.
        # `.copy()` materialises the view before `.uns` is written to.
        filtered = adata[non_empty_rows, :].copy()
        filtered.uns['train_indices'] = train_indices
        filtered.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            filtered = tfidf_normalize(filtered)

        if 'Z_train' not in filtered.uns.keys():
            print(f'Estimating Sigma for {names[i]}', flush = True)
            filtered = estimate_sigma(filtered, n_features= 200)
            print(f'Calculating Z for {names[i]}', flush = True)
            filtered = calculate_z(filtered, n_features = 5000)

        # AnnData update must be pointing at the object in list
        adatas[i] = filtered

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, (f"Cell labels between AnnData object in "
                                 f"position 0 and position {i} in adatas do "
                                 "not match")

    adata = _combine_modalities(adatas = adatas,
                                names = names,
                                combination = 'concatenate')

    # Drops only the local reference to the list.
    del adatas
    gc.collect()

    return adata
def
multimodal_processing(adatas: list, names: list, tfidf: list):
def multimodal_processing(adatas : list, names : list, tfidf: list):
    '''
    Combines and processes a list of adata objects.

    Cells that are flagged as empty in any modality are removed from
    every modality, the train/test indices are recomputed for the
    remaining cells, and Z matrices are calculated for objects that do
    not already have them before the modalities are concatenated.

    Parameters
    ----------
    **adatas** : *list[AnnData]*
        > List of AnnData objects where each object is a different
        modality for the same cells. Each object must hold
        `train_indices` and `test_indices` in `.uns`. The list is
        updated in place with the processed objects.

    **names** : *list[str]*
        > List of string names for each modality respective to each
        object in `adatas`.

    **tfidf** : *list[bool]*
        > List where if element i is `True`, adatas[i] will be TFIDF
        normalized.

    Returns
    -------
    **adata** : *AnnData*
        > Concatenation of the objects in `adatas` with Z matrices
        calculated.

    Raises
    ------
    **AssertionError**
        > If the objects have different numbers of cells, different
        train/test indices, or different cell labels.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names = gene_names,
    ...                                scale_data = True,
    ...                                cell_labels = cell_labels,
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
    ...                                 feature_names = peak_names,
    ...                                 scale_data = False,
    ...                                 cell_labels = cell_labels,
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    '''
    import warnings
    # NOTE(review): this installs a process-wide 'ignore' filter that stays
    # active after the function returns; kept for backward compatibility.
    warnings.filterwarnings('ignore')

    # The previous check only tested that every count was non-zero, so
    # objects with different numbers of cells slipped through.
    n_cells = {adata.shape[0] for adata in adatas}
    assert len(n_cells) == 1, ("Different number of "
                               "cells present in "
                               "each object")

    # True if all train indices match
    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'],
                                        adatas[i].uns['train_indices'])
                         for i in range(1, len(adatas))])

    # True if all test indices match
    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'],
                                       adatas[i].uns['test_indices'])
                        for i in range(1, len(adatas))])

    assert same_train, 'Different train indices'
    assert same_test, 'Different test indices'

    # One boolean array per modality marking cells whose
    # `_sparse_var(X, axis = 1)` value is non-zero
    # (presumably the per-cell variance -- confirm against calculate_z).
    non_empty_rows = [np.array(_sparse_var(adata.X, axis = 1) != 0).ravel()
                      for adata in adatas]

    # Keep a cell only if it is non-empty in every modality. `.reduce`
    # handles any number of modalities; unpacking into np.logical_and only
    # worked for exactly two (a third array would be taken as `out`).
    non_empty_rows = np.logical_and.reduce(non_empty_rows)

    # Initializing final train test split array
    train_test = np.repeat('train', adatas[0].shape[0])
    train_test[adatas[0].uns['test_indices']] = 'test'

    # Capturing train test split with empty rows filtered out
    train_test = train_test[non_empty_rows]
    train_indices = np.where(train_test == 'train')[0]
    test_indices = np.where(train_test == 'test')[0]

    # Adding train test split arrays to AnnData objects
    # and filtering out empty samples
    for i, adata in enumerate(adatas):
        # Work on the filtered object from here on. The previous code kept
        # passing the unfiltered loop variable to the helpers, which threw
        # the row filtering away while keeping the filtered indices.
        # `.copy()` materialises the view before `.uns` is written to.
        filtered = adata[non_empty_rows, :].copy()
        filtered.uns['train_indices'] = train_indices
        filtered.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            filtered = tfidf_normalize(filtered)

        if 'Z_train' not in filtered.uns.keys():
            print(f'Estimating Sigma for {names[i]}', flush = True)
            filtered = estimate_sigma(filtered, n_features= 200)
            print(f'Calculating Z for {names[i]}', flush = True)
            filtered = calculate_z(filtered, n_features = 5000)

        # AnnData update must be pointing at the object in list
        adatas[i] = filtered

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, (f"Cell labels between AnnData object in "
                                 f"position 0 and position {i} in adatas do "
                                 "not match")

    adata = _combine_modalities(adatas = adatas,
                                names = names,
                                combination = 'concatenate')

    # Drops only the local reference to the list.
    del adatas
    gc.collect()

    return adata
Combines and processes a list of adata objects.
Parameters
adatas : list[AnnData]
List of AnnData objects where each object is a different modality for the same cells.
names : list[str]
List of string names for each modality respective to each object in
adatas
.
tfidf : list[bool]
List where if element i is
True
, adatas[i] will be TFIDF normalized.
Returns
adata : AnnData
Concatenation of the objects in
adatas
with Z matrices calculated.
Examples
>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
... feature_names = gene_names,
... scale_data = True,
... cell_labels = cell_labels,
... group_dict = rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
... feature_names = peak_names,
... scale_data = False,
... cell_labels = cell_labels,
... group_dict = atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas,
... names = mod_names,
... tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'