scmkl.multimodal_processing
"""
scmkl.multimodal_processing

Combine per-modality AnnData objects into a single AnnData with merged
Z matrices, group dictionaries, and a shared train/test split.
"""

import gc

import anndata as ad
import numpy as np

from scmkl.calculate_z import calculate_z
from scmkl.data_processing import sparse_var
from scmkl.tfidf_normalize import tfidf_normalize


def combine_modalities(adatas: list[ad.AnnData], names: list[str],
                       combination: str = 'concatenate'):
    """
    Combines data sets for multimodal classification. Combined group
    names are `f'{name}-{group_name}'`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of AnnData objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order).

    names : list[str]
        List of string names for each modality respective to each
        object in adatas.

    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`
        (case-insensitive).

    Returns
    -------
    combined_adata : ad.AnnData
        Adata object with the combined Z matrices and annotations.
    """
    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
                                                            "have the same "
                                                            "number of rows")
    assert len(np.unique(names)) == len(names), "Assay names must be distinct"

    # Normalize case once so the branches below always match; previously
    # 'Sum'/'Concatenate' passed the assert but fell through both branches,
    # leaving Z_train/Z_test unset on the combined object.
    combination = combination.lower()
    assert combination in ['sum', 'concatenate']

    z_train = all('Z_train' in adata.uns.keys() for adata in adatas)
    z_test = all('Z_test' in adata.uns.keys() for adata in adatas)

    assert all([z_train, z_test]), "Z not calculated for one or more adatas"

    # Combining modalities; with uns_merge='same', uns entries are only
    # retained when identical across all objects
    combined_adata = ad.concat(adatas, uns_merge='same',
                               axis=1, label='labels')

    # 'train_indices' survives the 'same' merge only if every object
    # carried an identical train/test split
    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
                                                          "test splits "
                                                          "between AnnData "
                                                          "objects")

    # Conserving labels from adatas
    combined_adata.obs = adatas[0].obs.copy()

    # Creating a single dictionary with all of the groups across modalities
    group_dict = {}
    for name, adata in zip(names, adatas):
        for group_name, features in adata.uns['group_dict'].items():
            group_dict[f'{name}-{group_name}'] = features

    if combination == 'concatenate':
        # Feature dimension grows: Z matrices are stacked side by side
        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train']
                                                   for adata in adatas])
        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test']
                                                  for adata in adatas])

    elif combination == 'sum':
        # Check that the dimensions of all Z's are the same
        dims = [adata.uns['Z_train'].shape for adata in adatas]
        same_dims = all(dim == dims[0] for dim in dims)
        assert same_dims, "Cannot sum Z matrices with different dimensions"

        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train']
                                                for adata in adatas],
                                               axis=0)
        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test']
                                               for adata in adatas],
                                              axis=0)

    combined_adata.uns['group_dict'] = group_dict

    if 'seed_obj' in adatas[0].uns_keys():
        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
    else:
        # Implicit string concatenation previously printed "adataRecommended";
        # separating punctuation restores the intended message
        print("No random seed present in adata. "
              "Recommended for reproducibility.")

    del adatas
    gc.collect()

    return combined_adata


def multimodal_processing(adatas: list[ad.AnnData], names: list[str],
                          tfidf: list[bool], combination: str = 'concatenate',
                          batches: int = 10, batch_size: int = 100,
                          verbose: bool = True) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order).

    names : list[str]
        List of string names for each modality respective to each
        object in adatas.

    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF normalized.

    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will
        converge to population distance values at the cost of
        scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size`
        will be reduced to `int(num_training_cells / batches)`.

    verbose : bool
        If `True`, progress is printed while Z matrices are calculated.

    Returns
    -------
    adata : ad.AnnData
        Concatenated from objects from `adatas` with Z matrices
        calculated.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names=gene_names,
    ...                                scale_data=True,
    ...                                transform_data=True,
    ...                                cell_labels=cell_labels,
    ...                                group_dict=rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat,
    ...                                 feature_names=peak_names,
    ...                                 scale_data=False,
    ...                                 cell_labels=cell_labels,
    ...                                 group_dict=atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings
    # NOTE(review): this silences warnings process-wide, not just for this
    # call — kept for output parity, but consider warnings.catch_warnings()
    warnings.filterwarnings('ignore')

    # Every object must describe the same cells. The previous check,
    # all([adata.shape[0] ...]), only verified each object was non-empty
    # and never enforced equal cell counts as its message claimed.
    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn

    # True if all train indices match
    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'],
                                        adatas[i].uns['train_indices'])
                         for i in range(1, len(adatas))])

    # True if all test indices match
    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'],
                                       adatas[i].uns['test_indices'])
                        for i in range(1, len(adatas))])

    assert same_train, "Different train indices"
    assert same_test, "Different test indices"

    # Creates a boolean array for each modality of cells with non-empty rows
    non_empty_rows = [np.array(sparse_var(adata.X, axis=1) != 0).ravel()
                      for adata in adatas]
    non_empty_rows = np.transpose(non_empty_rows)

    # Returns a 1D array where sample feature sums non-0 across all modalities
    non_empty_rows = np.array([np.all(non_empty_rows[i])
                               for i in range(non_empty_rows.shape[0])])

    # Initializing final train test split array
    train_test = np.repeat('train', adatas[0].shape[0])
    train_test[adatas[0].uns['test_indices']] = 'test'

    # Capturing train test split with empty rows filtered out
    train_test = train_test[non_empty_rows]
    train_indices = np.where(train_test == 'train')[0]
    test_indices = np.where(train_test == 'test')[0]

    # Adding train test split arrays to AnnData objects
    # and filtering out empty samples
    for i, adata in enumerate(adatas):
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        # Rebind the loop variable so every later step sees the filtered
        # object. Previously the stale, unfiltered `adata` was passed to
        # tfidf_normalize/calculate_z, silently discarding this filter.
        adata = adata[non_empty_rows, :]

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adata = tfidf_normalize(adata)

        if verbose:
            print(f"Estimating sigma and calculating Z for {names[i]}",
                  flush=True)
        adatas[i] = calculate_z(adata, n_features=5000, batches=batches,
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                          "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                               names=names,
                               combination=combination)

    del adatas
    gc.collect()

    return adata
def
combine_modalities( adatas: list[anndata._core.anndata.AnnData], names: list[str], combination: str = 'concatenate'):
11def combine_modalities(adatas: list[ad.AnnData], names: list[str], 12 combination: str = 'concatenate'): 13 """ 14 Combines data sets for multimodal classification. Combined group 15 names are `f'{assay}+{group_name}'`. 16 17 Parameters 18 ---------- 19 adatas : list[ad.AnnData] 20 List of AnnData objects where each object is a different 21 modality. Annotations must match between objects (i.e. same 22 sample order). 23 24 names : list[str] 25 List of strings names for each modality repective to each 26 object in adatas. 27 28 combination: str 29 How to combine the matrices, either `'sum'` or `'concatenate'`. 30 31 Returns 32 ------- 33 combined_adata : ad.Anndata 34 Adata object with the combined Z matrices and annotations. 35 """ 36 assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must " 37 "have the same " 38 "number of rows") 39 assert len(np.unique(names)) == len(names), "Assay names must be distinct" 40 assert combination.lower() in ['sum', 'concatenate'] 41 42 z_train = all(['Z_train' in adata.uns.keys() for adata in adatas]) 43 z_test = all(['Z_test' in adata.uns.keys() for adata in adatas]) 44 45 assert all([z_train, z_test]), "Z not calculated for one or more adatas" 46 47 # Combining modalities 48 combined_adata = ad.concat(adatas, uns_merge = 'same', 49 axis = 1, label = 'labels') 50 51 assert 'train_indices' in combined_adata.uns.keys(), ("Different train " 52 "test splits " 53 "between AnnData " 54 "objects") 55 56 # Conserving labels from adatas 57 combined_adata.obs = adatas[0].obs.copy() 58 59 # Creating a single dictionary with all of the groups across modalities 60 group_dict = {} 61 for name, adata in zip(names, adatas): 62 for group_name, features in adata.uns['group_dict'].items(): 63 group_dict[f'{name}-{group_name}'] = features 64 65 if combination == 'concatenate': 66 combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train'] 67 for adata in adatas]) 68 combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test'] 
69 for adata in adatas]) 70 71 72 elif combination == 'sum': 73 74 #Check that the dimensions of all Z's are the same 75 dims = [adata.uns['Z_train'].shape for adata in adatas] 76 dims = all([dim == dims[0] for dim in dims]) 77 assert dims, "Cannot sum Z matrices with different dimensions" 78 79 combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train'] 80 for adata in adatas], 81 axis = 0) 82 combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test'] 83 for adata in adatas], 84 axis = 0) 85 86 87 combined_adata.uns['group_dict'] = group_dict 88 89 if 'seed_obj' in adatas[0].uns_keys(): 90 combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj'] 91 else: 92 print("No random seed present in adata" 93 "Recommended for reproducibility.") 94 95 del adatas 96 gc.collect() 97 98 return combined_adata
Combines data sets for multimodal classification. Combined group
names are f'{name}-{group_name}'.
Parameters
- adatas (list[ad.AnnData]): List of AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
- names (list[str]): List of string names for each modality respective to each object in adatas.
- combination (str):
How to combine the matrices, either
`'sum'` or `'concatenate'`.
Returns
- combined_adata (ad.Anndata): Adata object with the combined Z matrices and annotations.
def
multimodal_processing( adatas: list[anndata._core.anndata.AnnData], names: list[str], tfidf: list[bool], combination: str = 'concatenate', batches: int = 10, batch_size: int = 100, verbose: bool = True) -> anndata._core.anndata.AnnData:
101def multimodal_processing(adatas : list[ad.AnnData], names : list[str], 102 tfidf: list[bool], combination: str='concatenate', 103 batches: int=10, batch_size: int=100, 104 verbose: bool=True) -> ad.AnnData: 105 """ 106 Combines and processes a list of `ad.AnnData` objects. 107 108 Parameters 109 ---------- 110 adatas : list[ad.AnnData] 111 List of `ad.AnnData` objects where each object is a different 112 modality. Annotations must match between objects (i.e. same 113 sample order). 114 115 names : list[str] 116 List of strings names for each modality repective to each 117 object in adatas. 118 119 combination: str 120 How to combine the matrices, either `'sum'` or `'concatenate'`. 121 122 tfidf : list[bool] 123 If element `i` is `True`, `adata[i]` will be TF-IDF normalized. 124 125 batches : int 126 The number of batches to use for the distance calculation. 127 This will average the result of `batches` distance calculations 128 of `batch_size` randomly sampled cells. More batches will converge 129 to population distance values at the cost of scalability. 130 131 batch_size : int 132 The number of cells to include per batch for distance 133 calculations. Higher batch size will converge to population 134 distance values at the cost of scalability. 135 If `batches*batch_size > num_training_cells`, `batch_size` 136 will be reduced to `int(num_training_cells / batches)`. 137 138 Returns 139 ------- 140 adata : ad.AnnData 141 Concatenated from objects from `adatas` with Z matrices 142 calculated. 143 144 Examples 145 -------- 146 >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 147 ... feature_names=gene_names, 148 ... scale_data=True, 149 ... transform_data=True, 150 ... cell_labels=cell_labels, 151 ... group_dict=rna_grouping) 152 >>> 153 >>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat, 154 ... feature_names=peak_names, 155 ... scale_data=False, 156 ... cell_labels=cell_labels, 157 ... 
group_dict=atac_grouping) 158 >>> 159 >>> adatas = [rna_adata, atac_adata] 160 >>> mod_names = ['rna', 'atac'] 161 >>> adata = scmkl.multimodal_processing(adatas = adatas, 162 ... names = mod_names, 163 ... tfidf = [False, True]) 164 >>> 165 >>> adata 166 AnnData object with n_obs × n_vars = 1000 × 12676 167 obs: 'labels' 168 var: 'labels' 169 uns: 'D', 'kernel_type', 'distance_metric', 'train_indices', 170 'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj' 171 """ 172 import warnings 173 warnings.filterwarnings('ignore') 174 175 diff_num_warn = "Different number of cells present in each object." 176 assert all([adata.shape[0] for adata in adatas]), diff_num_warn 177 178 # True if all train indices match 179 same_train = np.all([np.array_equal(adatas[0].uns['train_indices'], 180 adatas[i].uns['train_indices']) 181 for i in range(1, len(adatas))]) 182 183 # True if all test indices match 184 same_test = np.all([np.array_equal(adatas[0].uns['test_indices'], 185 adatas[i].uns['test_indices']) 186 for i in range(1, len(adatas))]) 187 188 assert same_train, "Different train indices" 189 assert same_test, "Different test indices" 190 191 # Creates a boolean array for each modality of cells with non-empty rows 192 non_empty_rows = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel() 193 for adata in adatas] 194 non_empty_rows = np.transpose(non_empty_rows) 195 196 # Returns a 1D array where sample feature sums non-0 across all modalities 197 non_empty_rows = np.array([np.all(non_empty_rows[i]) 198 for i in range(non_empty_rows.shape[0])]) 199 200 # Initializing final train test split array 201 train_test = np.repeat('train', adatas[0].shape[0]) 202 train_test[adatas[0].uns['test_indices']] = 'test' 203 204 # Capturing train test split with empty rows filtered out 205 train_test = train_test[non_empty_rows] 206 train_indices = np.where(train_test == 'train')[0] 207 test_indices = np.where(train_test == 'test')[0] 208 209 # Adding train test split arrays to 
AnnData objects 210 # and filtering out empty samples 211 for i, adata in enumerate(adatas): 212 adatas[i].uns['train_indices'] = train_indices 213 adatas[i].uns['test_indices'] = test_indices 214 adatas[i] = adata[non_empty_rows, :] 215 216 # tfidf normalizing if corresponding element in tfidf is True 217 if tfidf[i]: 218 adatas[i] = tfidf_normalize(adata) 219 220 if verbose: 221 print(f"Estimating sigma and calculating Z for {names[i]}", 222 flush = True) 223 adatas[i] = calculate_z(adata, n_features = 5000, batches=batches, 224 batch_size=batch_size) 225 226 if 'labels' in adatas[0].obs: 227 all_labels = [adata.obs['labels'] for adata in adatas] 228 # Ensuring cell labels for each AnnData object are the same 229 uneq_labs_warn = ("Cell labels between AnnData object in position 0 " 230 "and position {} in adatas do not match") 231 for i in range(1, len(all_labels)): 232 same_labels = np.all(all_labels[0] == all_labels[i]) 233 assert same_labels, uneq_labs_warn.format(i) 234 235 adata = combine_modalities(adatas=adatas, 236 names=names, 237 combination=combination) 238 239 del adatas 240 gc.collect() 241 242 return adata
Combines and processes a list of ad.AnnData objects.
Parameters
- adatas (list[ad.AnnData]):
List of
`ad.AnnData` objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
- names (list[str]): List of string names for each modality respective to each object in adatas.
- combination (str):
How to combine the matrices, either
`'sum'` or `'concatenate'`.
- tfidf (list[bool]):
If element
`i` is `True`, `adata[i]` will be TF-IDF normalized.
- batches (int):
The number of batches to use for the distance calculation.
This will average the result of
`batches` distance calculations of `batch_size` randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
- batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability.
If
`batches*batch_size > num_training_cells`, `batch_size` will be reduced to `int(num_training_cells / batches)`.
Returns
- adata (ad.AnnData):
Concatenated from objects from
`adatas` with Z matrices calculated.
Examples
>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
... feature_names=gene_names,
... scale_data=True,
... transform_data=True,
... cell_labels=cell_labels,
... group_dict=rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat,
... feature_names=peak_names,
... scale_data=False,
... cell_labels=cell_labels,
... group_dict=atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas,
... names = mod_names,
... tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'