scmkl.multimodal_processing
import numpy as np
import anndata as ad
import gc

from scmkl.tfidf_normalize import tfidf_normalize
from scmkl.data_processing import sparse_var
from scmkl.calculate_z import calculate_z


def combine_modalities(adatas: list[ad.AnnData], names: list[str],
                       combination: str = 'concatenate'):
    """
    Combines data sets for multimodal classification. Combined group
    names are `f'{name}-{group_name}'`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of AnnData objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order).

    names : list[str]
        List of string names for each modality, respective to each
        object in adatas.

    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`.

    Returns
    -------
    combined_adata : ad.AnnData
        AnnData object with the combined Z matrices and annotations.
    """
    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
                                                            "have the same "
                                                            "number of rows")
    assert len(np.unique(names)) == len(names), "Assay names must be distinct"
    assert combination.lower() in ['sum', 'concatenate']

    z_train = all(['Z_train' in adata.uns.keys() for adata in adatas])
    z_test = all(['Z_test' in adata.uns.keys() for adata in adatas])

    assert all([z_train, z_test]), "Z not calculated for one or more adatas"

    # Combining modalities
    combined_adata = ad.concat(adatas, uns_merge = 'same',
                               axis = 1, label = 'labels')

    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
                                                          "test splits "
                                                          "between AnnData "
                                                          "objects")

    # Conserving labels from adatas
    combined_adata.obs = adatas[0].obs.copy()

    # Creating a single dictionary with all of the groups across modalities
    group_dict = {}
    for name, adata in zip(names, adatas):
        for group_name, features in adata.uns['group_dict'].items():
            group_dict[f'{name}-{group_name}'] = features

    if combination == 'concatenate':
        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train']
                                                   for adata in adatas])
        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test']
                                                  for adata in adatas])

    elif combination == 'sum':

        # Check that the dimensions of all Z's are the same
        dims = [adata.uns['Z_train'].shape for adata in adatas]
        dims = all([dim == dims[0] for dim in dims])
        assert dims, "Cannot sum Z matrices with different dimensions"

        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train']
                                                for adata in adatas],
                                               axis = 0)
        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test']
                                               for adata in adatas],
                                              axis = 0)

    combined_adata.uns['group_dict'] = group_dict

    if 'seed_obj' in adatas[0].uns_keys():
        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
    else:
        print("No random seed present in adata. "
              "Recommended for reproducibility.")

    del adatas
    gc.collect()

    return combined_adata


def multimodal_processing(adatas: list[ad.AnnData], names: list[str],
                          tfidf: list[bool], combination: str = 'concatenate',
                          batches: int = 10,
                          batch_size: int = 100) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order).

    names : list[str]
        List of string names for each modality, respective to each
        object in adatas.

    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`.

    tfidf : list[bool]
        If element `i` is `True`, `adata[i]` will be TF-IDF normalized.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will
        converge to population distance values at the cost of
        scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. A higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size`
        will be reduced to `int(num_training_cells / batches)`.

    Returns
    -------
    adata : ad.AnnData
        AnnData object concatenated from the objects in `adatas` with
        Z matrices calculated.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names = gene_names,
    ...                                scale_data = True,
    ...                                cell_labels = cell_labels,
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
    ...                                 feature_names = peak_names,
    ...                                 scale_data = False,
    ...                                 cell_labels = cell_labels,
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
      obs: 'labels'
      var: 'labels'
      uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
      'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings
    warnings.filterwarnings('ignore')

    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn

    # True if all train indices match
    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'],
                                        adatas[i].uns['train_indices'])
                         for i in range(1, len(adatas))])

    # True if all test indices match
    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'],
                                       adatas[i].uns['test_indices'])
                        for i in range(1, len(adatas))])

    assert same_train, "Different train indices"
    assert same_test, "Different test indices"

    # Creates a boolean array for each modality of cells with non-empty rows
    non_empty_rows = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel()
                      for adata in adatas]

    # Returns a 1d array where sample feature sums
    # across all modalities are more than 0
    non_empty_rows = np.logical_and(*non_empty_rows).squeeze()

    # Initializing final train test split array
    train_test = np.repeat('train', adatas[0].shape[0])
    train_test[adatas[0].uns['test_indices']] = 'test'

    # Capturing train test split with empty rows filtered out
    train_test = train_test[non_empty_rows]
    train_indices = np.where(train_test == 'train')[0]
    test_indices = np.where(train_test == 'test')[0]

    # Adding train test split arrays to AnnData objects
    # and filtering out empty samples
    for i, adata in enumerate(adatas):
        adatas[i].uns['train_indices'] = train_indices
        adatas[i].uns['test_indices'] = test_indices
        adatas[i] = adata[non_empty_rows, :]
        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adatas[i] = tfidf_normalize(adata)

        print(f"Estimating sigma and calculating Z for {names[i]}", flush = True)
        adatas[i] = calculate_z(adata, n_features = 5000, batches=batches,
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                         "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                               names=names,
                               combination=combination)

    del adatas
    gc.collect()

    return adata
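One step in multimodal_processing worth highlighting: cells are kept only if they are non-empty in every modality, and the train/test split is then re-indexed over the remaining cells. Below is a minimal NumPy sketch of the filtering idea only; the toy matrices are made up, np.var is a dense stand-in for scmkl's sparse_var, and np.logical_and.reduce generalizes the two-modality np.logical_and(*masks) call in the source.

import numpy as np

# Toy dense matrices: 4 cells x 3 features per modality (values made up).
rna_X = np.array([[1., 0., 2.],
                  [0., 0., 0.],   # cell 1 is empty in RNA
                  [3., 1., 0.],
                  [2., 0., 2.]])
atac_X = np.array([[1., 1., 0.],
                   [4., 0., 1.],
                   [0., 0., 0.],  # cell 2 is empty in ATAC
                   [1., 0., 3.]])

# Per-modality masks of rows whose features vary; np.var stands in for the
# sparse-aware sparse_var(adata.X, axis = 1) != 0 check used in the source.
masks = [np.var(X, axis=1) != 0 for X in (rna_X, atac_X)]

# Keep only cells that are non-empty in every modality.
non_empty_rows = np.logical_and.reduce(masks)

print(non_empty_rows)                  # [ True False False  True]
print(rna_X[non_empty_rows, :].shape)  # (2, 3)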
def combine_modalities(adatas: list[ad.AnnData], names: list[str], combination: str = 'concatenate'):
Combines data sets for multimodal classification. Combined group names are f'{name}-{group_name}'.

Parameters
- adatas (list[ad.AnnData]): List of AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
- names (list[str]): List of string names for each modality, respective to each object in adatas.
- combination (str): How to combine the matrices, either 'sum' or 'concatenate'.

Returns
- combined_adata (ad.AnnData): AnnData object with the combined Z matrices and annotations.
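To make the two combination modes concrete, the following is a minimal NumPy sketch (not scmkl itself) of what combine_modalities does to the Z matrices and the group names; the shapes, modality names, and group/feature names are made up for illustration.

import numpy as np

rng = np.random.default_rng(0)

# Hypothetical Z matrices for two modalities over the same 5 training cells,
# each with 4 Z columns (all shapes here are made up for this sketch).
rna_z = rng.random((5, 4))
atac_z = rng.random((5, 4))

# combination='concatenate': Z matrices are stacked column-wise,
# giving one block of columns per modality.
z_concat = np.hstack([rna_z, atac_z])    # shape (5, 8)

# combination='sum': Z matrices are added element-wise,
# which is why all Z matrices must share the same dimensions.
z_sum = np.sum([rna_z, atac_z], axis=0)  # shape (5, 4)

# Group names are prefixed with the modality name, mirroring
# group_dict[f'{name}-{group_name}'] in the source.
rna_groups = {'PATHWAY_X': {'GENE_1', 'GENE_2'}}
atac_groups = {'PATHWAY_X': {'chr1-100-600'}}

group_dict = {}
for name, groups in [('rna', rna_groups), ('atac', atac_groups)]:
    for group_name, features in groups.items():
        group_dict[f'{name}-{group_name}'] = features

print(z_concat.shape, z_sum.shape, sorted(group_dict))
# (5, 8) (5, 4) ['atac-PATHWAY_X', 'rna-PATHWAY_X']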
def multimodal_processing(adatas: list[ad.AnnData], names: list[str], tfidf: list[bool], combination: str = 'concatenate', batches: int = 10, batch_size: int = 100) -> ad.AnnData:
Combines and processes a list of ad.AnnData objects.

Parameters
- adatas (list[ad.AnnData]): List of ad.AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
- names (list[str]): List of string names for each modality, respective to each object in adatas.
- combination (str): How to combine the matrices, either 'sum' or 'concatenate'.
- tfidf (list[bool]): If element i is True, adata[i] will be TF-IDF normalized.
- batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
- batch_size (int): The number of cells to include per batch for distance calculations. A higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells / batches); see the short sketch after the Examples below.

Returns
- adata (ad.AnnData): AnnData object concatenated from the objects in adatas with Z matrices calculated.
Examples
>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
... feature_names = gene_names,
... scale_data = True,
... cell_labels = cell_labels,
... group_dict = rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
... feature_names = peak_names,
... scale_data = False,
... cell_labels = cell_labels,
... group_dict = atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas,
... names = mod_names,
... tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
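As a quick illustration of the batch_size guard described under the batch_size parameter, here is a small sketch with made-up numbers (800 training cells is an assumption, not a default):

# Made-up numbers: 800 training cells with the default batches=10, batch_size=100.
num_training_cells = 800
batches = 10
batch_size = 100

# 10 * 100 = 1000 > 800, so batch_size is reduced before sampling.
if batches * batch_size > num_training_cells:
    batch_size = int(num_training_cells / batches)

print(batch_size)  # 80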