scmkl.multimodal_processing

  1import numpy as np
  2import anndata as ad
  3import gc
  4
  5from scmkl.tfidf_normalize import tfidf_normalize
  6from scmkl.data_processing import sparse_var
  7from scmkl.calculate_z import calculate_z
  8
  9
 10def combine_modalities(adatas: list[ad.AnnData], names: list[str], 
 11                        combination: str = 'concatenate'):
 12    """
 13    Combines data sets for multimodal classification. Combined group 
 14    names are `f'{assay}+{group_name}'`.
 15
 16    Parameters
 17    ----------
 18    adatas : list[ad.AnnData]
 19        List of AnnData objects where each object is a different 
 20        modality. Annotations must match between objects (i.e. same 
 21        sample order).
 22
 23    names : list[str]
 24        List of strings names for each modality repective to each 
 25        object in adatas.
 26            
 27    combination: str
 28        How to combine the matrices, either `'sum'` or `'concatenate'`.
 29    
 30    Returns
 31    -------
 32    combined_adata : ad.Anndata
 33        Adata object with the combined Z matrices and annotations. 
 34    """
 35    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
 36                                                            "have the same "
 37                                                            "number of rows")
 38    assert len(np.unique(names)) == len(names), "Assay names must be distinct"
 39    assert combination.lower() in ['sum', 'concatenate']
 40
 41    z_train = all(['Z_train' in adata.uns.keys() for adata in adatas])
 42    z_test = all(['Z_test' in adata.uns.keys() for adata in adatas])
 43
 44    assert all([z_train, z_test]), "Z not calculated for one or more adatas"
 45
 46    # Combining modalities
 47    combined_adata = ad.concat(adatas, uns_merge = 'same', 
 48                               axis = 1, label = 'labels')
 49    
 50    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
 51                                                          "test splits "
 52                                                          "between AnnData "
 53                                                          "objects")
 54
 55    # Conserving labels from adatas
 56    combined_adata.obs = adatas[0].obs.copy()
 57
 58    # Creating a single dictionary with all of the groups across modalities 
 59    group_dict = {}
 60    for name, adata in zip(names, adatas):
 61        for group_name, features in adata.uns['group_dict'].items():
 62            group_dict[f'{name}-{group_name}'] = features
 63
 64    if combination == 'concatenate':
 65        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train'] 
 66                                                   for adata in adatas])
 67        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test'] 
 68                                                  for adata in adatas])
 69
 70
 71    elif combination == 'sum':
 72
 73        #Check that the dimensions of all Z's are the same
 74        dims = [adata.uns['Z_train'].shape for adata in adatas]
 75        dims = all([dim == dims[0] for dim in dims])
 76        assert dims, "Cannot sum Z matrices with different dimensions"
 77        
 78        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train'] 
 79                                                for adata in adatas], 
 80                                                axis = 0)
 81        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test'] 
 82                                               for adata in adatas], 
 83                                               axis = 0)
 84
 85
 86    combined_adata.uns['group_dict'] = group_dict
 87
 88    if 'seed_obj' in adatas[0].uns_keys():
 89        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
 90    else:
 91        print("No random seed present in adata"
 92              "Recommended for reproducibility.")
 93
 94    del adatas
 95    gc.collect()
 96
 97    return combined_adata
 98
 99
def multimodal_processing(adatas : list[ad.AnnData], names : list[str], 
                          tfidf: list[bool], combination: str='concatenate', 
                          batches: int=10, batch_size: int=100, 
                          verbose: bool=True) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Cells whose `sparse_var` value is 0 in any modality are removed 
    from every modality, the train/test split is re-indexed to the 
    remaining cells, each modality is optionally TF-IDF normalized and 
    passed to `calculate_z`, and the results are merged with 
    `combine_modalities`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different 
        modality. Annotations must match between objects (i.e. same 
        sample order). Each object must hold `'train_indices'` and 
        `'test_indices'` in `.uns`. The list is updated in place: 
        each element is replaced by its processed copy.

    names : list[str]
        List of string names for each modality, respective to each 
        object in `adatas`.
            
    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`.
    
    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF normalized.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will converge
        to population distance values at the cost of scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size` 
        will be reduced to `int(num_training_cells / batches)`.

    verbose : bool
        If `True`, prints a progress message for each modality.

    Returns
    -------
    adata : ad.AnnData
        Object concatenated from the objects in `adatas`, with Z 
        matrices calculated.

    Raises
    ------
    AssertionError
        If `adatas`, `names` and `tfidf` differ in length, if the 
        objects differ in number of cells, train/test indices or 
        cell labels.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
    ...                                feature_names=gene_names, 
    ...                                scale_data=True, 
    ...                                transform_data=True,
    ...                                cell_labels=cell_labels, 
    ...                                group_dict=rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat, 
    ...                                 feature_names=peak_names, 
    ...                                 scale_data=False, 
    ...                                 cell_labels=cell_labels, 
    ...                                 group_dict=atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas, 
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings 
    # NOTE: this filter is installed process-wide and stays active after 
    # the function returns.
    warnings.filterwarnings('ignore')

    assert len(adatas) == len(names) == len(tfidf), ("adatas, names and "
                                                     "tfidf must have the "
                                                     "same length")

    # Comparing the distinct row counts; truthiness of each count would 
    # only detect empty objects, not mismatched ones.
    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn
    
    # Every modality must share the train/test split of the first one
    reference_uns = adatas[0].uns
    for split in ('train', 'test'):
        key = f'{split}_indices'
        same_split = all(np.array_equal(reference_uns[key], adata.uns[key]) 
                         for adata in adatas[1:])
        assert same_split, f"Different {split} indices"

    # One boolean array per modality flagging rows whose sparse_var value 
    # is non-zero (treated as non-empty cells)
    row_masks = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel() 
                 for adata in adatas]

    # A cell is kept only if it is flagged in every modality
    non_empty_rows = np.all(np.array(row_masks), axis = 0)

    # Marking test cells on the original row order; every other cell is 
    # treated as a training cell
    is_test = np.zeros(adatas[0].shape[0], dtype = bool)
    is_test[adatas[0].uns['test_indices']] = True

    # Re-indexing the split relative to the rows that remain after filtering
    is_test = is_test[non_empty_rows]
    train_indices = np.where(~is_test)[0]
    test_indices = np.where(is_test)[0]

    for i, adata in enumerate(adatas):
        # Filtering out empty samples. The filtered object (not the original 
        # loop variable) must be the one carried through the steps below, 
        # otherwise the new indices would refer to rows of an unfiltered 
        # matrix. Copying turns the AnnData view into a standalone object 
        # before it is modified.
        adata = adata[non_empty_rows, :].copy()
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adata = tfidf_normalize(adata)
        
        if verbose:
            print(f"Estimating sigma and calculating Z for {names[i]}", 
                  flush = True)
        adatas[i] = calculate_z(adata, n_features = 5000, batches=batches, 
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                          "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                                names=names,
                                combination=combination)

    # Dropping the local reference to the processed list before collecting
    del adatas
    gc.collect()

    return adata
def combine_modalities( adatas: list[anndata._core.anndata.AnnData], names: list[str], combination: str = 'concatenate'):
def combine_modalities(adatas: list[ad.AnnData], names: list[str], 
                        combination: str = 'concatenate'):
    """
    Combines data sets for multimodal classification. Combined group 
    names are `f'{name}-{group_name}'`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of AnnData objects where each object is a different 
        modality. Annotations must match between objects (i.e. same 
        sample order). Each object must hold `'Z_train'`, `'Z_test'` 
        and `'group_dict'` in `.uns`.

    names : list[str]
        List of distinct string names for each modality, respective 
        to each object in `adatas`.
            
    combination : str
        How to combine the Z matrices, either `'sum'` or 
        `'concatenate'` (case-insensitive).
    
    Returns
    -------
    combined_adata : ad.AnnData
        AnnData object with the combined Z matrices and annotations. 
        `.obs` is copied from the first object in `adatas`.

    Raises
    ------
    AssertionError
        If the inputs are inconsistent: mismatched lengths or row 
        counts, duplicated names, unknown `combination`, missing Z 
        matrices, differing train/test splits, or (for `'sum'`) Z 
        matrices of different shapes.
    """
    assert len(adatas) == len(names), ("One name is required for each "
                                       "object in adatas")
    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
                                                            "have the same "
                                                            "number of rows")
    assert len(np.unique(names)) == len(names), "Assay names must be distinct"

    # Normalize once so validation and branching agree; previously a value 
    # such as 'Sum' passed validation but matched neither branch below.
    combination = combination.lower()
    assert combination in ('sum', 'concatenate'), ("combination must be "
                                                   "'sum' or 'concatenate'")

    has_z = all('Z_train' in adata.uns and 'Z_test' in adata.uns 
                for adata in adatas)
    assert has_z, "Z not calculated for one or more adatas"

    # Combining modalities feature-wise (axis = 1). With uns_merge = 'same' 
    # only `.uns` entries that are equal in every object are carried over.
    combined_adata = ad.concat(adatas, uns_merge = 'same', 
                               axis = 1, label = 'labels')
    
    # 'train_indices' only survives the merge above when every object 
    # carries the same value for it
    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
                                                          "test splits "
                                                          "between AnnData "
                                                          "objects")

    # Conserving labels from adatas
    combined_adata.obs = adatas[0].obs.copy()

    # Creating a single dictionary with all of the groups across modalities; 
    # prefixing with the modality name keeps group names distinct
    group_dict = {}
    for name, adata in zip(names, adatas):
        for group_name, features in adata.uns['group_dict'].items():
            group_dict[f'{name}-{group_name}'] = features

    z_keys = ('Z_train', 'Z_test')

    if combination == 'concatenate':
        for key in z_keys:
            combined_adata.uns[key] = np.hstack([adata.uns[key] 
                                                 for adata in adatas])

    else:
        # Check that the dimensions of all Z's are the same (train and test)
        for key in z_keys:
            shapes = {adata.uns[key].shape for adata in adatas}
            assert len(shapes) == 1, ("Cannot sum Z matrices with different "
                                      "dimensions")

        for key in z_keys:
            combined_adata.uns[key] = np.sum([adata.uns[key] 
                                              for adata in adatas], 
                                             axis = 0)

    combined_adata.uns['group_dict'] = group_dict

    if 'seed_obj' in adatas[0].uns_keys():
        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
    else:
        print("No random seed present in adata. "
              "Recommended for reproducibility.")

    # Dropping the local reference before collecting; the caller's list 
    # itself is not modified here
    del adatas
    gc.collect()

    return combined_adata

Combines data sets for multimodal classification. Combined group names are f'{name}-{group_name}'.

Parameters
  • adatas (list[ad.AnnData]): List of AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
  • names (list[str]): List of string names for each modality, respective to each object in adatas.
  • combination (str): How to combine the matrices, either 'sum' or 'concatenate'.
Returns
  • combined_adata (ad.AnnData): AnnData object with the combined Z matrices and annotations.
def multimodal_processing( adatas: list[anndata._core.anndata.AnnData], names: list[str], tfidf: list[bool], combination: str = 'concatenate', batches: int = 10, batch_size: int = 100, verbose: bool = True) -> anndata._core.anndata.AnnData:
def multimodal_processing(adatas : list[ad.AnnData], names : list[str], 
                          tfidf: list[bool], combination: str='concatenate', 
                          batches: int=10, batch_size: int=100, 
                          verbose: bool=True) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Cells whose `sparse_var` value is 0 in any modality are removed 
    from every modality, the train/test split is re-indexed to the 
    remaining cells, each modality is optionally TF-IDF normalized and 
    passed to `calculate_z`, and the results are merged with 
    `combine_modalities`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different 
        modality. Annotations must match between objects (i.e. same 
        sample order). Each object must hold `'train_indices'` and 
        `'test_indices'` in `.uns`. The list is updated in place: 
        each element is replaced by its processed copy.

    names : list[str]
        List of string names for each modality, respective to each 
        object in `adatas`.
            
    combination : str
        How to combine the matrices, either `'sum'` or `'concatenate'`.
    
    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF normalized.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will converge
        to population distance values at the cost of scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size` 
        will be reduced to `int(num_training_cells / batches)`.

    verbose : bool
        If `True`, prints a progress message for each modality.

    Returns
    -------
    adata : ad.AnnData
        Object concatenated from the objects in `adatas`, with Z 
        matrices calculated.

    Raises
    ------
    AssertionError
        If `adatas`, `names` and `tfidf` differ in length, if the 
        objects differ in number of cells, train/test indices or 
        cell labels.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
    ...                                feature_names=gene_names, 
    ...                                scale_data=True, 
    ...                                transform_data=True,
    ...                                cell_labels=cell_labels, 
    ...                                group_dict=rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat, 
    ...                                 feature_names=peak_names, 
    ...                                 scale_data=False, 
    ...                                 cell_labels=cell_labels, 
    ...                                 group_dict=atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas, 
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings 
    # NOTE: this filter is installed process-wide and stays active after 
    # the function returns.
    warnings.filterwarnings('ignore')

    assert len(adatas) == len(names) == len(tfidf), ("adatas, names and "
                                                     "tfidf must have the "
                                                     "same length")

    # Comparing the distinct row counts; truthiness of each count would 
    # only detect empty objects, not mismatched ones.
    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn
    
    # Every modality must share the train/test split of the first one
    reference_uns = adatas[0].uns
    for split in ('train', 'test'):
        key = f'{split}_indices'
        same_split = all(np.array_equal(reference_uns[key], adata.uns[key]) 
                         for adata in adatas[1:])
        assert same_split, f"Different {split} indices"

    # One boolean array per modality flagging rows whose sparse_var value 
    # is non-zero (treated as non-empty cells)
    row_masks = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel() 
                 for adata in adatas]

    # A cell is kept only if it is flagged in every modality
    non_empty_rows = np.all(np.array(row_masks), axis = 0)

    # Marking test cells on the original row order; every other cell is 
    # treated as a training cell
    is_test = np.zeros(adatas[0].shape[0], dtype = bool)
    is_test[adatas[0].uns['test_indices']] = True

    # Re-indexing the split relative to the rows that remain after filtering
    is_test = is_test[non_empty_rows]
    train_indices = np.where(~is_test)[0]
    test_indices = np.where(is_test)[0]

    for i, adata in enumerate(adatas):
        # Filtering out empty samples. The filtered object (not the original 
        # loop variable) must be the one carried through the steps below, 
        # otherwise the new indices would refer to rows of an unfiltered 
        # matrix. Copying turns the AnnData view into a standalone object 
        # before it is modified.
        adata = adata[non_empty_rows, :].copy()
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adata = tfidf_normalize(adata)
        
        if verbose:
            print(f"Estimating sigma and calculating Z for {names[i]}", 
                  flush = True)
        adatas[i] = calculate_z(adata, n_features = 5000, batches=batches, 
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                          "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                                names=names,
                                combination=combination)

    # Dropping the local reference to the processed list before collecting
    del adatas
    gc.collect()

    return adata

Combines and processes a list of ad.AnnData objects.

Parameters
  • adatas (list[ad.AnnData]): List of ad.AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
  • names (list[str]): List of string names for each modality, respective to each object in adatas.
  • combination (str): How to combine the matrices, either 'sum' or 'concatenate'.
  • tfidf (list[bool]): If element i is True, adatas[i] will be TF-IDF normalized.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells / batches).
Returns
  • adata (ad.AnnData): Object concatenated from the objects in adatas, with Z matrices calculated.
Examples
>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
...                                feature_names=gene_names, 
...                                scale_data=True, 
...                                transform_data=True,
...                                cell_labels=cell_labels, 
...                                 group_dict=rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X=mcf7_atac_mat, 
...                                 feature_names=peak_names, 
...                                 scale_data=False, 
...                                 cell_labels=cell_labels, 
...                                 group_dict=atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas, 
...                                     names = mod_names,
...                                     tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'