scmkl.multimodal_processing

  1import numpy as np
  2import anndata as ad
  3import gc
  4
  5from scmkl.tfidf_normalize import tfidf_normalize
  6from scmkl.data_processing import sparse_var
  7from scmkl.calculate_z import calculate_z
  8
  9
 10def combine_modalities(adatas: list[ad.AnnData], names: list[str], 
 11                        combination: str = 'concatenate'):
 12    """
 13    Combines data sets for multimodal classification. Combined group 
 14    names are `f'{assay}+{group_name}'`.
 15
 16    Parameters
 17    ----------
 18    adatas : list[ad.AnnData]
 19        List of AnnData objects where each object is a different 
 20        modality. Annotations must match between objects (i.e. same 
 21        sample order).
 22
 23    names : list[str]
 24        List of strings names for each modality repective to each 
 25        object in adatas.
 26            
 27    combination: str
 28        How to combine the matrices, either `'sum'` or `'concatenate'`.
 29    
 30    Returns
 31    -------
 32    combined_adata : ad.Anndata
 33        Adata object with the combined Z matrices and annotations. 
 34    """
 35    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
 36                                                            "have the same "
 37                                                            "number of rows")
 38    assert len(np.unique(names)) == len(names), "Assay names must be distinct"
 39    assert combination.lower() in ['sum', 'concatenate']
 40
 41    z_train = all(['Z_train' in adata.uns.keys() for adata in adatas])
 42    z_test = all(['Z_test' in adata.uns.keys() for adata in adatas])
 43
 44    assert all([z_train, z_test]), "Z not calculated for one or more adatas"
 45
 46    # Combining modalities
 47    combined_adata = ad.concat(adatas, uns_merge = 'same', 
 48                               axis = 1, label = 'labels')
 49    
 50    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
 51                                                          "test splits "
 52                                                          "between AnnData "
 53                                                          "objects")
 54
 55    # Conserving labels from adatas
 56    combined_adata.obs = adatas[0].obs.copy()
 57
 58    # Creating a single dictionary with all of the groups across modalities 
 59    group_dict = {}
 60    for name, adata in zip(names, adatas):
 61        for group_name, features in adata.uns['group_dict'].items():
 62            group_dict[f'{name}-{group_name}'] = features
 63
 64    if combination == 'concatenate':
 65        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train'] 
 66                                                   for adata in adatas])
 67        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test'] 
 68                                                  for adata in adatas])
 69
 70
 71    elif combination == 'sum':
 72
 73        #Check that the dimensions of all Z's are the same
 74        dims = [adata.uns['Z_train'].shape for adata in adatas]
 75        dims = all([dim == dims[0] for dim in dims])
 76        assert dims, "Cannot sum Z matrices with different dimensions"
 77        
 78        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train'] 
 79                                                for adata in adatas], 
 80                                                axis = 0)
 81        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test'] 
 82                                               for adata in adatas], 
 83                                               axis = 0)
 84
 85
 86    combined_adata.uns['group_dict'] = group_dict
 87
 88    if 'seed_obj' in adatas[0].uns_keys():
 89        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
 90    else:
 91        print("No random seed present in adata"
 92              "Recommended for reproducibility.")
 93
 94    del adatas
 95    gc.collect()
 96
 97    return combined_adata
 98
 99
def multimodal_processing(adatas : list[ad.AnnData], names : list[str],
                          tfidf: list[bool], combination: str='concatenate',
                          batches: int=10, batch_size: int=100) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Rows that are flagged as empty in any modality are removed from
    every modality, the train/test indices are recomputed for the
    remaining rows, each modality is optionally TF-IDF normalized and
    has its Z matrices calculated, and the results are merged with
    `combine_modalities`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order). Each object must hold `'train_indices'` and
        `'test_indices'` in `.uns`. The list is modified in place: each
        element is replaced by its processed counterpart.

    names : list[str]
        List of string names for each modality, respective to each
        object in `adatas`.

    combination: str
        How to combine the matrices, either `'sum'` or `'concatenate'`.

    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF normalized.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will converge
        to population distance values at the cost of scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size`
        will be reduced to `int(num_training_cells / batches)`.

    Returns
    -------
    adata : ad.AnnData
        Object concatenated from the objects in `adatas`, with Z
        matrices calculated.

    Raises
    ------
    AssertionError
        If `names` or `tfidf` differ in length from `adatas`, the
        objects have different numbers of cells, the train/test indices
        differ between objects, or the `'labels'` columns do not match.

    Notes
    -----
    Calling this function installs a process-wide `'ignore'` warnings
    filter via `warnings.filterwarnings`.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names = gene_names,
    ...                                scale_data = True,
    ...                                cell_labels = cell_labels,
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
    ...                                 feature_names = peak_names,
    ...                                 scale_data = False,
    ...                                 cell_labels = cell_labels,
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings
    warnings.filterwarnings('ignore')

    n_mods = len(adatas)
    assert len(names) == n_mods, "One name is required per AnnData object"
    assert len(tfidf) == n_mods, ("One tfidf flag is required per AnnData "
                                  "object")

    # Row counts must be equal, not merely non-zero
    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn

    # True if all train indices match
    same_train = all(np.array_equal(adatas[0].uns['train_indices'],
                                    adata.uns['train_indices'])
                     for adata in adatas[1:])

    # True if all test indices match
    same_test = all(np.array_equal(adatas[0].uns['test_indices'],
                                   adata.uns['test_indices'])
                    for adata in adatas[1:])

    assert same_train, "Different train indices"
    assert same_test, "Different test indices"

    # One boolean mask per modality: True where `sparse_var` along
    # axis 1 is non-zero for that row
    non_empty_rows = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel()
                      for adata in adatas]

    # Keep rows flagged in every modality. ufunc.reduce handles any
    # number of masks; np.logical_and(*masks) would treat a third mask
    # as the `out` argument.
    non_empty_rows = np.logical_and.reduce(non_empty_rows)

    # Marking test rows in the original row order; every other row is
    # treated as training
    is_test = np.zeros(adatas[0].shape[0], dtype=bool)
    is_test[adatas[0].uns['test_indices']] = True

    # Capturing train test split with empty rows filtered out, so the
    # indices are positions within the filtered objects
    is_test = is_test[non_empty_rows]
    train_indices = np.where(~is_test)[0]
    test_indices = np.where(is_test)[0]

    for i, adata in enumerate(adatas):
        # Materialise the filtered subset so downstream steps operate
        # on an independent object rather than a view of the input
        adata = adata[non_empty_rows, :].copy()
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adata = tfidf_normalize(adata)

        print(f"Estimating sigma and calculating Z for {names[i]}", flush = True)
        # Each step consumes the output of the previous one so the row
        # filter and normalization are not discarded
        adatas[i] = calculate_z(adata, n_features = 5000, batches=batches,
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                          "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                               names=names,
                               combination=combination)

    # Drop the local reference and prompt collection of temporaries
    del adatas
    gc.collect()

    return adata
def combine_modalities( adatas: list[anndata._core.anndata.AnnData], names: list[str], combination: str = 'concatenate'):
def combine_modalities(adatas: list[ad.AnnData], names: list[str],
                       combination: str = 'concatenate') -> ad.AnnData:
    """
    Combines data sets for multimodal classification. Combined group
    names are `f'{name}-{group_name}'`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of AnnData objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order). Each object must already hold `'Z_train'`,
        `'Z_test'` and `'group_dict'` in `.uns`.

    names : list[str]
        List of distinct string names for each modality, respective to
        each object in `adatas`.

    combination : str
        How to combine the Z matrices, either `'sum'` or
        `'concatenate'` (case-insensitive).

    Returns
    -------
    combined_adata : ad.AnnData
        AnnData object with the combined Z matrices and annotations.
        `.obs` is copied from `adatas[0]`.

    Raises
    ------
    AssertionError
        If the inputs have different numbers of rows, `names` does not
        match `adatas` in length or is not unique, `combination` is not
        recognised, Z matrices are missing, the train/test splits
        differ, or (for `'sum'`) the Z matrices differ in shape.
    """
    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
                                                            "have the same "
                                                            "number of rows")
    # zip() below would silently drop modalities on a length mismatch
    assert len(names) == len(adatas), "One assay name is required per adata"
    assert len(set(names)) == len(names), "Assay names must be distinct"

    # Normalise once so validation and branching agree on the same value
    combination = combination.lower()
    assert combination in ('sum', 'concatenate'), ("combination must be "
                                                   "'sum' or 'concatenate'")

    z_keys = ('Z_train', 'Z_test')
    has_z = all(key in adata.uns for adata in adatas for key in z_keys)
    assert has_z, "Z not calculated for one or more adatas"

    # Combining modalities feature-wise. uns_merge = 'same' only keeps
    # .uns entries that are identical across every object, so a missing
    # 'train_indices' afterwards means the splits were not shared.
    combined_adata = ad.concat(adatas, uns_merge='same',
                               axis=1, label='labels')

    assert 'train_indices' in combined_adata.uns, ("Different train "
                                                   "test splits "
                                                   "between AnnData "
                                                   "objects")

    # Conserving labels from adatas
    combined_adata.obs = adatas[0].obs.copy()

    # Creating a single dictionary with all of the groups across modalities
    group_dict = {}
    for name, adata in zip(names, adatas):
        for group_name, features in adata.uns['group_dict'].items():
            group_dict[f'{name}-{group_name}'] = features

    if combination == 'concatenate':
        for key in z_keys:
            combined_adata.uns[key] = np.hstack([adata.uns[key]
                                                 for adata in adatas])
    else:
        # Element-wise sum requires identical shapes for train AND test
        for key in z_keys:
            shapes = {adata.uns[key].shape for adata in adatas}
            assert len(shapes) == 1, ("Cannot sum Z matrices with "
                                      "different dimensions")

        for key in z_keys:
            combined_adata.uns[key] = np.sum([adata.uns[key]
                                              for adata in adatas],
                                             axis=0)

    combined_adata.uns['group_dict'] = group_dict

    if 'seed_obj' in adatas[0].uns:
        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
    else:
        print("No random seed present in adata. "
              "Recommended for reproducibility.")

    # Drop the local reference and prompt collection of temporaries
    del adatas
    gc.collect()

    return combined_adata

Combines data sets for multimodal classification. Combined group names are f'{name}-{group_name}', where name is the modality's entry in names.

Parameters
  • adatas (list[ad.AnnData]): List of AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
  • names (list[str]): List of string names for each modality, respective to each object in adatas.
  • combination (str): How to combine the matrices, either 'sum' or 'concatenate'.
Returns
  • combined_adata (ad.AnnData): AnnData object with the combined Z matrices and annotations.
def multimodal_processing( adatas: list[anndata._core.anndata.AnnData], names: list[str], tfidf: list[bool], combination: str = 'concatenate', batches: int = 10, batch_size: int = 100) -> anndata._core.anndata.AnnData:
def multimodal_processing(adatas : list[ad.AnnData], names : list[str],
                          tfidf: list[bool], combination: str='concatenate',
                          batches: int=10, batch_size: int=100) -> ad.AnnData:
    """
    Combines and processes a list of `ad.AnnData` objects.

    Rows that are flagged as empty in any modality are removed from
    every modality, the train/test indices are recomputed for the
    remaining rows, each modality is optionally TF-IDF normalized and
    has its Z matrices calculated, and the results are merged with
    `combine_modalities`.

    Parameters
    ----------
    adatas : list[ad.AnnData]
        List of `ad.AnnData` objects where each object is a different
        modality. Annotations must match between objects (i.e. same
        sample order). Each object must hold `'train_indices'` and
        `'test_indices'` in `.uns`. The list is modified in place: each
        element is replaced by its processed counterpart.

    names : list[str]
        List of string names for each modality, respective to each
        object in `adatas`.

    combination: str
        How to combine the matrices, either `'sum'` or `'concatenate'`.

    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF normalized.

    batches : int
        The number of batches to use for the distance calculation.
        This will average the result of `batches` distance calculations
        of `batch_size` randomly sampled cells. More batches will converge
        to population distance values at the cost of scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`, `batch_size`
        will be reduced to `int(num_training_cells / batches)`.

    Returns
    -------
    adata : ad.AnnData
        Object concatenated from the objects in `adatas`, with Z
        matrices calculated.

    Raises
    ------
    AssertionError
        If `names` or `tfidf` differ in length from `adatas`, the
        objects have different numbers of cells, the train/test indices
        differ between objects, or the `'labels'` columns do not match.

    Notes
    -----
    Calling this function installs a process-wide `'ignore'` warnings
    filter via `warnings.filterwarnings`.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat,
    ...                                feature_names = gene_names,
    ...                                scale_data = True,
    ...                                cell_labels = cell_labels,
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat,
    ...                                 feature_names = peak_names,
    ...                                 scale_data = False,
    ...                                 cell_labels = cell_labels,
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas,
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    """
    import warnings
    warnings.filterwarnings('ignore')

    n_mods = len(adatas)
    assert len(names) == n_mods, "One name is required per AnnData object"
    assert len(tfidf) == n_mods, ("One tfidf flag is required per AnnData "
                                  "object")

    # Row counts must be equal, not merely non-zero
    diff_num_warn = "Different number of cells present in each object."
    assert len({adata.shape[0] for adata in adatas}) == 1, diff_num_warn

    # True if all train indices match
    same_train = all(np.array_equal(adatas[0].uns['train_indices'],
                                    adata.uns['train_indices'])
                     for adata in adatas[1:])

    # True if all test indices match
    same_test = all(np.array_equal(adatas[0].uns['test_indices'],
                                   adata.uns['test_indices'])
                    for adata in adatas[1:])

    assert same_train, "Different train indices"
    assert same_test, "Different test indices"

    # One boolean mask per modality: True where `sparse_var` along
    # axis 1 is non-zero for that row
    non_empty_rows = [np.array(sparse_var(adata.X, axis = 1) != 0).ravel()
                      for adata in adatas]

    # Keep rows flagged in every modality. ufunc.reduce handles any
    # number of masks; np.logical_and(*masks) would treat a third mask
    # as the `out` argument.
    non_empty_rows = np.logical_and.reduce(non_empty_rows)

    # Marking test rows in the original row order; every other row is
    # treated as training
    is_test = np.zeros(adatas[0].shape[0], dtype=bool)
    is_test[adatas[0].uns['test_indices']] = True

    # Capturing train test split with empty rows filtered out, so the
    # indices are positions within the filtered objects
    is_test = is_test[non_empty_rows]
    train_indices = np.where(~is_test)[0]
    test_indices = np.where(is_test)[0]

    for i, adata in enumerate(adatas):
        # Materialise the filtered subset so downstream steps operate
        # on an independent object rather than a view of the input
        adata = adata[non_empty_rows, :].copy()
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if tfidf[i]:
            adata = tfidf_normalize(adata)

        print(f"Estimating sigma and calculating Z for {names[i]}", flush = True)
        # Each step consumes the output of the previous one so the row
        # filter and normalization are not discarded
        adatas[i] = calculate_z(adata, n_features = 5000, batches=batches,
                                batch_size=batch_size)

    if 'labels' in adatas[0].obs:
        all_labels = [adata.obs['labels'] for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        uneq_labs_warn = ("Cell labels between AnnData object in position 0 "
                          "and position {} in adatas do not match")
        for i in range(1, len(all_labels)):
            same_labels = np.all(all_labels[0] == all_labels[i])
            assert same_labels, uneq_labs_warn.format(i)

    adata = combine_modalities(adatas=adatas,
                               names=names,
                               combination=combination)

    # Drop the local reference and prompt collection of temporaries
    del adatas
    gc.collect()

    return adata

Combines and processes a list of ad.AnnData objects.

Parameters
  • adatas (list[ad.AnnData]): List of ad.AnnData objects where each object is a different modality. Annotations must match between objects (i.e. same sample order).
  • names (list[str]): List of string names for each modality, respective to each object in adatas.
  • combination (str): How to combine the matrices, either 'sum' or 'concatenate'.
  • tfidf (list[bool]): If element i is True, adatas[i] will be TF-IDF normalized.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells / batches).
Returns
  • adata (ad.AnnData): Object concatenated from the objects in adatas, with Z matrices calculated.
Examples
>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
...                                feature_names = gene_names, 
...                                scale_data = True, 
...                                cell_labels = cell_labels, 
...                                 group_dict = rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat, 
...                                 feature_names = peak_names, 
...                                 scale_data = False, 
...                                 cell_labels = cell_labels, 
...                                 group_dict = atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas, 
...                                     names = mod_names,
...                                     tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'