scmkl.multimodal_processing

  1import numpy as np
  2import anndata as ad
  3import gc
  4
  5from scmkl.tfidf_normalize import tfidf_normalize
  6from scmkl.estimate_sigma import estimate_sigma
  7from scmkl.calculate_z import calculate_z, _sparse_var
  8
  9
 10def _combine_modalities(adatas : list, names : list, 
 11                        combination = 'concatenate'):
 12    '''
 13    Combines data sets for multimodal classification. Combined group 
 14    names are assay+group_name.
 15
 16    Parameters
 17    ----------
 18    adatas : a list of AnnData objects where each object is a different 
 19             modality
 20
 21    names : a list of strings names for each modality repective to each 
 22            object in adatas
 23            
 24    combination: How to combine the matrices, either sum or concatenate
 25    
 26    Returns
 27    -------
 28    combined_adata : Adata object with the combined Z matrices and 
 29                     annotations. Annotations must match.
 30    '''
 31    assert len({adata.shape[0] for adata in adatas}) == 1, ("All adatas must "
 32                                                            "have the same "
 33                                                            "number of rows")
 34    assert len(np.unique(names)) == len(names), 'Assay names must be distinct'
 35    assert combination.lower() in ['sum', 'concatenate']
 36
 37    z_train = all(['Z_train' in adata.uns.keys() for adata in adatas])
 38    z_test = all(['Z_test' in adata.uns.keys() for adata in adatas])
 39
 40    assert all([z_train, z_test]), "Z not calculated for one or more adatas"
 41
 42    # Combining modalities
 43    combined_adata = ad.concat(adatas, uns_merge = 'same', 
 44                               axis = 1, label = 'labels')
 45
 46    assert 'train_indices' in combined_adata.uns.keys(), ("Different train "
 47                                                          "test splits "
 48                                                          "between AnnData "
 49                                                          "objects")
 50
 51    # Conserving labels from adatas
 52    combined_adata.obs = adatas[0].obs
 53
 54    # Creating a single dictionary with all of the groups across modalities 
 55    group_dict = {}
 56    for name, adata in zip(names, adatas):
 57        for group_name, features in adata.uns['group_dict'].items():
 58            group_dict[f'{name}-{group_name}'] = features
 59
 60    if combination == 'concatenate':
 61        combined_adata.uns['Z_train'] = np.hstack([adata.uns['Z_train'] 
 62                                                   for adata in adatas])
 63        combined_adata.uns['Z_test'] = np.hstack([adata.uns['Z_test'] 
 64                                                  for adata in adatas])
 65
 66
 67    elif combination == 'sum':
 68
 69        #Check that the dimensions of all Z's are the same
 70        dims = [adata.uns['Z_train'].shape for adata in adatas]
 71        dims = all([dim == dims[0] for dim in dims])
 72        assert dims, 'Cannot sum Z matrices with different dimensions'
 73        
 74        combined_adata.uns['Z_train'] = np.sum([adata.uns['Z_train'] 
 75                                                for adata in adatas], 
 76                                                axis = 0)
 77        combined_adata.uns['Z_test'] = np.sum([adata.uns['Z_test'] 
 78                                               for adata in adatas], 
 79                                               axis = 0)
 80
 81
 82    combined_adata.uns['group_dict'] = group_dict
 83
 84    if 'seed_obj' in adatas[0].uns_keys():
 85        combined_adata.uns['seed_obj'] = adatas[0].uns['seed_obj']
 86    else:
 87        print("No random seed present in adata"
 88              "Recommended for reproducibility.")
 89
 90    del adatas
 91    gc.collect()
 92
 93    return combined_adata
 94
 95
 96def multimodal_processing(adatas : list, names : list, tfidf: list):
 97    '''
 98    Combines and processes a list of adata objects.
 99
100    Parameters
101    ----------
102    **adatas** : *list[AnnData]* 
103        > List of AnnData objects where each object is a different 
104        modality for the same cells.
105
106    **names** : *list[str]*
107        > List of string names for each modality repective to each 
108        object in `adatas`.
109    
110    **tfidf** : *bool* 
111        > List where if element i is `True`, adata[i] will be TFIDF 
112        normalized.
113
114    Returns
115    -------
116    **adata** : *AnnData* 
117        > Concatenated from objects from `adatas` with Z matrices 
118        calculated.
119
120    Examples
121    --------
122    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
123    ...                                feature_names = gene_names, 
124    ...                                scale_data = True, 
125    ...                                cell_labels = cell_labels, 
126    ...                                 group_dict = rna_grouping)
127    >>>
128    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat, 
129    ...                                 feature_names = peak_names, 
130    ...                                 scale_data = False, 
131    ...                                 cell_labels = cell_labels, 
132    ...                                 group_dict = atac_grouping)
133    >>>
134    >>> adatas = [rna_adata, atac_adata]
135    >>> mod_names = ['rna', 'atac']
136    >>> adata = scmkl.multimodal_processing(adatas = adatas, 
137    ...                                     names = mod_names,
138    ...                                     tfidf = [False, True])
139    >>>
140    >>> adata
141    AnnData object with n_obs × n_vars = 1000 × 12676
142    obs: 'labels'
143    var: 'labels'
144    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
145    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
146    '''
147    import warnings 
148    warnings.filterwarnings('ignore')
149
150    assert all([adata.shape[0] for adata in adatas]), ("Different number of "
151                                                       "cells present in "
152                                                       "each object")
153    
154    # True if all train indices match
155    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'], 
156                                        adatas[i].uns['train_indices']) 
157                         for i in range(1, len(adatas))])
158
159    # True if all test indices match
160    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'], 
161                                       adatas[i].uns['test_indices']) 
162                        for i in range(1, len(adatas))])
163
164    assert same_train, 'Different train indices'
165    assert same_test, 'Different test indices'
166
167    # Creates a boolean array for each modality of cells with non-empty rows
168    non_empty_rows = [np.array(_sparse_var(adata.X, axis = 1) != 0).ravel() 
169                      for adata in adatas]
170
171    # Returns a 1d array where sample feature sums
172    # across all modalities are more than 0
173    non_empty_rows = np.logical_and(*non_empty_rows).squeeze()
174
175    # Initializing final train test split array
176    train_test = np.repeat('train', adatas[0].shape[0])
177    train_test[adatas[0].uns['test_indices']] = 'test'
178
179    # Capturing train test split with empty rows filtered out
180    train_test = train_test[non_empty_rows]
181    train_indices = np.where(train_test == 'train')[0]
182    test_indices = np.where(train_test == 'test')[0]
183
184    # Adding train test split arrays to AnnData objects 
185    # and filtering out empty samples
186    for i, adata in enumerate(adatas):
187        adatas[i].uns['train_indices'] = train_indices
188        adatas[i].uns['test_indices'] = test_indices
189        adatas[i] = adata[non_empty_rows, :]
190        # tfidf normalizing if corresponding element in tfidf is True
191        if tfidf[i]:
192            adatas[i] = tfidf_normalize(adata)
193
194        if 'Z_train' not in adatas[i].uns.keys():
195            # AnnData update must be pointing at the object in list
196            print(f'Estimating Sigma for {names[i]}', flush = True)
197            adatas[i] = estimate_sigma(adata, n_features= 200)
198            print(f'Calculating Z for {names[i]}', flush = True)
199            adatas[i] = calculate_z(adata, n_features = 5000)
200
201    if 'labels' in adatas[0].obs:
202        all_labels = [adata.obs['labels'] for adata in adatas]
203        # Ensuring cell labels for each AnnData object are the same
204        for i in range(1, len(all_labels)):
205            same_labels = np.all(all_labels[0] == all_labels[i])
206            assert same_labels, (f"Cell labels between AnnData object in "
207                                 f"position 0 and position {i} in adatas do "
208                                 "not match")
209
210    adata = _combine_modalities(adatas = adatas,
211                                names = names,
212                                combination = 'concatenate')
213
214    del adatas
215    gc.collect()
216
217    return adata    
def multimodal_processing(adatas : list, names : list, tfidf: list):
    '''
    Combines and processes a list of adata objects.

    Cells (rows) that are empty in any modality are removed from every 
    modality, the train/test indices are re-based onto the remaining 
    rows, and Z matrices are calculated for objects that do not have 
    them yet before the modalities are concatenated. The elements of 
    `adatas` are replaced in place by the processed objects.

    Parameters
    ----------
    **adatas** : *list[AnnData]* 
        > List of AnnData objects where each object is a different 
        modality for the same cells.

    **names** : *list[str]*
        > List of string names for each modality respective to each 
        object in `adatas`.
    
    **tfidf** : *list[bool]* 
        > List where if element i is `True`, adatas[i] will be TFIDF 
        normalized.

    Returns
    -------
    **adata** : *AnnData* 
        > Concatenated from the objects in `adatas` with Z matrices 
        calculated.

    Raises
    ------
    **AssertionError**
        > If the argument lists differ in length, the objects differ in 
        number of cells, train/test indices, or cell labels.

    Examples
    --------
    >>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
    ...                                feature_names = gene_names, 
    ...                                scale_data = True, 
    ...                                cell_labels = cell_labels, 
    ...                                group_dict = rna_grouping)
    >>>
    >>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat, 
    ...                                 feature_names = peak_names, 
    ...                                 scale_data = False, 
    ...                                 cell_labels = cell_labels, 
    ...                                 group_dict = atac_grouping)
    >>>
    >>> adatas = [rna_adata, atac_adata]
    >>> mod_names = ['rna', 'atac']
    >>> adata = scmkl.multimodal_processing(adatas = adatas, 
    ...                                     names = mod_names,
    ...                                     tfidf = [False, True])
    >>>
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 12676
    obs: 'labels'
    var: 'labels'
    uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
    'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'
    '''
    import warnings 
    # NOTE(review): this installs a process-wide 'ignore' filter that 
    # stays active after the function returns
    warnings.filterwarnings('ignore')

    assert len(adatas) == len(names) == len(tfidf), ("adatas, names and "
                                                     "tfidf must all have "
                                                     "the same length")

    # Compare the row counts with each other; all() over the raw counts 
    # would only test that each count is non-zero
    assert len({adata.shape[0] for adata in adatas}) == 1, ("Different "
                                                            "number of "
                                                            "cells present "
                                                            "in each object")
    
    # True if all train indices match
    same_train = np.all([np.array_equal(adatas[0].uns['train_indices'], 
                                        adatas[i].uns['train_indices']) 
                         for i in range(1, len(adatas))])

    # True if all test indices match
    same_test = np.all([np.array_equal(adatas[0].uns['test_indices'], 
                                       adatas[i].uns['test_indices']) 
                        for i in range(1, len(adatas))])

    assert same_train, 'Different train indices'
    assert same_test, 'Different test indices'

    # Creates a boolean array for each modality of cells with non-empty rows
    non_empty_rows = [np.array(_sparse_var(adata.X, axis = 1) != 0).ravel() 
                      for adata in adatas]

    # 1d mask of cells that are non-empty in every modality. reduce() 
    # handles any number of modalities, whereas unpacking into 
    # np.logical_and only works for exactly two
    non_empty_rows = np.logical_and.reduce(non_empty_rows)

    # Initializing final train test split array
    train_test = np.repeat('train', adatas[0].shape[0])
    train_test[adatas[0].uns['test_indices']] = 'test'

    # Capturing train test split with empty rows filtered out
    train_test = train_test[non_empty_rows]
    train_indices = np.where(train_test == 'train')[0]
    test_indices = np.where(train_test == 'test')[0]

    # Filtering out empty samples and adding the re-based train test 
    # split arrays to each AnnData object
    for i, use_tfidf in enumerate(tfidf):
        # Every later step must run on the filtered object: the new 
        # indices refer to row positions after filtering
        filtered = adatas[i][non_empty_rows, :].copy()
        filtered.uns['train_indices'] = train_indices
        filtered.uns['test_indices'] = test_indices

        # tfidf normalizing if corresponding element in tfidf is True
        if use_tfidf:
            filtered = tfidf_normalize(filtered)

        # NOTE(review): a Z matrix already present is reused as is; if 
        # rows were filtered out above it may no longer match - confirm
        if 'Z_train' not in filtered.uns.keys():
            print(f'Estimating Sigma for {names[i]}', flush = True)
            filtered = estimate_sigma(filtered, n_features= 200)
            print(f'Calculating Z for {names[i]}', flush = True)
            filtered = calculate_z(filtered, n_features = 5000)

        # AnnData update must be pointing at the object in list
        adatas[i] = filtered

    if 'labels' in adatas[0].obs:
        # Compared by position as plain arrays so differing obs indices 
        # between modalities do not make the comparison itself fail
        all_labels = [np.asarray(adata.obs['labels']) for adata in adatas]
        # Ensuring cell labels for each AnnData object are the same
        for i in range(1, len(all_labels)):
            same_labels = np.array_equal(all_labels[0], all_labels[i])
            assert same_labels, (f"Cell labels between AnnData object in "
                                 f"position 0 and position {i} in adatas do "
                                 "not match")

    adata = _combine_modalities(adatas = adatas,
                                names = names,
                                combination = 'concatenate')

    del adatas
    gc.collect()

    return adata

Combines and processes a list of adata objects.

Parameters

adatas : list[AnnData]

List of AnnData objects where each object is a different modality for the same cells.

names : list[str]

List of string names for each modality, respective to each object in adatas.

tfidf : list[bool]

List of booleans; if element i is True, adatas[i] will be TFIDF normalized.

Returns

adata : AnnData

AnnData object concatenated from the objects in adatas, with Z matrices calculated.

Examples

>>> rna_adata = scmkl.create_adata(X = mcf7_rna_mat, 
...                                feature_names = gene_names, 
...                                scale_data = True, 
...                                cell_labels = cell_labels, 
...                                 group_dict = rna_grouping)
>>>
>>> atac_adata = scmkl.create_adata(X = mcf7_atac_mat, 
...                                 feature_names = peak_names, 
...                                 scale_data = False, 
...                                 cell_labels = cell_labels, 
...                                 group_dict = atac_grouping)
>>>
>>> adatas = [rna_adata, atac_adata]
>>> mod_names = ['rna', 'atac']
>>> adata = scmkl.multimodal_processing(adatas = adatas, 
...                                     names = mod_names,
...                                     tfidf = [False, True])
>>>
>>> adata
AnnData object with n_obs × n_vars = 1000 × 12676
obs: 'labels'
var: 'labels'
uns: 'D', 'kernel_type', 'distance_metric', 'train_indices',  
'test_indices', 'Z_train', 'Z_test', 'group_dict', 'seed_obj'