scmkl.calculate_z

  1import numpy as np
  2import scipy
  3import anndata as ad
  4
  5from scmkl.tfidf_normalize import tfidf_train_test
  6from scmkl.estimate_sigma import est_group_sigma, get_batches
  7from scmkl.data_processing import process_data, get_group_mat, sample_cells
  8from scmkl.projections import gaussian_trans, laplacian_trans, cauchy_trans
  9
 10
 11def get_z_indices(m, D):
 12    """
 13    Takes the number associated with the group as `m` and returns the 
 14    indices for cos and sin functions to be applied.
 15
 16    Parameters
 17    ----------
 18    m : int
 19        The chronological number of the group being processed.
 20
 21    D : int
 22        The number of dimensions per group.
 23
 24    Returns
 25    -------
 26    cos_idx, sin_idx : np.ndarray, np.ndarray
 27        The indices for cos and sin projections in overall Z matrix.
 28    """
 29    x_idx = np.arange(m*2*D ,(m + 1)*2*D)
 30    cos_idx = x_idx[:len(x_idx)//2]
 31    sin_idx = x_idx[len(x_idx)//2:]
 32
 33    return cos_idx, sin_idx
 34
 35
 36def calc_groupz(X_train, X_test, adata, D, sigma, proj_func):
 37    """
 38    Calculates the Z matrix for grouping.
 39
 40    Parameters
 41    ----------
 42    X_train : np.ndarray
 43        The filtered data matrix to calculate train Z mat for.
 44    
 45    X_test : np.ndarray
 46        The filtered data matrix to calculate test Z mat for.
 47
 48    adata : anndata.AnnData 
 49        AnnData object containing `seed_obj` in `.uns` attribute.
 50
 51    D : int
 52        Number of dimensions per grouping.
 53
 54    sigma : float
 55        Kernel width for grouping.
 56
 57    proj_func : function
 58        The projection direction function to be applied to data.
 59
 60    Returns
 61    -------
 62    train_projections, test_projections : np.ndarray, np.ndarray
 63        Training and testing Z matrices for group.
 64    """  
 65    if scipy.sparse.issparse(X_train):
 66        X_train = X_train.toarray().astype(np.float16)
 67        X_test = X_test.toarray().astype(np.float16)
 68
 69    W = proj_func(X_train, sigma, adata.uns['seed_obj'], D)
 70    
 71    train_projection = np.matmul(X_train, W)
 72    test_projection = np.matmul(X_test, W)
 73
 74    return train_projection, test_projection
 75
 76
 77def calculate_z(adata, n_features=5000, batches=10, 
 78                batch_size=100) -> ad.AnnData:
 79    """
 80    Function to calculate Z matrices for all groups in both training 
 81    and testing data.
 82
 83    Parameters
 84    ----------
 85    adata : ad.AnnData
 86        created by `scmkl.create_adata()` with `adata.uns.keys()`: 
 87        `'train_indices'`, and `'test_indices'`. 
 88
 89    n_features : int
 90        Number of random feature to use when calculating Z; used for 
 91        scalability.
 92
 93    batches : int
 94        The number of batches to use for the distance calculation.
 95        This will average the result of `batches` distance calculations
 96        of `batch_size` randomly sampled cells. More batches will converge
 97        to population distance values at the cost of scalability.
 98
 99    batch_size : int
100        The number of cells to include per batch for distance
101        calculations. Higher batch size will converge to population
102        distance values at the cost of scalability.
103        If `batches*batch_size > num_training_cells`,
104        `batch_size` will be reduced to 
105        `int(num_training_cells / batches)`.
106
107    Returns
108    -------
109    adata : ad.AnnData
110        `adata` with Z matrices accessible with `adata.uns['Z_train']` 
111        and `adata.uns['Z_test']`.
112
113    Examples
114    --------
115    >>> adata = scmkl.estimate_sigma(adata)
116    >>> adata = scmkl.calculate_z(adata)
117    >>> adata.uns.keys()
118    dict_keys(['Z_train', 'Z_test', 'sigmas', 'train_indices', 
119    'test_indices'])
120    """
121    # Number of groupings taking from group_dict
122    n_pathway = len(adata.uns['group_dict'].keys())
123    D = adata.uns['D']
124
125    sq_i_d = np.sqrt(1/D)
126
127    # Capturing training and testing sizes
128    train_len = len(adata.uns['train_indices'])
129    test_len = len(adata.uns['test_indices'])
130
131    if batch_size * batches > len(adata.uns['train_indices']):
132        old_batch_size = batch_size
133        batch_size = int(len(adata.uns['train_indices'])/batches)
134        print("Specified batch size required too many cells for "
135                "independent batches. Reduced batch size from "
136                f"{old_batch_size} to {batch_size}")
137
138    if 'sigma' not in adata.uns.keys():
139        n_samples = np.min((2000, adata.uns['train_indices'].shape[0]))
140        sample_range = np.arange(n_samples)
141        batch_idx = get_batches(sample_range, adata.uns['seed_obj'], 
142                                batches=batches, batch_size=batch_size)
143        sigma_indices = sample_cells(adata.uns['train_indices'], n_samples, adata.uns['seed_obj'])
144
145    # Create Arrays to store concatenated group Zs
146    # Each group of features will have a corresponding entry in each array
147    n_cols = 2*adata.uns['D']*n_pathway
148    Z_train = np.zeros((train_len, n_cols))
149    Z_test = np.zeros((test_len, n_cols))
150
151
152    # Setting kernel function 
153    match adata.uns['kernel_type'].lower():
154        case 'gaussian':
155            proj_func = gaussian_trans
156        case 'laplacian':
157            proj_func = laplacian_trans
158        case 'cauchy':
159            proj_func = cauchy_trans
160
161
162    # Loop over each of the groups and creating Z for each
163    sigma_list = list()
164    for m, group_features in enumerate(adata.uns['group_dict'].values()):
165
166        n_group_features = len(group_features)
167
168        X_train, X_test = get_group_mat(adata, n_features, group_features, 
169                                        n_group_features, process_test=True)
170        
171        if adata.uns['tfidf']:
172            X_train, X_test = tfidf_train_test(X_train, X_test)
173
174        # Data filtering, and transformation according to given data_type
175        # Will remove low variance (< 1e5) features regardless of data_type
176        # If scale_data will log scale and z-score the data
177        X_train, X_test = process_data(X_train=X_train, X_test=X_test, 
178                                       scale_data=adata.uns['scale_data'], 
179                                       return_dense=True)    
180
181        # Getting sigma
182        if 'sigma' in adata.uns.keys():
183            sigma = adata.uns['sigma'][m]
184        else:
185            sigma = est_group_sigma(adata, X_train, n_group_features, 
186                                    n_features, batch_idx=batch_idx)
187            sigma_list.append(sigma)
188            
189        assert sigma > 0, "Sigma must be more than 0"
190        train_projection, test_projection = calc_groupz(X_train, X_test, 
191                                                        adata, D, sigma, 
192                                                        proj_func)
193
194        # Store group Z in whole-Z object
195        # Preserves order to be able to extract meaningful groups
196        cos_idx, sin_idx = get_z_indices(m, D)
197
198        Z_train[0:, cos_idx] = np.cos(train_projection)
199        Z_train[0:, sin_idx] = np.sin(train_projection)
200
201        Z_test[0:, cos_idx] = np.cos(test_projection)
202        Z_test[0:, sin_idx] = np.sin(test_projection)
203
204    adata.uns['Z_train'] = Z_train*sq_i_d
205    adata.uns['Z_test'] = Z_test*sq_i_d
206
207    if 'sigma' not in adata.uns.keys():
208        adata.uns['sigma'] = np.array(sigma_list)
209
210    return adata
def get_z_indices(m, D):
12def get_z_indices(m, D):
13    """
14    Takes the number associated with the group as `m` and returns the 
15    indices for cos and sin functions to be applied.
16
17    Parameters
18    ----------
19    m : int
20        The chronological number of the group being processed.
21
22    D : int
23        The number of dimensions per group.
24
25    Returns
26    -------
27    cos_idx, sin_idx : np.ndarray, np.ndarray
28        The indices for cos and sin projections in overall Z matrix.
29    """
30    x_idx = np.arange(m*2*D ,(m + 1)*2*D)
31    cos_idx = x_idx[:len(x_idx)//2]
32    sin_idx = x_idx[len(x_idx)//2:]
33
34    return cos_idx, sin_idx

Takes the number associated with the group as m and returns the indices for cos and sin functions to be applied.

Parameters
  • m (int): The chronological number of the group being processed.
  • D (int): The number of dimensions per group.
Returns
  • cos_idx, sin_idx (np.ndarray, np.ndarray): The indices for cos and sin projections in overall Z matrix.
def calc_groupz(X_train, X_test, adata, D, sigma, proj_func):
37def calc_groupz(X_train, X_test, adata, D, sigma, proj_func):
38    """
39    Calculates the Z matrix for grouping.
40
41    Parameters
42    ----------
43    X_train : np.ndarray
44        The filtered data matrix to calculate train Z mat for.
45    
46    X_test : np.ndarray
47        The filtered data matrix to calculate test Z mat for.
48
49    adata : anndata.AnnData 
50        AnnData object containing `seed_obj` in `.uns` attribute.
51
52    D : int
53        Number of dimensions per grouping.
54
55    sigma : float
56        Kernel width for grouping.
57
58    proj_func : function
59        The projection direction function to be applied to data.
60
61    Returns
62    -------
63    train_projections, test_projections : np.ndarray, np.ndarray
64        Training and testing Z matrices for group.
65    """  
66    if scipy.sparse.issparse(X_train):
67        X_train = X_train.toarray().astype(np.float16)
68        X_test = X_test.toarray().astype(np.float16)
69
70    W = proj_func(X_train, sigma, adata.uns['seed_obj'], D)
71    
72    train_projection = np.matmul(X_train, W)
73    test_projection = np.matmul(X_test, W)
74
75    return train_projection, test_projection

Calculates the Z matrix for grouping.

Parameters
  • X_train (np.ndarray): The filtered data matrix to calculate train Z mat for.
  • X_test (np.ndarray): The filtered data matrix to calculate test Z mat for.
  • adata (anndata.AnnData): AnnData object containing seed_obj in .uns attribute.
  • D (int): Number of dimensions per grouping.
  • sigma (float): Kernel width for grouping.
  • proj_func (function): The projection direction function to be applied to data.
Returns
  • train_projections, test_projections (np.ndarray, np.ndarray): Training and testing Z matrices for group.
def calculate_z( adata, n_features=5000, batches=10, batch_size=100) -> anndata._core.anndata.AnnData:
 78def calculate_z(adata, n_features=5000, batches=10, 
 79                batch_size=100) -> ad.AnnData:
 80    """
 81    Function to calculate Z matrices for all groups in both training 
 82    and testing data.
 83
 84    Parameters
 85    ----------
 86    adata : ad.AnnData
 87        created by `scmkl.create_adata()` with `adata.uns.keys()`: 
 88        `'train_indices'`, and `'test_indices'`. 
 89
 90    n_features : int
 91        Number of random feature to use when calculating Z; used for 
 92        scalability.
 93
 94    batches : int
 95        The number of batches to use for the distance calculation.
 96        This will average the result of `batches` distance calculations
 97        of `batch_size` randomly sampled cells. More batches will converge
 98        to population distance values at the cost of scalability.
 99
100    batch_size : int
101        The number of cells to include per batch for distance
102        calculations. Higher batch size will converge to population
103        distance values at the cost of scalability.
104        If `batches*batch_size > num_training_cells`,
105        `batch_size` will be reduced to 
106        `int(num_training_cells / batches)`.
107
108    Returns
109    -------
110    adata : ad.AnnData
111        `adata` with Z matrices accessible with `adata.uns['Z_train']` 
112        and `adata.uns['Z_test']`.
113
114    Examples
115    --------
116    >>> adata = scmkl.estimate_sigma(adata)
117    >>> adata = scmkl.calculate_z(adata)
118    >>> adata.uns.keys()
119    dict_keys(['Z_train', 'Z_test', 'sigmas', 'train_indices', 
120    'test_indices'])
121    """
122    # Number of groupings taking from group_dict
123    n_pathway = len(adata.uns['group_dict'].keys())
124    D = adata.uns['D']
125
126    sq_i_d = np.sqrt(1/D)
127
128    # Capturing training and testing sizes
129    train_len = len(adata.uns['train_indices'])
130    test_len = len(adata.uns['test_indices'])
131
132    if batch_size * batches > len(adata.uns['train_indices']):
133        old_batch_size = batch_size
134        batch_size = int(len(adata.uns['train_indices'])/batches)
135        print("Specified batch size required too many cells for "
136                "independent batches. Reduced batch size from "
137                f"{old_batch_size} to {batch_size}")
138
139    if 'sigma' not in adata.uns.keys():
140        n_samples = np.min((2000, adata.uns['train_indices'].shape[0]))
141        sample_range = np.arange(n_samples)
142        batch_idx = get_batches(sample_range, adata.uns['seed_obj'], 
143                                batches=batches, batch_size=batch_size)
144        sigma_indices = sample_cells(adata.uns['train_indices'], n_samples, adata.uns['seed_obj'])
145
146    # Create Arrays to store concatenated group Zs
147    # Each group of features will have a corresponding entry in each array
148    n_cols = 2*adata.uns['D']*n_pathway
149    Z_train = np.zeros((train_len, n_cols))
150    Z_test = np.zeros((test_len, n_cols))
151
152
153    # Setting kernel function 
154    match adata.uns['kernel_type'].lower():
155        case 'gaussian':
156            proj_func = gaussian_trans
157        case 'laplacian':
158            proj_func = laplacian_trans
159        case 'cauchy':
160            proj_func = cauchy_trans
161
162
163    # Loop over each of the groups and creating Z for each
164    sigma_list = list()
165    for m, group_features in enumerate(adata.uns['group_dict'].values()):
166
167        n_group_features = len(group_features)
168
169        X_train, X_test = get_group_mat(adata, n_features, group_features, 
170                                        n_group_features, process_test=True)
171        
172        if adata.uns['tfidf']:
173            X_train, X_test = tfidf_train_test(X_train, X_test)
174
175        # Data filtering, and transformation according to given data_type
176        # Will remove low variance (< 1e5) features regardless of data_type
177        # If scale_data will log scale and z-score the data
178        X_train, X_test = process_data(X_train=X_train, X_test=X_test, 
179                                       scale_data=adata.uns['scale_data'], 
180                                       return_dense=True)    
181
182        # Getting sigma
183        if 'sigma' in adata.uns.keys():
184            sigma = adata.uns['sigma'][m]
185        else:
186            sigma = est_group_sigma(adata, X_train, n_group_features, 
187                                    n_features, batch_idx=batch_idx)
188            sigma_list.append(sigma)
189            
190        assert sigma > 0, "Sigma must be more than 0"
191        train_projection, test_projection = calc_groupz(X_train, X_test, 
192                                                        adata, D, sigma, 
193                                                        proj_func)
194
195        # Store group Z in whole-Z object
196        # Preserves order to be able to extract meaningful groups
197        cos_idx, sin_idx = get_z_indices(m, D)
198
199        Z_train[0:, cos_idx] = np.cos(train_projection)
200        Z_train[0:, sin_idx] = np.sin(train_projection)
201
202        Z_test[0:, cos_idx] = np.cos(test_projection)
203        Z_test[0:, sin_idx] = np.sin(test_projection)
204
205    adata.uns['Z_train'] = Z_train*sq_i_d
206    adata.uns['Z_test'] = Z_test*sq_i_d
207
208    if 'sigma' not in adata.uns.keys():
209        adata.uns['sigma'] = np.array(sigma_list)
210
211    return adata

Function to calculate Z matrices for all groups in both training and testing data.

Parameters
  • adata (ad.AnnData): created by scmkl.create_adata with adata.uns.keys(): 'train_indices', and 'test_indices'.
  • n_features (int): Number of random feature to use when calculating Z; used for scalability.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells / batches).
Returns
  • adata (ad.AnnData): adata with Z matrices accessible with adata.uns['Z_train'] and adata.uns['Z_test'].
Examples
>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata.uns.keys()
dict_keys(['Z_train', 'Z_test', 'sigmas', 'train_indices', 
'test_indices'])