scmkl.tfidf_normalize

  1import numpy as np
  2import scipy
  3
  4
  5def _tfidf(X, mode = 'filter'):
  6    '''
  7    Function to use Term Frequency Inverse Document Frequency 
  8    filtering for atac data to find meaningful features. If input is 
  9    pandas data frame or scipy sparse array, it will be converted to a 
 10    numpy array.
 11    
 12    Parameters
 13    ----------
 14    x : Data matrix of cell x feature.  Must be a Numpy array or Scipy 
 15        sparse array.
 16    mode : Argument to determine what to return.  Must be filter or 
 17           normalize
 18    
 19    Returns
 20    -------
 21    TFIDF : Output depends on given 'mode' parameter
 22            'filter' : returns which column sums are non 0 i.e. which 
 23                       features are significant
 24            'normalize' : returns TFIDF filtered data matrix of the 
 25                          same dimensions as x. Returns as scipy 
 26                          sparse matrix
 27    '''
 28    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
 29                                             "'normalize'.")
 30    
 31    if scipy.sparse.issparse(X):
 32        tf = scipy.sparse.csc_array(X)
 33        doc_freq = np.array(np.sum(X > 0, axis=0)).flatten()
 34    else:
 35        tf = X
 36        doc_freq = np.sum(X > 0, axis=0)
 37
 38    idf = np.log1p((1 + X.shape[0]) / (1 + doc_freq))
 39    tfidf = tf * idf
 40
 41    if mode == 'normalize':
 42        if scipy.sparse.issparse(tfidf):
 43            tfidf = scipy.sparse.csc_matrix(tfidf)
 44        return tfidf
 45    elif mode == 'filter':
 46        significant_features = np.where(np.sum(tfidf, axis=0) > 0)[0]
 47        return significant_features
 48        
 49
 50def tfidf_normalize(adata, binarize = False):
 51    '''
 52    Function to TFIDF normalize the data in an adata object. If any 
 53    rows are entirely 0, that row and its metadata will be removed from
 54    the object.
 55
 56    Parameters
 57    ----------
 58    **adata** : *AnnData* 
 59        > `adata.X` to be normalized. If `'train_indices'` and 
 60        `'test_indices'` in `'adata.uns.keys()'`, normalization will be
 61        done separately for the training and testing data. Otherwise, 
 62        it will calculate it on the entire dataset.
 63
 64    **binarize** : *bool* 
 65        > If `True`, all values in `adata.X` greater than 1 will become 
 66        1.
 67
 68    Returns
 69    -------
 70    **adata** : *AnnData* 
 71        > adata with adata.X TFIDF normalized. Will now have the train 
 72        data stacked on test data, and the indices will be adjusted 
 73        accordingly.
 74
 75    Examples
 76    --------
 77    >>> adata = scmkl.create_adata(X = data_mat, 
 78    ...                            feature_names = gene_names, 
 79    ...                            group_dict = group_dict)
 80    >>> 
 81    >>> adata = scmkl.tfidf_normalize(adata)
 82    '''
 83    X = adata.X.copy()
 84    row_sums = np.sum(X, axis = 1)
 85    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"
 86
 87    if binarize:
 88        X[X > 0] = 1
 89
 90    if 'train_indices' in adata.uns_keys():
 91
 92        train_indices = adata.uns['train_indices'].copy()
 93        test_indices = adata.uns['test_indices'].copy()
 94
 95        # Calculate the train TFIDF matrix on just the training data so it is 
 96        # not biased by testing data
 97        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')
 98
 99        # Calculate the test TFIDF by calculating it on the train and test 
100        # data and index the test data
101        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]
102
103        # Impossible to add rows back to original location so we need to 
104        # stack the matrices to maintain train/test
105        if scipy.sparse.issparse(X):
106            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test))
107        else:
108            tfidf_norm = np.vstack((tfidf_train, tfidf_test))
109
110        # I'm not sure why this reassignment is necessary, but without, the 
111        # values will be saved as 0s in adata
112        adata.uns['train_indices'] = train_indices
113        adata.uns['test_indices'] = test_indices
114
115        combined_indices = np.concatenate((train_indices, test_indices))
116
117        # Anndata indexes by "rownames" not position so we need to rename the 
118        # rows to properly index
119        adata_index = adata.obs_names[combined_indices].astype(int)
120        tfidf_norm = tfidf_norm[np.argsort(adata_index),:]
121
122    else:
123
124        tfidf_norm = _tfidf(X, mode = 'normalize')
125
126    adata.X = tfidf_norm.copy()
127
128    return adata
def tfidf_normalize(adata, binarize = False):
    '''
    Function to TFIDF normalize the data in an adata object. Every row 
    of `adata.X` must have a positive sum; an `AssertionError` is 
    raised if any row is entirely 0. `adata.X` is overwritten and the 
    same object is returned.

    Parameters
    ----------
    **adata** : *AnnData* 
        > `adata.X` to be normalized. If `'train_indices'` is in 
        `adata.uns_keys()`, `'test_indices'` must be present as well 
        and the training rows are normalized on their own so they are 
        not biased by the testing rows. Otherwise, normalization is 
        calculated on the entire dataset.

    **binarize** : *bool* 
        > If `True`, all values in `adata.X` greater than 0 will become 
        1 before normalization.

    Returns
    -------
    **adata** : *AnnData* 
        > adata with adata.X TFIDF normalized. Rows keep their original 
        order, so `'train_indices'` and `'test_indices'` still refer to 
        the same rows.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            group_dict = group_dict)
    >>> 
    >>> adata = scmkl.tfidf_normalize(adata)
    '''
    X = adata.X.copy()
    row_sums = np.sum(X, axis = 1)
    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"

    if binarize:
        X[X > 0] = 1

    if 'train_indices' in adata.uns_keys():

        train_indices = adata.uns['train_indices'].copy()
        test_indices = adata.uns['test_indices'].copy()

        # Calculate the train TFIDF matrix on just the training data so it is 
        # not biased by testing data
        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')

        # Calculate the test TFIDF by calculating it on the train and test 
        # data and index the test data
        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]

        # The two parts are stacked train first, test second; the original 
        # row order is restored further down
        if scipy.sparse.issparse(X):
            # csc keeps the stacked result indexable by row
            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test), 
                                             format = 'csc')
        else:
            tfidf_norm = np.vstack((tfidf_train, tfidf_test))

        # NOTE: kept from the original implementation, whose author observed 
        # that without this reassignment the values are saved as 0s in 
        # adata. The cause is unconfirmed.
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        combined_indices = np.concatenate((train_indices, test_indices))

        # Stacked row k came from original row combined_indices[k], so 
        # sorting by those positions is the inverse permutation. This does 
        # not depend on adata.obs_names being integer-like.
        tfidf_norm = tfidf_norm[np.argsort(combined_indices),:]

    else:

        tfidf_norm = _tfidf(X, mode = 'normalize')

    adata.X = tfidf_norm.copy()

    return adata

Function to TFIDF normalize the data in an adata object. Every row of adata.X must have a positive sum; if any row is entirely 0, an AssertionError is raised.

Parameters

adata : AnnData

adata.X to be normalized. If 'train_indices' and 'test_indices' in 'adata.uns.keys()', normalization will be done separately for the training and testing data. Otherwise, it will calculate it on the entire dataset.

binarize : bool

If True, all values in adata.X greater than 0 will become 1.

Returns

adata : AnnData

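adata with adata.X TFIDF normalized. When the train and test data are normalized separately, the normalized rows are stacked and then sorted back into their original row order, so 'train_indices' and 'test_indices' still refer to the same rows.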

Examples

>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> 
>>> adata = scmkl.tfidf_normalize(adata)