scmkl.tfidf_normalize

View Source

  1import numpy as np
  2import scipy
  3
  4
  5def _tfidf(X, mode = 'filter'):
  6    '''
  7    Function to use Term Frequency Inverse Document Frequency 
  8    filtering for atac data to find meaningful features. If input is 
  9    pandas data frame or scipy sparse array, it will be converted to a 
 10    numpy array.
 11    
 12    Parameters
 13    ----------
 14    x : Data matrix of cell x feature.  Must be a Numpy array or Scipy 
 15        sparse array.
 16    mode : Argument to determine what to return.  Must be filter or 
 17           normalize
 18    
 19    Returns
 20    -------
 21    TFIDF : Output depends on given 'mode' parameter
 22            'filter' : returns which column sums are non 0 i.e. which 
 23                       features are significant
 24            'normalize' : returns TFIDF filtered data matrix of the 
 25                          same dimensions as x. Returns as scipy 
 26                          sparse matrix
 27    '''
 28    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
 29                                             "'normalize'.")
 30
 31    if scipy.sparse.issparse(X):
 32        tf = scipy.sparse.csc_array(X)
 33        doc_freq = np.array(np.sum(tf > 0, axis=0)).reshape(-1)
 34    else:
 35        tf = np.asarray(X)
 36        doc_freq = np.sum(X > 0, axis=0)
 37
 38    idf = np.log1p((1 + X.shape[0]) / (1 + doc_freq))
 39    tfidf = tf * idf
 40
 41    if mode == 'normalize':
 42        if scipy.sparse.issparse(tfidf):
 43            tfidf = scipy.sparse.csc_matrix(tfidf)
 44        return tfidf
 45    elif mode == 'filter':
 46        significant_features = np.where(np.sum(tfidf, axis=0) > 0)[0]
 47        return significant_features
 48        
 49def _tfidf_train_test(X_train, X_test):
 50    if scipy.sparse.issparse(X_train):
 51        tf_train = scipy.sparse.csc_array(X_train)
 52        tf_test = scipy.sparse.csc_array(X_test)
 53        doc_freq = np.array(np.sum(X_train > 0, axis=0)).reshape(-1)
 54    else:
 55        tf_train = X_train
 56        tf_test = X_test
 57        doc_freq = np.sum(X_train > 0, axis=0)
 58
 59    idf = np.log1p((1 + X_train.shape[0]) / (1 + doc_freq))
 60
 61    tfidf_train = tf_train * idf
 62    tfidf_test = tf_test * idf
 63
 64    if scipy.sparse.issparse(tfidf_train):
 65        tfidf_train = scipy.sparse.csc_matrix(tfidf_train)
 66        tfidf_test = scipy.sparse.csc_matrix(tfidf_test)
 67        
 68    return tfidf_train, tfidf_test
 69
 70
 71def tfidf_normalize(adata, binarize = False):
 72    '''
 73    Function to TFIDF normalize the data in an adata object. If any 
 74    rows are entirely 0, that row and its metadata will be removed from
 75    the object.
 76
 77    Parameters
 78    ----------
 79    **adata** : *AnnData* 
 80        > `adata.X` to be normalized. If `'train_indices'` and 
 81        `'test_indices'` in `'adata.uns.keys()'`, normalization will be
 82        done separately for the training and testing data. Otherwise, 
 83        it will calculate it on the entire dataset.
 84
 85    **binarize** : *bool* 
 86        > If `True`, all values in `adata.X` greater than 1 will become 
 87        1.
 88
 89    Returns
 90    -------
 91    **adata** : *AnnData* 
 92        > adata with adata.X TFIDF normalized. Will now have the train 
 93        data stacked on test data, and the indices will be adjusted 
 94        accordingly.
 95
 96    Examples
 97    --------
 98    >>> adata = scmkl.create_adata(X = data_mat, 
 99    ...                            feature_names = gene_names, 
100    ...                            group_dict = group_dict)
101    >>> 
102    >>> adata = scmkl.tfidf_normalize(adata)
103    '''
104    X = adata.X.copy()
105    row_sums = np.sum(X, axis = 1)
106    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"
107
108    if binarize:
109        X[X > 0] = 1
110
111    if 'train_indices' in adata.uns_keys():
112
113        train_indices = adata.uns['train_indices'].copy()
114        test_indices = adata.uns['test_indices'].copy()
115
116        # Calculate the train TFIDF matrix on just the training data so it is 
117        # not biased by testing data
118        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')
119
120        # Calculate the test TFIDF by calculating it on the train and test 
121        # data and index the test data
122        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]
123
124        # Impossible to add rows back to original location so we need to 
125        # stack the matrices to maintain train/test
126        if scipy.sparse.issparse(X):
127            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test))
128        else:
129            tfidf_norm = np.vstack((tfidf_train, tfidf_test))
130
131        # I'm not sure why this reassignment is necessary, but without, the 
132        # values will be saved as 0s in adata
133        adata.uns['train_indices'] = train_indices
134        adata.uns['test_indices'] = test_indices
135
136        combined_indices = np.concatenate((train_indices, test_indices))
137
138        # Anndata indexes by "rownames" not position so we need to rename the 
139        # rows to properly index
140        adata_index = adata.obs_names[combined_indices].astype(int)
141        tfidf_norm = tfidf_norm[np.argsort(adata_index),:]
142
143    else:
144
145        tfidf_norm = _tfidf(X, mode = 'normalize')
146
147    adata.X = tfidf_norm.copy()
148
149    return adata

def tfidf_normalize(adata, binarize=False): View Source

 72def tfidf_normalize(adata, binarize = False):
 73    '''
 74    Function to TFIDF normalize the data in an adata object. If any 
 75    rows are entirely 0, that row and its metadata will be removed from
 76    the object.
 77
 78    Parameters
 79    ----------
 80    **adata** : *AnnData* 
 81        > `adata.X` to be normalized. If `'train_indices'` and 
 82        `'test_indices'` in `'adata.uns.keys()'`, normalization will be
 83        done separately for the training and testing data. Otherwise, 
 84        it will calculate it on the entire dataset.
 85
 86    **binarize** : *bool* 
 87        > If `True`, all values in `adata.X` greater than 1 will become 
 88        1.
 89
 90    Returns
 91    -------
 92    **adata** : *AnnData* 
 93        > adata with adata.X TFIDF normalized. Will now have the train 
 94        data stacked on test data, and the indices will be adjusted 
 95        accordingly.
 96
 97    Examples
 98    --------
 99    >>> adata = scmkl.create_adata(X = data_mat, 
100    ...                            feature_names = gene_names, 
101    ...                            group_dict = group_dict)
102    >>> 
103    >>> adata = scmkl.tfidf_normalize(adata)
104    '''
105    X = adata.X.copy()
106    row_sums = np.sum(X, axis = 1)
107    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"
108
109    if binarize:
110        X[X > 0] = 1
111
112    if 'train_indices' in adata.uns_keys():
113
114        train_indices = adata.uns['train_indices'].copy()
115        test_indices = adata.uns['test_indices'].copy()
116
117        # Calculate the train TFIDF matrix on just the training data so it is 
118        # not biased by testing data
119        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')
120
121        # Calculate the test TFIDF by calculating it on the train and test 
122        # data and index the test data
123        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]
124
125        # Impossible to add rows back to original location so we need to 
126        # stack the matrices to maintain train/test
127        if scipy.sparse.issparse(X):
128            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test))
129        else:
130            tfidf_norm = np.vstack((tfidf_train, tfidf_test))
131
132        # I'm not sure why this reassignment is necessary, but without, the 
133        # values will be saved as 0s in adata
134        adata.uns['train_indices'] = train_indices
135        adata.uns['test_indices'] = test_indices
136
137        combined_indices = np.concatenate((train_indices, test_indices))
138
139        # Anndata indexes by "rownames" not position so we need to rename the 
140        # rows to properly index
141        adata_index = adata.obs_names[combined_indices].astype(int)
142        tfidf_norm = tfidf_norm[np.argsort(adata_index),:]
143
144    else:
145
146        tfidf_norm = _tfidf(X, mode = 'normalize')
147
148    adata.X = tfidf_norm.copy()
149
150    return adata

Function to TFIDF normalize the data in an adata object. Every row of adata.X must have a positive sum; rows that are entirely 0 are not removed and instead cause an assertion error.

Parameters

adata : AnnData

adata.X to be normalized. If 'train_indices' and 'test_indices' are in adata.uns, the training rows are normalized using only the training data, while the testing rows are taken from a normalization computed on the entire dataset. Otherwise, normalization is calculated on the entire dataset.

binarize : bool

If True, all values in adata.X greater than 0 will become 1.

Returns

adata : AnnData

adata with adata.X TFIDF normalized. When train and test indices are present, the train and test rows are normalized, stacked, and then sorted back into the original row order.

Examples

>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> 
>>> adata = scmkl.tfidf_normalize(adata)