scmkl.tfidf_normalize
import numpy as np
import scipy
import scipy.sparse


def _idf_weights(tf):
    '''
    Compute smoothed inverse document frequency weights, one per column.

    Parameters
    ----------
    tf : 2D Numpy array or Scipy sparse array/matrix of cell x feature.

    Returns
    -------
    idf : 1D Numpy array with one weight per column, computed as
          log1p((1 + n_rows) / (1 + n_rows_with_positive_value)).
    '''
    # Flatten explicitly: sparse reductions may come back as a 1 x n
    # matrix or 2D array depending on the SciPy version.
    doc_freq = np.asarray(np.sum(tf > 0, axis=0)).reshape(-1)

    return np.log1p((1 + tf.shape[0]) / (1 + doc_freq))


def _tfidf(X, mode = 'filter'):
    '''
    Function to use Term Frequency Inverse Document Frequency
    filtering for atac data to find meaningful features. Dense,
    array-like input is converted to a numpy array; sparse input is
    converted to a scipy CSC array for the computation.

    Parameters
    ----------
    X : Data matrix of cell x feature. Must be a Numpy array (or
        array-like) or Scipy sparse array/matrix.
    mode : Argument to determine what to return. Must be filter or
           normalize

    Returns
    -------
    TFIDF : Output depends on given 'mode' parameter
        'filter' : returns the indices of the columns whose TFIDF
                   column sum is greater than 0, i.e. which features
                   are significant
        'normalize' : returns the TFIDF weighted data matrix of the
                      same dimensions as X. Returned as a scipy CSC
                      sparse matrix if X was sparse, otherwise as a
                      numpy array.
    '''
    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
                                             "'normalize'.")

    if scipy.sparse.issparse(X):
        tf = scipy.sparse.csc_array(X)
        # Element-wise scaling of every column by its idf weight
        tfidf = tf.multiply(_idf_weights(tf))
    else:
        # Use the converted array everywhere so lists / data frames work
        tf = np.asarray(X)
        tfidf = tf * _idf_weights(tf)

    if mode == 'normalize':
        if scipy.sparse.issparse(tfidf):
            tfidf = scipy.sparse.csc_matrix(tfidf)
        return tfidf

    # mode == 'filter'
    # Flatten before searching so that a 1 x n reduction result cannot
    # turn the column indices into row indices.
    col_sums = np.asarray(np.sum(tfidf, axis=0)).reshape(-1)
    significant_features = np.where(col_sums > 0)[0]

    return significant_features


def _tfidf_train_test(X_train, X_test):
    '''
    TFIDF weight a train and a test matrix using inverse document
    frequencies computed from the training data only.

    Parameters
    ----------
    X_train : Training data matrix of cell x feature. Numpy array (or
              array-like) or Scipy sparse array/matrix.
    X_test : Testing data matrix with the same features (columns) and
             the same kind of container as X_train.

    Returns
    -------
    tfidf_train, tfidf_test : The weighted matrices. Scipy CSC sparse
                              matrices if X_train was sparse, otherwise
                              numpy arrays.
    '''
    if scipy.sparse.issparse(X_train):
        tf_train = scipy.sparse.csc_array(X_train)
        tf_test = scipy.sparse.csc_array(X_test)

        idf = _idf_weights(tf_train)

        tfidf_train = scipy.sparse.csc_matrix(tf_train.multiply(idf))
        tfidf_test = scipy.sparse.csc_matrix(tf_test.multiply(idf))
    else:
        tf_train = np.asarray(X_train)
        tf_test = np.asarray(X_test)

        idf = _idf_weights(tf_train)

        tfidf_train = tf_train * idf
        tfidf_test = tf_test * idf

    return tfidf_train, tfidf_test


def tfidf_normalize(adata, binarize = False):
    '''
    Function to TFIDF normalize the data in an adata object. Every row
    of `adata.X` must have a positive sum, otherwise an
    `AssertionError` is raised.

    Parameters
    ----------
    **adata** : *AnnData*
        > `adata.X` to be normalized. If `'train_indices'` and
        `'test_indices'` are in `adata.uns`, the training rows are
        normalized using only the training data, while the testing
        rows are normalized using the entire dataset. Otherwise, it
        will calculate it on the entire dataset.

    **binarize** : *bool*
        > If `True`, all values in `adata.X` greater than 0 will
        become 1 before normalization.

    Returns
    -------
    **adata** : *AnnData*
        > adata with adata.X TFIDF normalized. Rows are kept in their
        original positions, so `'train_indices'` and `'test_indices'`
        still refer to the same rows.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> adata = scmkl.tfidf_normalize(adata)
    '''
    X = adata.X.copy()
    row_sums = np.sum(X, axis = 1)
    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"

    if binarize:
        X[X > 0] = 1

    if 'train_indices' in adata.uns_keys():

        train_indices = adata.uns['train_indices'].copy()
        test_indices = adata.uns['test_indices'].copy()

        # Calculate the train TFIDF matrix on just the training data so it is
        # not biased by testing data
        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')

        # Calculate the test TFIDF by calculating it on the train and test
        # data and index the test data
        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]

        # Stack train on top of test; the original row order is restored
        # below. The format is pinned to CSC so the stacked matrix is always
        # row-indexable (stacking may otherwise yield a COO matrix).
        if scipy.sparse.issparse(X):
            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test),
                                             format = 'csc')
        else:
            tfidf_norm = np.vstack((tfidf_train, tfidf_test))

        # Re-store the copied index arrays. The original author observed
        # that without this reassignment the values are saved as 0s in
        # adata (cause not established).
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        combined_indices = np.concatenate((train_indices, test_indices))

        # Row i of the stacked matrix came from position combined_indices[i]
        # of X, so sorting by position puts every row back where it started.
        # This works for any obs_names, not only integer-like ones.
        tfidf_norm = tfidf_norm[np.argsort(combined_indices),:]

    else:

        tfidf_norm = _tfidf(X, mode = 'normalize')

    adata.X = tfidf_norm.copy()

    return adata
def
tfidf_normalize(adata, binarize=False):
def tfidf_normalize(adata, binarize = False):
    '''
    Function to TFIDF normalize the data in an adata object. Every row
    of `adata.X` must have a positive sum, otherwise an
    `AssertionError` is raised.

    Parameters
    ----------
    **adata** : *AnnData*
        > `adata.X` to be normalized. If `'train_indices'` and
        `'test_indices'` are in `adata.uns`, the training rows are
        normalized using only the training data, while the testing
        rows are normalized using the entire dataset. Otherwise, it
        will calculate it on the entire dataset.

    **binarize** : *bool*
        > If `True`, all values in `adata.X` greater than 0 will
        become 1 before normalization.

    Returns
    -------
    **adata** : *AnnData*
        > adata with adata.X TFIDF normalized. Rows are kept in their
        original positions, so `'train_indices'` and `'test_indices'`
        still refer to the same rows.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> adata = scmkl.tfidf_normalize(adata)
    '''
    X = adata.X.copy()
    row_sums = np.sum(X, axis = 1)
    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"

    if binarize:
        X[X > 0] = 1

    if 'train_indices' in adata.uns_keys():

        train_indices = adata.uns['train_indices'].copy()
        test_indices = adata.uns['test_indices'].copy()

        # Calculate the train TFIDF matrix on just the training data so it is
        # not biased by testing data
        tfidf_train = _tfidf(X[train_indices,:], mode = 'normalize')

        # Calculate the test TFIDF by calculating it on the train and test
        # data and index the test data
        tfidf_test = _tfidf(X, mode = 'normalize')[test_indices,:]

        # Stack train on top of test; the original row order is restored
        # below. The format is pinned to CSC so the stacked matrix is always
        # row-indexable (stacking may otherwise yield a COO matrix).
        if scipy.sparse.issparse(X):
            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test),
                                             format = 'csc')
        else:
            tfidf_norm = np.vstack((tfidf_train, tfidf_test))

        # Re-store the copied index arrays. The original author observed
        # that without this reassignment the values are saved as 0s in
        # adata (cause not established).
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        combined_indices = np.concatenate((train_indices, test_indices))

        # Row i of the stacked matrix came from position combined_indices[i]
        # of X, so sorting by position puts every row back where it started.
        # This works for any obs_names, not only integer-like ones.
        tfidf_norm = tfidf_norm[np.argsort(combined_indices),:]

    else:

        tfidf_norm = _tfidf(X, mode = 'normalize')

    adata.X = tfidf_norm.copy()

    return adata
Function to TFIDF normalize the data in an adata object. Every row of `adata.X` must have a positive sum; otherwise an `AssertionError` is raised.
Parameters
adata : AnnData
`adata.X` to be normalized. If `'train_indices'` and `'test_indices'` are in `adata.uns.keys()`, normalization will be done separately for the training and testing data. Otherwise, it will be calculated on the entire dataset.
binarize : bool
If `True`, all values in `adata.X` greater than 0 will become 1.
Returns
adata : AnnData
adata with `adata.X` TFIDF normalized. When train/test indices are present, the train and test rows are normalized separately, stacked, and then re-sorted by observation name so that the stored indices still apply.
Examples
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>>
>>> adata = scmkl.tfidf_normalize(adata)