scmkl.tfidf_normalize
import numpy as np
import scipy
import scipy.sparse
import anndata as ad


def tfidf(X: np.ndarray | scipy.sparse.csc_matrix, mode: str='filter'):
    """
    Function to use Term Frequency Inverse Document Frequency (TF-IDF)
    filtering for atac data to find meaningful features.

    Parameters
    ----------
    X : np.ndarray | scipy.sparse.csc_matrix
        Data matrix of cell x feature. Must be a `np.ndarray` (or
        anything `np.asarray` can convert) or a `scipy.sparse` matrix.

    mode : str
        Argument to determine what to return. Must be `'filter'` or
        `'normalize'`.

    Returns
    -------
    tfidf | significant_features : array_like
        Output depends on given `'mode'` parameter:

        `'filter'` (np.ndarray):
            Indices of the columns whose TF-IDF sum is greater than 0
            (i.e. which features are significant).

        `'normalize'` (np.ndarray | scipy.sparse.csc_matrix):
            TF-IDF weighted data matrix of the same dimensions as `X`.
            Sparse input is returned as a `csc_matrix`.

    Raises
    ------
    AssertionError
        If `mode` is not `'filter'` or `'normalize'`.
    """
    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
                                             "'normalize'.")

    if scipy.sparse.issparse(X):
        # Sparse *arrays* multiply element-wise with `*`, whereas sparse
        # *matrices* would perform a matrix product.
        tf = scipy.sparse.csc_array(X)
        doc_freq = np.asarray((tf > 0).sum(axis=0)).reshape(-1)
    else:
        tf = np.asarray(X)
        # Use the converted array so lists and other array-likes work
        doc_freq = np.sum(tf > 0, axis=0)

    # Smoothed inverse document frequency, one value per feature
    idf = np.log1p((1 + tf.shape[0])/(1 + doc_freq))
    weighted = tf*idf

    if mode == 'normalize':
        if scipy.sparse.issparse(weighted):
            weighted = scipy.sparse.csc_matrix(weighted)
        return weighted

    # mode == 'filter'
    col_sums = np.asarray(weighted.sum(axis=0)).reshape(-1)
    significant_features = np.where(col_sums > 0)[0]
    return significant_features


def tfidf_train_test(X_train, X_test):
    """
    Function to TF-IDF weight train and test matrices using inverse
    document frequencies calculated from the training data only.

    Parameters
    ----------
    X_train : np.ndarray | scipy.sparse matrix
        Training data matrix of cell x feature. The inverse document
        frequencies are calculated from this matrix.

    X_test : np.ndarray | scipy.sparse matrix
        Testing data matrix with the same number of columns as
        `X_train`. It is handled the same way as `X_train` (dense or
        sparse), so both should be the same kind of container.

    Returns
    -------
    tfidf_train, tfidf_test : tuple
        The TF-IDF weighted train and test matrices. Both are
        `scipy.sparse.csc_matrix` when `X_train` is sparse, otherwise
        both are `np.ndarray`.
    """
    if scipy.sparse.issparse(X_train):
        # Sparse *arrays* multiply element-wise with `*`
        tf_train = scipy.sparse.csc_array(X_train)
        tf_test = scipy.sparse.csc_array(X_test)
        doc_freq = np.asarray((tf_train > 0).sum(axis=0)).reshape(-1)
    else:
        tf_train = np.asarray(X_train)
        tf_test = np.asarray(X_test)
        doc_freq = np.sum(tf_train > 0, axis=0)

    # Only the training data contributes to the document frequencies
    idf = np.log1p((1 + tf_train.shape[0])/(1 + doc_freq))

    tfidf_train = tf_train*idf
    tfidf_test = tf_test*idf

    if scipy.sparse.issparse(tfidf_train):
        tfidf_train = scipy.sparse.csc_matrix(tfidf_train)
        tfidf_test = scipy.sparse.csc_matrix(tfidf_test)

    return tfidf_train, tfidf_test


def tfidf_normalize(adata: ad.AnnData, binarize: bool=False):
    """
    Function to TF-IDF normalize the data in an adata object. Every
    row of `adata.X` must have a positive sum; rows that are entirely
    0 are not removed and instead trigger an `AssertionError`.

    Parameters
    ----------
    adata : ad.AnnData
        `ad.Anndata` with `.X` to be normalized. If `'train_indices'`
        is in `adata.uns` (`'test_indices'` must then be present too),
        normalization will be done separately for the training and
        testing data. Otherwise, it will calculate it on the entire
        dataset.

    binarize : bool
        If `True`, all positive values in `adata.X` will become 1
        before normalization.

    Returns
    -------
    adata : ad.AnnData
        The same `adata` object with `adata.X` TF-IDF normalized. Rows
        are put back in their original order (assuming the train and
        test indices are positional and together cover every row
        once), so the stored indices stay valid.

    Raises
    ------
    AssertionError
        If any row of `adata.X` does not have a positive sum.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> adata = scmkl.tfidf_normalize(adata)
    """
    X = adata.X.copy()
    row_sums = np.sum(X, axis = 1)
    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"

    if binarize:
        X[X > 0] = 1

    if 'train_indices' in adata.uns_keys():

        train_indices = adata.uns['train_indices'].copy()
        test_indices = adata.uns['test_indices'].copy()

        # Calculate the train TFIDF matrix on just the training data so it is
        # not biased by testing data
        tfidf_train = tfidf(X[train_indices,:], mode = 'normalize')

        # Calculate the test TFIDF by calculating it on the train and test
        # data and index the test data
        tfidf_test = tfidf(X, mode = 'normalize')[test_indices,:]

        # Stack train on test; the original row order is restored below
        if scipy.sparse.issparse(X):
            # Pin the format so the stacked matrix supports row indexing
            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test),
                                             format = 'csc')
        else:
            tfidf_norm = np.vstack((tfidf_train, tfidf_test))

        # NOTE(review): kept from the original, which reported that the
        # indices were saved as 0s in adata without this reassignment
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        combined_indices = np.concatenate((train_indices, test_indices))

        # Row i of the stacked matrix came from position combined_indices[i],
        # so sorting by those positions restores the original order without
        # requiring the obs names to be integers
        tfidf_norm = tfidf_norm[np.argsort(combined_indices),:]

    else:

        tfidf_norm = tfidf(X, mode = 'normalize')

    adata.X = tfidf_norm.copy()

    return adata
def
tfidf( X: numpy.ndarray | scipy.sparse._csc.csc_matrix, mode: str = 'filter'):
6def tfidf(X: np.ndarray | scipy.sparse._csc.csc_matrix, mode: str='filter'): 7 """ 8 Function to use Term Frequency Inverse Document Frequency (TF-IDF) 9 filtering for atac data to find meaningful features. 10 11 Parameters 12 ---------- 13 X : np.ndarray | scipy.sparse._csc.csc_matrix 14 Data matrix of cell x feature. Must be a `np.ndarray` or 15 `scipy.sparse` matrix. 16 17 mode : str 18 Argument to determine what to return. Must be `'filter'` or 19 `'normalize'`. 20 21 Returns 22 ------- 23 tfidf | significant_features : array_like 24 Output depends on given `'mode'` parameter: 25 26 `'filter'` (np.ndarray): 27 Which column sums are nonzero (i.e. which features are 28 significant). 29 30 `'normalize'` (np.ndarray | scipy.sparse._csc.csc_matrix): 31 TF-IDF filtered data matrix of the same dimensions as `X`. 32 """ 33 assert mode in ['filter', 'normalize'], ("mode must be 'filter' or " 34 "'normalize'.") 35 36 if scipy.sparse.issparse(X): 37 tf = scipy.sparse.csc_array(X) 38 doc_freq = np.array(np.sum(tf > 0, axis=0)).reshape(-1) 39 else: 40 tf = np.asarray(X) 41 doc_freq = np.sum(X > 0, axis=0) 42 43 idf = np.log1p((1 + X.shape[0])/(1 + doc_freq)) 44 tfidf = tf*idf 45 46 if mode == 'normalize': 47 if scipy.sparse.issparse(tfidf): 48 tfidf = scipy.sparse.csc_matrix(tfidf) 49 return tfidf 50 51 elif mode == 'filter': 52 significant_features = np.where(np.sum(tfidf, axis=0) > 0)[0] 53 return significant_features
Function to use Term Frequency Inverse Document Frequency (TF-IDF) filtering for atac data to find meaningful features.
Parameters
- X (np.ndarray | scipy.sparse._csc.csc_matrix): Data matrix of cell x feature. Must be a `np.ndarray` or `scipy.sparse` matrix.
- mode (str): Argument to determine what to return. Must be `'filter'` or `'normalize'`.
Returns
tfidf | significant_features (array_like): Output depends on given `'mode'` parameter:
- `'filter'` (np.ndarray): Which column sums are nonzero (i.e. which features are significant).
- `'normalize'` (np.ndarray | scipy.sparse._csc.csc_matrix): TF-IDF filtered data matrix of the same dimensions as `X`.
def
tfidf_train_test(X_train, X_test):
def tfidf_train_test(X_train, X_test):
    """
    Function to TF-IDF weight train and test matrices using inverse
    document frequencies calculated from the training data only.

    Parameters
    ----------
    X_train : np.ndarray | scipy.sparse matrix
        Training data matrix of cell x feature. The inverse document
        frequencies are calculated from this matrix.

    X_test : np.ndarray | scipy.sparse matrix
        Testing data matrix with the same number of columns as
        `X_train`. It is handled the same way as `X_train` (dense or
        sparse), so both should be the same kind of container.

    Returns
    -------
    tfidf_train, tfidf_test : tuple
        The TF-IDF weighted train and test matrices. Both are
        `scipy.sparse.csc_matrix` when `X_train` is sparse, otherwise
        both are `np.ndarray`.
    """
    if scipy.sparse.issparse(X_train):
        # Sparse *arrays* multiply element-wise with `*`
        tf_train = scipy.sparse.csc_array(X_train)
        tf_test = scipy.sparse.csc_array(X_test)
        doc_freq = np.asarray((tf_train > 0).sum(axis=0)).reshape(-1)
    else:
        tf_train = np.asarray(X_train)
        tf_test = np.asarray(X_test)
        doc_freq = np.sum(tf_train > 0, axis=0)

    # Only the training data contributes to the document frequencies
    idf = np.log1p((1 + tf_train.shape[0])/(1 + doc_freq))

    tfidf_train = tf_train*idf
    tfidf_test = tf_test*idf

    if scipy.sparse.issparse(tfidf_train):
        tfidf_train = scipy.sparse.csc_matrix(tfidf_train)
        tfidf_test = scipy.sparse.csc_matrix(tfidf_test)

    return tfidf_train, tfidf_test
def
tfidf_normalize(adata: anndata._core.anndata.AnnData, binarize: bool = False):
def tfidf_normalize(adata: ad.AnnData, binarize: bool=False):
    """
    Function to TF-IDF normalize the data in an adata object. Every
    row of `adata.X` must have a positive sum; rows that are entirely
    0 are not removed and instead trigger an `AssertionError`.

    Parameters
    ----------
    adata : ad.AnnData
        `ad.Anndata` with `.X` to be normalized. If `'train_indices'`
        is in `adata.uns` (`'test_indices'` must then be present too),
        normalization will be done separately for the training and
        testing data. Otherwise, it will calculate it on the entire
        dataset.

    binarize : bool
        If `True`, all positive values in `adata.X` will become 1
        before normalization.

    Returns
    -------
    adata : ad.AnnData
        The same `adata` object with `adata.X` TF-IDF normalized. Rows
        are put back in their original order (assuming the train and
        test indices are positional and together cover every row
        once), so the stored indices stay valid.

    Raises
    ------
    AssertionError
        If any row of `adata.X` does not have a positive sum.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> adata = scmkl.tfidf_normalize(adata)
    """
    X = adata.X.copy()
    row_sums = np.sum(X, axis = 1)
    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"

    if binarize:
        X[X > 0] = 1

    if 'train_indices' in adata.uns_keys():

        train_indices = adata.uns['train_indices'].copy()
        test_indices = adata.uns['test_indices'].copy()

        # Calculate the train TFIDF matrix on just the training data so it is
        # not biased by testing data
        tfidf_train = tfidf(X[train_indices,:], mode = 'normalize')

        # Calculate the test TFIDF by calculating it on the train and test
        # data and index the test data
        tfidf_test = tfidf(X, mode = 'normalize')[test_indices,:]

        # Stack train on test; the original row order is restored below
        if scipy.sparse.issparse(X):
            # Pin the format so the stacked matrix supports row indexing
            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test),
                                             format = 'csc')
        else:
            tfidf_norm = np.vstack((tfidf_train, tfidf_test))

        # NOTE(review): kept from the original, which reported that the
        # indices were saved as 0s in adata without this reassignment
        adata.uns['train_indices'] = train_indices
        adata.uns['test_indices'] = test_indices

        combined_indices = np.concatenate((train_indices, test_indices))

        # Row i of the stacked matrix came from position combined_indices[i],
        # so sorting by those positions restores the original order without
        # requiring the obs names to be integers
        tfidf_norm = tfidf_norm[np.argsort(combined_indices),:]

    else:

        tfidf_norm = tfidf(X, mode = 'normalize')

    adata.X = tfidf_norm.copy()

    return adata
Function to TF-IDF normalize the data in an adata object. Every row must have a positive sum: the function asserts this ("TFIDF requires all row sums be positive") rather than removing rows that are entirely 0.
Parameters
- adata (ad.AnnData): `ad.AnnData` with `.X` to be normalized. If `'train_indices'` and `'test_indices'` in `adata.uns.keys()`, normalization will be done separately for the training and testing data. Otherwise, it will calculate it on the entire dataset.
- binarize (bool): If `True`, all values in `adata.X` greater than 1 will become 1.
Returns
- adata (ad.AnnData): `adata` with `adata.X` TF-IDF normalized. Will now have the train data stacked on test data, and the indices will be adjusted accordingly.
Examples
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>>
>>> adata = scmkl.tfidf_normalize(adata)