scmkl.tfidf_normalize

  1import numpy as np
  2import scipy
  3import anndata as ad
  4
  5def tfidf(X: np.ndarray | scipy.sparse._csc.csc_matrix, mode: str='filter'):
  6    """
  7    Function to use Term Frequency Inverse Document Frequency (TF-IDF)
  8    filtering for atac data to find meaningful features.
  9    
 10    Parameters
 11    ----------
 12    X : np.ndarray | scipy.sparse._csc.csc_matrix
 13        Data matrix of cell x feature.  Must be a `np.ndarray` or 
 14        `scipy.sparse` matrix.
 15
 16    mode : str
 17        Argument to determine what to return. Must be `'filter'` or 
 18        `'normalize'`.
 19    
 20    Returns
 21    -------
 22    tfidf | significant_features : array_like
 23        Output depends on given `'mode'` parameter:
 24
 25        `'filter'` (np.ndarray): 
 26        Which column sums are nonzero (i.e. which features are 
 27        significant).
 28
 29        `'normalize'` (np.ndarray | scipy.sparse._csc.csc_matrix): 
 30        TF-IDF filtered data matrix of the same dimensions as `X`. 
 31    """
 32    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
 33                                             "'normalize'.")
 34
 35    if scipy.sparse.issparse(X):
 36        tf = scipy.sparse.csc_array(X)
 37        doc_freq = np.array(np.sum(tf > 0, axis=0)).reshape(-1)
 38    else:
 39        tf = np.asarray(X)
 40        doc_freq = np.sum(X > 0, axis=0)
 41
 42    idf = np.log1p((1 + X.shape[0])/(1 + doc_freq))
 43    tfidf = tf*idf
 44
 45    if mode == 'normalize':
 46        if scipy.sparse.issparse(tfidf):
 47            tfidf = scipy.sparse.csc_matrix(tfidf)
 48        return tfidf
 49    
 50    elif mode == 'filter':
 51        significant_features = np.where(np.sum(tfidf, axis=0) > 0)[0]
 52        return significant_features
 53        
 54def tfidf_train_test(X_train, X_test):
 55    if scipy.sparse.issparse(X_train):
 56        tf_train = scipy.sparse.csc_array(X_train)
 57        tf_test = scipy.sparse.csc_array(X_test)
 58        doc_freq = np.array(np.sum(X_train > 0, axis=0)).reshape(-1)
 59    else:
 60        tf_train = X_train
 61        tf_test = X_test
 62        doc_freq = np.sum(X_train > 0, axis=0)
 63
 64    idf = np.log1p((1 + X_train.shape[0])/(1 + doc_freq))
 65
 66    tfidf_train = tf_train*idf
 67    tfidf_test = tf_test*idf
 68
 69    if scipy.sparse.issparse(tfidf_train):
 70        tfidf_train = scipy.sparse.csc_matrix(tfidf_train)
 71        tfidf_test = scipy.sparse.csc_matrix(tfidf_test)
 72        
 73    return tfidf_train, tfidf_test
 74
 75
 76def tfidf_normalize(adata: ad.AnnData, binarize: bool=False):
 77    """
 78    Function to TF-IDF normalize the data in an adata object. If any 
 79    rows are entirely 0, that row and its metadata will be removed from
 80    the object.
 81
 82    Parameters
 83    ----------
 84    adata : ad.AnnData
 85        `ad.Anndata` with `.X` to be normalized. If `'train_indices'` 
 86        and `'test_indices'` in `'adata.uns.keys()'`, normalization 
 87        will be done separately for the training and testing data. 
 88        Otherwise, it will calculate it on the entire dataset.
 89
 90    binarize : bool 
 91        If `True`, all values in `adata.X` greater than 1 will become 
 92        1.
 93
 94    Returns
 95    -------
 96    adata : ad.AnnData 
 97        `adata` with `adata.X` TF-IDF normalized. Will now have the 
 98        train data stacked on test data, and the indices will be 
 99        adjusted accordingly.
100
101    Examples
102    --------
103    >>> adata = scmkl.create_adata(X = data_mat, 
104    ...                            feature_names = gene_names, 
105    ...                            group_dict = group_dict)
106    >>> 
107    >>> adata = scmkl.tfidf_normalize(adata)
108    """
109    X = adata.X.copy()
110    row_sums = np.sum(X, axis = 1)
111    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"
112
113    if binarize:
114        X[X > 0] = 1
115
116    if 'train_indices' in adata.uns_keys():
117
118        train_indices = adata.uns['train_indices'].copy()
119        test_indices = adata.uns['test_indices'].copy()
120
121        # Calculate the train TFIDF matrix on just the training data so it is 
122        # not biased by testing data
123        tfidf_train = tfidf(X[train_indices,:], mode = 'normalize')
124
125        # Calculate the test TFIDF by calculating it on the train and test 
126        # data and index the test data
127        tfidf_test = tfidf(X, mode = 'normalize')[test_indices,:]
128
129        # Impossible to add rows back to original location so we need to 
130        # stack the matrices to maintain train/test
131        if scipy.sparse.issparse(X):
132            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test))
133        else:
134            tfidf_norm = np.vstack((tfidf_train, tfidf_test))
135
136        # I'm not sure why this reassignment is necessary, but without, the 
137        # values will be saved as 0s in adata
138        adata.uns['train_indices'] = train_indices
139        adata.uns['test_indices'] = test_indices
140
141        combined_indices = np.concatenate((train_indices, test_indices))
142
143        # Anndata indexes by "rownames" not position so we need to rename the 
144        # rows to properly index
145        adata_index = adata.obs_names[combined_indices].astype(int)
146        tfidf_norm = tfidf_norm[np.argsort(adata_index),:]
147
148    else:
149
150        tfidf_norm = tfidf(X, mode = 'normalize')
151
152    adata.X = tfidf_norm.copy()
153
154    return adata
def tfidf( X: numpy.ndarray | scipy.sparse._csc.csc_matrix, mode: str = 'filter'):
 6def tfidf(X: np.ndarray | scipy.sparse._csc.csc_matrix, mode: str='filter'):
 7    """
 8    Function to use Term Frequency Inverse Document Frequency (TF-IDF)
 9    filtering for atac data to find meaningful features.
10    
11    Parameters
12    ----------
13    X : np.ndarray | scipy.sparse._csc.csc_matrix
14        Data matrix of cell x feature.  Must be a `np.ndarray` or 
15        `scipy.sparse` matrix.
16
17    mode : str
18        Argument to determine what to return. Must be `'filter'` or 
19        `'normalize'`.
20    
21    Returns
22    -------
23    tfidf | significant_features : array_like
24        Output depends on given `'mode'` parameter:
25
26        `'filter'` (np.ndarray): 
27        Which column sums are nonzero (i.e. which features are 
28        significant).
29
30        `'normalize'` (np.ndarray | scipy.sparse._csc.csc_matrix): 
31        TF-IDF filtered data matrix of the same dimensions as `X`. 
32    """
33    assert mode in ['filter', 'normalize'], ("mode must be 'filter' or "
34                                             "'normalize'.")
35
36    if scipy.sparse.issparse(X):
37        tf = scipy.sparse.csc_array(X)
38        doc_freq = np.array(np.sum(tf > 0, axis=0)).reshape(-1)
39    else:
40        tf = np.asarray(X)
41        doc_freq = np.sum(X > 0, axis=0)
42
43    idf = np.log1p((1 + X.shape[0])/(1 + doc_freq))
44    tfidf = tf*idf
45
46    if mode == 'normalize':
47        if scipy.sparse.issparse(tfidf):
48            tfidf = scipy.sparse.csc_matrix(tfidf)
49        return tfidf
50    
51    elif mode == 'filter':
52        significant_features = np.where(np.sum(tfidf, axis=0) > 0)[0]
53        return significant_features

Function to use Term Frequency Inverse Document Frequency (TF-IDF) filtering for atac data to find meaningful features.

Parameters
  • X (np.ndarray | scipy.sparse._csc.csc_matrix): Data matrix of cell x feature. Must be a np.ndarray or scipy.sparse matrix.
  • mode (str): Argument to determine what to return. Must be 'filter' or 'normalize'.
Returns
  • tfidf | significant_features (array_like): Output depends on given 'mode' parameter:

    'filter' (np.ndarray): Indices of the columns whose TF-IDF column sums are positive (i.e. which features are significant).

    'normalize' (np.ndarray | scipy.sparse._csc.csc_matrix): TF-IDF normalized data matrix of the same dimensions as X.

def tfidf_train_test(X_train, X_test):
def tfidf_train_test(X_train, X_test):
    """
    Function to TF-IDF weight a train and a test matrix using the 
    inverse document frequencies of the training data only, so the 
    test data does not influence the weights.

    Parameters
    ----------
    X_train : np.ndarray | scipy.sparse matrix
        Training data matrix of cell x feature. Its sparsity decides 
        which code path is used for both matrices.

    X_test : np.ndarray | scipy.sparse matrix
        Testing data matrix with the same features (columns) as 
        `X_train`.

    Returns
    -------
    tfidf_train, tfidf_test : tuple
        The weighted matrices, with the same dimensions as the inputs. 
        Both are `scipy.sparse.csc_matrix` if `X_train` is sparse, 
        otherwise `np.ndarray`.
    """
    if scipy.sparse.issparse(X_train):
        # csc_array (not csc_matrix) so that `*` below is element-wise
        tf_train = scipy.sparse.csc_array(X_train)
        tf_test = scipy.sparse.csc_array(X_test)
        doc_freq = np.array(np.sum(tf_train > 0, axis=0)).reshape(-1)
    else:
        # Coerce like `tfidf` does so lists work and np.matrix input 
        # does not turn `*` into a matrix product
        tf_train = np.asarray(X_train)
        tf_test = np.asarray(X_test)
        doc_freq = np.sum(tf_train > 0, axis=0)

    # IDF comes from the training rows only and is reused for the test
    idf = np.log1p((1 + tf_train.shape[0])/(1 + doc_freq))

    tfidf_train = tf_train*idf
    tfidf_test = tf_test*idf

    if scipy.sparse.issparse(tfidf_train):
        tfidf_train = scipy.sparse.csc_matrix(tfidf_train)
        tfidf_test = scipy.sparse.csc_matrix(tfidf_test)
        
    return tfidf_train, tfidf_test
def tfidf_normalize(adata: anndata._core.anndata.AnnData, binarize: bool = False):
 77def tfidf_normalize(adata: ad.AnnData, binarize: bool=False):
 78    """
 79    Function to TF-IDF normalize the data in an adata object. If any 
 80    rows are entirely 0, that row and its metadata will be removed from
 81    the object.
 82
 83    Parameters
 84    ----------
 85    adata : ad.AnnData
 86        `ad.Anndata` with `.X` to be normalized. If `'train_indices'` 
 87        and `'test_indices'` in `'adata.uns.keys()'`, normalization 
 88        will be done separately for the training and testing data. 
 89        Otherwise, it will calculate it on the entire dataset.
 90
 91    binarize : bool 
 92        If `True`, all values in `adata.X` greater than 1 will become 
 93        1.
 94
 95    Returns
 96    -------
 97    adata : ad.AnnData 
 98        `adata` with `adata.X` TF-IDF normalized. Will now have the 
 99        train data stacked on test data, and the indices will be 
100        adjusted accordingly.
101
102    Examples
103    --------
104    >>> adata = scmkl.create_adata(X = data_mat, 
105    ...                            feature_names = gene_names, 
106    ...                            group_dict = group_dict)
107    >>> 
108    >>> adata = scmkl.tfidf_normalize(adata)
109    """
110    X = adata.X.copy()
111    row_sums = np.sum(X, axis = 1)
112    assert np.all(row_sums > 0), "TFIDF requires all row sums be positive"
113
114    if binarize:
115        X[X > 0] = 1
116
117    if 'train_indices' in adata.uns_keys():
118
119        train_indices = adata.uns['train_indices'].copy()
120        test_indices = adata.uns['test_indices'].copy()
121
122        # Calculate the train TFIDF matrix on just the training data so it is 
123        # not biased by testing data
124        tfidf_train = tfidf(X[train_indices,:], mode = 'normalize')
125
126        # Calculate the test TFIDF by calculating it on the train and test 
127        # data and index the test data
128        tfidf_test = tfidf(X, mode = 'normalize')[test_indices,:]
129
130        # Impossible to add rows back to original location so we need to 
131        # stack the matrices to maintain train/test
132        if scipy.sparse.issparse(X):
133            tfidf_norm = scipy.sparse.vstack((tfidf_train, tfidf_test))
134        else:
135            tfidf_norm = np.vstack((tfidf_train, tfidf_test))
136
137        # I'm not sure why this reassignment is necessary, but without, the 
138        # values will be saved as 0s in adata
139        adata.uns['train_indices'] = train_indices
140        adata.uns['test_indices'] = test_indices
141
142        combined_indices = np.concatenate((train_indices, test_indices))
143
144        # Anndata indexes by "rownames" not position so we need to rename the 
145        # rows to properly index
146        adata_index = adata.obs_names[combined_indices].astype(int)
147        tfidf_norm = tfidf_norm[np.argsort(adata_index),:]
148
149    else:
150
151        tfidf_norm = tfidf(X, mode = 'normalize')
152
153    adata.X = tfidf_norm.copy()
154
155    return adata

Function to TF-IDF normalize the data in an adata object. Every row must have a positive sum: rows that are entirely 0 are not removed, and the function raises an AssertionError if any are present.

Parameters
  • adata (ad.AnnData): ad.AnnData with .X to be normalized. If 'train_indices' and 'test_indices' are present in adata.uns, normalization will be done separately for the training and testing data. Otherwise, it will be calculated on the entire dataset.
  • binarize (bool): If True, all values in adata.X greater than 0 will become 1.
Returns
  • adata (ad.AnnData): adata with adata.X TF-IDF normalized. When train and test indices are present, the separately normalized train and test rows are stacked and then sorted back into the original row order; the stored indices are left unchanged.
Examples
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>> 
>>> adata = scmkl.tfidf_normalize(adata)