Backport of Prepare for pandas 2.0 (#2434) onto 1.9.x (#2435)

* Prepare for pandas 2.0 (#2434)
* is_categorical -> is_categorical_dtype
* cat.replace(to_remove, np.nan) -> cat.remove_categories(to_remove)
* df1.append(df2) -> pd.concat([df1, df2])
* Series.iteritems -> Series.items
* Fix indexing a pandas object with a set in score genes
* Release notes

(cherry picked from commit 0692ef9)

* Fix anndata-dev test
* anndata-dev compatibility
* Prep 1.9.3
* Fix pr reference
scverse · Mar 2, 2023 · ed3b277
1 parent 1fbbfcd
commit ed3b277
Show file tree

Hide file tree

Showing 11 changed files with 31 additions and 21 deletions.
diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
@@ -50,7 +50,7 @@ jobs:
     displayName: 'Install dependencies'
 
   - script: |
-      'pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata"'
+      pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata"
     displayName: 'Install development anndata'
     condition: eq(variables['ANNDATA_DEV'], 'yes')
 

diff --git a/docs/release-notes/1.9.3.md b/docs/release-notes/1.9.3.md
@@ -0,0 +1,7 @@
+### 1.9.3 {small}`2023-03-02`
+
+```{rubric} Bug fixes
+```
+
+* Variety of fixes against pandas 2.0.0rc0 {pr}`2434` {smaller}`I Virshup`
+* Compatibility with anndata 0.9.0rc {pr}`2435` {smaller}`I Virshup`
diff --git a/scanpy/datasets/_datasets.py b/scanpy/datasets/_datasets.py
@@ -180,7 +180,7 @@ def paul15() -> ad.AnnData:
         clusters = f['cluster.id'][()].flatten().astype(int)
         infogenes_names = f['info.genes_strings'][()].astype(str)
     # each row has to correspond to a observation, therefore transpose
-    adata = ad.AnnData(X.transpose(), dtype=X.dtype)
+    adata = ad.AnnData(X.transpose(), dtype=np.float32)
     adata.var_names = gene_names
     adata.row_names = cell_names
     # names reflecting the cell type identifications from the paper

diff --git a/scanpy/external/exporting.py b/scanpy/external/exporting.py
@@ -11,7 +11,7 @@
 import h5py
 import matplotlib.pyplot as plt
 from anndata import AnnData
-from pandas.api.types import is_categorical
+from pandas.api.types import is_categorical_dtype
 
 from ..preprocessing._utils import _get_mean_var
 from .._utils import NeighborsView
@@ -148,31 +148,31 @@ def spring_project(
     continuous_extras = {}
     if cell_groupings is None:
         for obs_name in adata.obs:
-            if is_categorical(adata.obs[obs_name]):
+            if is_categorical_dtype(adata.obs[obs_name]):
                 categorical_extras[obs_name] = [str(x) for x in adata.obs[obs_name]]
     else:
         if isinstance(cell_groupings, str):
             cell_groupings = [cell_groupings]
         for obs_name in cell_groupings:
             if obs_name not in adata.obs:
                 logg.warning(f'Cell grouping {obs_name!r} is not in adata.obs')
-            elif is_categorical(adata.obs[obs_name]):
+            elif is_categorical_dtype(adata.obs[obs_name]):
                 categorical_extras[obs_name] = [str(x) for x in adata.obs[obs_name]]
             else:
                 logg.warning(
                     f'Cell grouping {obs_name!r} is not a categorical variable'
                 )
     if custom_color_tracks is None:
         for obs_name in adata.obs:
-            if not is_categorical(adata.obs[obs_name]):
+            if not is_categorical_dtype(adata.obs[obs_name]):
                 continuous_extras[obs_name] = np.array(adata.obs[obs_name])
     else:
         if isinstance(custom_color_tracks, str):
             custom_color_tracks = [custom_color_tracks]
         for obs_name in custom_color_tracks:
             if obs_name not in adata.obs:
                 logg.warning(f'Custom color track {obs_name!r} is not in adata.obs')
-            elif not is_categorical(adata.obs[obs_name]):
+            elif not is_categorical_dtype(adata.obs[obs_name]):
                 continuous_extras[obs_name] = np.array(adata.obs[obs_name])
             else:
                 logg.warning(

diff --git a/scanpy/plotting/_anndata.py b/scanpy/plotting/_anndata.py
@@ -2411,7 +2411,7 @@ def _plot_categories_as_colorblocks(
     labels = []
     label2code = {}  # dictionary of numerical values asigned to each label
     for code, (label, value) in enumerate(
-        obs_tidy.index.value_counts(sort=False).iteritems()
+        obs_tidy.index.value_counts(sort=False).items()
     ):
         ticks.append(value_sum + (value / 2))
         labels.append(label)

diff --git a/scanpy/plotting/_tools/scatterplots.py b/scanpy/plotting/_tools/scatterplots.py
@@ -1168,7 +1168,7 @@ def _get_color_source_vector(
     else:
         values = adata.obs_vector(value_to_plot, layer=layer)
     if groups and is_categorical_dtype(values):
-        values = values.replace(values.categories.difference(groups), np.nan)
+        values = values.remove_categories(values.categories.difference(groups))
     return values
 
 

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
@@ -482,7 +482,7 @@ def highly_variable_genes(
             missing_hvg['highly_variable'] = missing_hvg['highly_variable'].astype(bool)
             missing_hvg['gene'] = gene_list[~filt]
             hvg['gene'] = adata_subset.var_names.values
-            hvg = hvg.append(missing_hvg, ignore_index=True)
+            hvg = pd.concat([hvg, missing_hvg], ignore_index=True)
 
             # Order as before filtering
             idxs = np.concatenate((np.where(filt)[0], np.where(~filt)[0]))

diff --git a/scanpy/tests/test_get.py b/scanpy/tests/test_get.py
@@ -158,6 +158,7 @@ def test_repeated_gene_symbols():
     adata = sc.AnnData(
         np.arange(3 * 4).reshape((3, 4)),
         var=pd.DataFrame({"gene_symbols": gene_symbols}, index=var_names),
+        dtype=np.float32,
     )
 
     with pytest.raises(KeyError, match="symbol_b"):

diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py
@@ -17,10 +17,11 @@
         [7.0, 9.4, 6.8, 9.1, 8.0],
         [8.9, 8.6, 9.6, 1.0, 2.0],
         [6.5, 8.9, 2.2, 4.5, 8.9],
-    ]
+    ],
+    dtype=np.float32,
 )
 
-T = np.array([[2.0, 3.5, 4.0, 1.0, 4.7], [3.2, 2.0, 5.0, 5.0, 8.0]])
+T = np.array([[2.0, 3.5, 4.0, 1.0, 4.7], [3.2, 2.0, 5.0, 5.0, 8.0]], dtype=np.float32)
 
 
 @pytest.fixture

diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py
@@ -15,11 +15,11 @@
 
 
 def test_log1p(tmp_path):
-    A = np.random.rand(200, 10)
+    A = np.random.rand(200, 10).astype(np.float32)
     A_l = np.log1p(A)
-    ad = AnnData(A)
-    ad2 = AnnData(A)
-    ad3 = AnnData(A)
+    ad = AnnData(A.copy())
+    ad2 = AnnData(A.copy())
+    ad3 = AnnData(A.copy())
     ad3.filename = tmp_path / 'test.h5ad'
     sc.pp.log1p(ad)
     assert np.allclose(ad.X, A_l)
@@ -84,18 +84,19 @@ def test_mean_var_sparse():
 
 
 def test_normalize_per_cell():
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
+    A = np.array([[1, 0], [3, 0], [5, 6]], dtype=np.float32)
+    adata = AnnData(A.copy())
     sc.pp.normalize_per_cell(adata, counts_per_cell_after=1, key_n_counts='n_counts2')
     assert adata.X.sum(axis=1).tolist() == [1.0, 1.0, 1.0]
     # now with copy option
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
+    adata = AnnData(A.copy())
     # note that sc.pp.normalize_per_cell is also used in
     # pl.highest_expr_genes with parameter counts_per_cell_after=100
     adata_copy = sc.pp.normalize_per_cell(adata, counts_per_cell_after=1, copy=True)
     assert adata_copy.X.sum(axis=1).tolist() == [1.0, 1.0, 1.0]
     # now sparse
-    adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
-    adata_sparse = AnnData(sp.csr_matrix([[1, 0], [3, 0], [5, 6]]))
+    adata = AnnData(A.copy())
+    adata_sparse = AnnData(sp.csr_matrix(A.copy()))
     sc.pp.normalize_per_cell(adata)
     sc.pp.normalize_per_cell(adata_sparse)
     assert adata.X.sum(axis=1).tolist() == adata_sparse.X.sum(axis=1).A1.tolist()

diff --git a/scanpy/tools/_score_genes.py b/scanpy/tools/_score_genes.py
@@ -148,7 +148,7 @@ def score_genes(
     control_genes = set()
 
     # now pick `ctrl_size` genes from every cut
-    for cut in np.unique(obs_cut.loc[gene_list]):
+    for cut in np.unique(obs_cut.loc[list(gene_list)]):
         r_genes = np.array(obs_cut[obs_cut == cut].index)
         np.random.shuffle(r_genes)
         # uses full r_genes if ctrl_size > len(r_genes)