Skip to content

Commit 4d23ab7

Browse files
committed
ENH: Resolve Future Warnings from pandas v2.1.0
closes #713
1 parent faed66f commit 4d23ab7

File tree

9 files changed

+66
-39
lines changed

9 files changed

+66
-39
lines changed

plotnine/coords/coord.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -167,10 +167,18 @@ def munch(
167167
) -> pd.DataFrame:
168168
ranges = self.backtransform_range(panel_params)
169169

170-
data.loc[data["x"] == -np.inf, "x"] = ranges.x[0]
171-
data.loc[data["x"] == np.inf, "x"] = ranges.x[1]
172-
data.loc[data["y"] == -np.inf, "y"] = ranges.y[0]
173-
data.loc[data["y"] == np.inf, "y"] = ranges.y[1]
170+
x_neginf = np.isneginf(data["x"])
171+
x_posinf = np.isposinf(data["x"])
172+
y_neginf = np.isneginf(data["y"])
173+
y_posinf = np.isposinf(data["y"])
174+
if x_neginf.any():
175+
data.loc[x_neginf, "x"] = ranges.x[0]
176+
if x_posinf.any():
177+
data.loc[x_posinf, "x"] = ranges.x[1]
178+
if y_neginf.any():
179+
data.loc[y_neginf, "y"] = ranges.y[0]
180+
if y_posinf.any():
181+
data.loc[y_posinf, "y"] = ranges.y[1]
174182

175183
dist = self.distance(data["x"], data["y"], panel_params)
176184
bool_idx = (

plotnine/geoms/geom.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -286,7 +286,7 @@ def draw_layer(
286286
includes the stacking order of the layer in
287287
the plot (*zorder*)
288288
"""
289-
for pid, pdata in data.groupby("PANEL"):
289+
for pid, pdata in data.groupby("PANEL", observed=True):
290290
if len(pdata) == 0:
291291
continue
292292
ploc = pdata["PANEL"].iat[0] - 1

plotnine/mapping/evaluation.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,10 @@ def reorder(x, y, fun=np.median, ascending=True):
160160
if len(x) != len(y):
161161
raise ValueError(f"Lengths are not equal. {len(x)=}, {len(x)=}")
162162
summary = (
163-
pd.Series(y).groupby(x).apply(fun).sort_values(ascending=ascending)
163+
pd.Series(y)
164+
.groupby(x, observed=True)
165+
.apply(fun)
166+
.sort_values(ascending=ascending)
164167
)
165168
cats = summary.index.to_list()
166169
return pd.Categorical(x, categories=cats)

plotnine/stats/binning.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,13 @@
1414
from plotnine.typing import FloatArray, TupleFloat2
1515

1616

17-
__all__ = [
17+
__all__ = (
1818
"freedman_diaconis_bins",
1919
"breaks_from_bins",
2020
"breaks_from_binwidth",
2121
"assign_bins",
2222
"fuzzybreaks",
23-
]
23+
)
2424

2525

2626
def freedman_diaconis_bins(a):
@@ -37,7 +37,7 @@ def freedman_diaconis_bins(a):
3737
if h == 0:
3838
bins = np.ceil(np.sqrt(a.size))
3939
else:
40-
bins = np.ceil((np.nanmax(a) - np.nanmin(a)) / h)
40+
bins = np.ceil((np.nanmax(a) - np.nanmin(a)) / h) # type: ignore
4141

4242
return int(bins)
4343

@@ -168,10 +168,10 @@ def assign_bins(x, breaks, weight=None, pad=False, closed="right"):
168168
# - the bins to which each x is assigned
169169
# - the weight of each x value
170170
# Then create a weighted frequency table
171-
df = pd.DataFrame({"bin_idx": bin_idx, "weight": weight})
172-
wftable = df.pivot_table("weight", index=["bin_idx"], aggfunc=np.sum)[
173-
"weight"
174-
]
171+
bins_long = pd.DataFrame({"bin_idx": bin_idx, "weight": weight})
172+
wftable = bins_long.pivot_table(
173+
"weight", index=["bin_idx"], aggfunc="sum"
174+
)["weight"]
175175

176176
# Empty bins get no value in the computed frequency table.
177177
# We need to add the zeros and since frequency table is a
@@ -279,7 +279,7 @@ def fuzzybreaks(
279279
binwidth = (srange[1] - srange[0]) / bins
280280

281281
if boundary is None or np.isnan(boundary):
282-
boundary = round_any(srange[0], binwidth, np.floor) # pyright: ignore
282+
boundary = round_any(srange[0], binwidth, np.floor)
283283

284284
if recompute_bins:
285285
bins = int(np.ceil((srange[1] - boundary) / binwidth))

plotnine/stats/density.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,8 @@
99
"""
1010

1111
import numpy as np
12-
import pandas.api.types as pdtypes
12+
13+
from ..utils import array_kind
1314

1415

1516
def kde_scipy(data, grid, **kwargs):
@@ -214,13 +215,10 @@ def get_var_type(col):
214215
The origin of the character codes is
215216
:class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
216217
"""
217-
if pdtypes.is_numeric_dtype(col):
218-
# continuous
218+
if array_kind.continuous(col):
219219
return "c"
220-
elif pdtypes.is_categorical_dtype(col):
221-
# ordered or unordered
222-
return "o" if col.cat.ordered else "u"
220+
elif array_kind.discrete(col):
221+
return "o" if array_kind.ordinal else "u"
223222
else:
224-
# unordered if unsure, e.g string columns that
225-
# are not categorical
223+
# unordered if unsure
226224
return "u"

plotnine/stats/stat_bin_2d.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -104,13 +104,13 @@ def compute_group(cls, data, scales, **params):
104104
xbins = pd.cut(
105105
x,
106106
bins=xbreaks, # pyright: ignore
107-
labels=False, # pyright: ignore
107+
labels=False,
108108
right=True,
109109
)
110110
ybins = pd.cut(
111111
y,
112112
bins=ybreaks, # pyright: ignore
113-
labels=False, # pyright: ignore
113+
labels=False,
114114
right=True,
115115
)
116116

@@ -123,15 +123,15 @@ def compute_group(cls, data, scales, **params):
123123
ybreaks[0] -= np.diff(np.diff(ybreaks))[0]
124124
xbreaks[0] -= np.diff(np.diff(xbreaks))[0]
125125

126-
df = pd.DataFrame(
126+
bins_grid_long = pd.DataFrame(
127127
{
128128
"xbins": xbins,
129129
"ybins": ybins,
130130
"weight": weight,
131131
}
132132
)
133-
table = df.pivot_table(
134-
"weight", index=["xbins", "ybins"], aggfunc=np.sum
133+
table = bins_grid_long.pivot_table(
134+
"weight", index=["xbins", "ybins"], aggfunc="sum"
135135
)["weight"]
136136

137137
# create rectangles

plotnine/stats/stat_boxplot.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import numpy as np
22
import pandas as pd
3-
import pandas.api.types as pdtypes
43

54
from ..doctools import document
65
from ..utils import resolution
@@ -97,7 +96,7 @@ def compute_group(cls, data, scales, **params):
9796
else:
9897
width = params["width"]
9998

100-
if pdtypes.is_categorical_dtype(data["x"]):
99+
if isinstance(data["x"].dtype, pd.CategoricalDtype):
101100
x = data["x"].iloc[0]
102101
else:
103102
x = np.mean([data["x"].min(), data["x"].max()])

plotnine/stats/stat_count.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,9 +65,11 @@ def compute_group(cls, data, scales, **params):
6565

6666
weight = data.get("weight", [1] * len(x)) # pyright: ignore
6767
width = params["width"]
68-
df = pd.DataFrame({"weight": weight, "x": x})
68+
xdata_long = pd.DataFrame({"x": x, "weight": weight})
6969
# weighted frequency count
70-
count = df.pivot_table("weight", index=["x"], aggfunc=np.sum)["weight"]
70+
count = xdata_long.pivot_table("weight", index=["x"], aggfunc="sum")[
71+
"weight"
72+
]
7173
x = count.index
7274
count = count.to_numpy()
7375
return pd.DataFrame(

plotnine/utils.py

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414

1515
import numpy as np
1616
import pandas as pd
17-
import pandas.api.types as pdtypes
1817

1918
# missing in type stubs
2019
from pandas.core.groupby import DataFrameGroupBy # type: ignore
@@ -207,14 +206,14 @@ def add_margins(
207206
categories = {}
208207
for v in itertools.chain(*vars):
209208
col = df[v]
210-
if not pdtypes.is_categorical_dtype(df[v]):
209+
if not isinstance(df[v].dtype, pd.CategoricalDtype):
211210
col = pd.Categorical(df[v])
212211
categories[v] = col.categories
213212
if "(all)" not in categories[v]:
214213
categories[v] = categories[v].insert(len(categories[v]), "(all)")
215214

216215
for v in merged.columns.intersection(list(categories.keys())):
217-
merged[v] = merged[v].astype(pdtypes.CategoricalDtype(categories[v]))
216+
merged[v] = merged[v].astype(pd.CategoricalDtype(categories[v]))
218217

219218
return merged
220219

@@ -286,9 +285,7 @@ def _id_var(x: pd.Series[Any], drop: bool = False) -> list[int]:
286285
if len(x) == 0:
287286
return []
288287

289-
categorical = pdtypes.is_categorical_dtype(x)
290-
291-
if categorical:
288+
if array_kind.categorical(x):
292289
if drop:
293290
x = x.cat.remove_unused_categories()
294291
lst = list(x.cat.codes + 1)
@@ -593,7 +590,7 @@ def groupby_apply(
593590
axis = 0
594591

595592
lst = []
596-
for _, d in df.groupby(cols):
593+
for _, d in df.groupby(cols, observed=True):
597594
# function fn should be free to modify dataframe d, therefore
598595
# do not mark d as a slice of df i.e no SettingWithCopyWarning
599596
lst.append(func(d, *args, **kwargs))
@@ -1180,10 +1177,30 @@ def ordinal(arr):
11801177
out : bool
11811178
Whether array `arr` is an ordered categorical
11821179
"""
1183-
if pdtypes.is_categorical_dtype(arr):
1180+
if isinstance(arr.dtype, pd.CategoricalDtype):
11841181
return arr.cat.ordered
11851182
return False
11861183

1184+
@staticmethod
1185+
def categorical(arr):
1186+
"""
1187+
Return True if array is a categorical
1188+
1189+
Parameters
1190+
----------
1191+
arr : list-like
1192+
List
1193+
1194+
Returns
1195+
-------
1196+
bool
1197+
Whether array `arr` is a categorical
1198+
"""
1199+
if not hasattr(arr, "dtype"):
1200+
return False
1201+
1202+
return isinstance(arr.dtype, pd.CategoricalDtype)
1203+
11871204

11881205
def log(x, base=None):
11891206
"""

0 commit comments

Comments (0)