# pyright: reportUnknownArgumentType=false

from __future__ import annotations

__all__ = ("brushed_points", "near_points")

from typing import TYPE_CHECKING, Literal, Optional, Union, cast

from ._typing_extensions import TypedDict
from .types import BrushInfo, CoordInfo, CoordXY

if TYPE_CHECKING:
    import numpy.typing as npt
    import pandas as pd

# The members are string forward references, so this alias is safe to build at
# runtime even though pandas is only imported for type checking.
DataFrameColumn = Union[
    "pd.Series[int]",
    "pd.Series[float]",
    "pd.Series[str]",
    "pd.Categorical",
    "pd.DatetimeIndex",
]


class SeriesFloatXY(TypedDict):
    # Paired x/y float Series, as produced by `scale_coords()`.
    x: pd.Series[float]
    y: pd.Series[float]


def brushed_points(
    df: pd.DataFrame,
    brush: BrushInfo | None,
    xvar: Optional[str] = None,
    yvar: Optional[str] = None,
    panelvar1: Optional[str] = None,
    panelvar2: Optional[str] = None,
    *,
    all_rows: bool = False,
) -> pd.DataFrame:
    """Find rows of data selected on an interactive plot.

    This function is used with interactive plots. It returns the rows of a data frame
    which are under a brush. It currently supports plots created by matplotlib,
    seaborn, and plotnine. If plotnine is used, it can usually automatically infer the
    x and y variables, along with variables used for facets.

    Parameters
    ----------
    df
        A pandas DataFrame from which to select rows.
    brush
        The data from a brush, like `input.myplot_brush()`.
    xvar
        The name of the column in `df` that contains the x values. (Note that when
        using plotnine, `xvar`, `yvar`, `panelvar1`, and `panelvar2` can usually be
        automatically inferred from the brush data.)
    yvar
        The name of the column in `df` that contains the y values.
    panelvar1
        The name of the column in `df` that contains the first variable used for
        subpanels (if subpanels are used).
    panelvar2
        The name of the column in `df` that contains the second variable used for
        subpanels.
    all_rows
        If `False` (the default), return a data frame containing only the rows that
        are selected. If `True`, then all rows from the data frame will be returned,
        along with an additional column named `selected_`, which indicates whether or
        not each row was selected.

    Returns
    -------
    :
        A pandas DataFrame containing the rows selected by the brush. If `all_rows`
        is `True`, then all rows from the original data will be returned, along with
        an additional column named `selected_`, which indicates whether or not each
        row was selected.

    Raises
    ------
    ValueError
        If the brush has no `xmin` entry, if a needed variable name can neither be
        supplied nor inferred from the brush, or if a variable name is not a column
        of `df`.
    """
    import pandas as pd

    # Work on a copy so the caller's data frame is never modified.
    new_df = df.copy()

    if brush is None:
        if all_rows:
            new_df["selected_"] = False
        else:
            new_df = new_df.loc[[]]
        return new_df

    if "xmin" not in brush:
        raise ValueError(
            "brushed_points requires a brush object with xmin, xmax, ymin, and ymax."
        )

    # Which direction(s) the brush is selecting over. Direction can be 'x', 'y',
    # or 'xy'.
    use_x = "x" in brush["direction"]
    use_y = "y" in brush["direction"]

    # Start with every row selected, then narrow down by x, y and panel vars.
    keep_rows: pd.Series[bool] = pd.Series(
        True,
        index=new_df.index,  # pyright: ignore[reportUnknownMemberType]
    )

    if use_x:
        if xvar is None and "x" in brush["mapping"]:
            xvar = brush["mapping"]["x"]
        if xvar is None:
            raise ValueError(
                "brushed_points: not able to automatically infer `xvar` from brush. You must supply `xvar` to brushed_points()"
            )
        if xvar not in new_df:
            raise ValueError(f"brushed_points: `xvar` ({xvar}) not in dataframe")
        keep_rows &= within_brush(new_df[xvar], brush, "x")

    if use_y:
        if yvar is None and "y" in brush["mapping"]:
            yvar = brush["mapping"]["y"]
        if yvar is None:
            raise ValueError(
                "brushed_points: not able to automatically infer `yvar` from brush. You must supply `yvar` to brushed_points()"
            )
        if yvar not in new_df:
            raise ValueError(f"brushed_points: `yvar` ({yvar}) not in dataframe")
        keep_rows &= within_brush(new_df[yvar], brush, "y")

    # Find which rows are matches for the panel vars (if present)
    for key, panelvar in (("panelvar1", panelvar1), ("panelvar2", panelvar2)):
        panel_mask = _panel_match(new_df, brush, panelvar, key, "brushed_points")
        if panel_mask is not None:
            keep_rows &= panel_mask

    if all_rows:
        new_df["selected_"] = False
        new_df.loc[keep_rows, "selected_"] = True
    else:
        new_df = new_df.loc[keep_rows]

    return new_df


def near_points(
    df: pd.DataFrame,
    coordinfo: CoordInfo | None,
    xvar: Optional[str] = None,
    yvar: Optional[str] = None,
    panelvar1: Optional[str] = None,
    panelvar2: Optional[str] = None,
    *,
    threshold: float = 5,
    max_points: Optional[int] = None,
    add_dist: bool = False,
    all_rows: bool = False,
) -> pd.DataFrame:
    """Find rows of data near a click/dblclick/hover location on an interactive plot.

    This function is used with interactive plots. It returns the rows of a data frame
    whose points lie within `threshold` pixels of the pointer event, ordered from
    nearest to farthest. It currently supports plots created by matplotlib, seaborn,
    and plotnine. If plotnine is used, it can usually automatically infer the x and y
    variables, along with variables used for facets.

    Parameters
    ----------
    df
        A pandas DataFrame from which to select rows.
    coordinfo
        The data from a click/dblclick/hover event, like `input.myplot_click()`.
    xvar
        The name of the column in `df` that contains the x values. (Note that when
        using plotnine, `xvar`, `yvar`, `panelvar1`, and `panelvar2` can usually be
        automatically inferred from the event data.)
    yvar
        The name of the column in `df` that contains the y values.
    panelvar1
        The name of the column in `df` that contains the first variable used for
        subpanels (if subpanels are used).
    panelvar2
        The name of the column in `df` that contains the second variable used for
        subpanels.
    threshold
        A maximum distance (in pixels) to the pointer location. Rows in the data
        frame will be selected if the distance to the pointer is less than or equal
        to `threshold`.
    max_points
        Maximum number of rows to return. If `None` (the default), will return all
        rows within the threshold distance.
    add_dist
        If `True`, add a column named `dist` that contains the distance from the
        coordinate to the point, in pixels. When no pointer event has yet occurred,
        the value of `dist` will be `numpy.nan`.
    all_rows
        If `False` (the default), return a data frame containing only the rows that
        are selected. If `True`, then all rows from the data frame will be returned,
        along with an additional column named `selected_`, which indicates whether or
        not each row was selected.

    Returns
    -------
    :
        A pandas DataFrame containing the rows near the pointer event. If `all_rows`
        is `True`, then all rows from the original data will be returned, along with
        an additional column named `selected_`, which indicates whether or not each
        row was selected.

    Raises
    ------
    ValueError
        If `xvar`/`yvar` can neither be supplied nor inferred from `coordinfo`, or if
        a variable name is not a column of `df`.
    """
    import numpy as np

    # Work on a copy so the caller's data frame is never modified.
    new_df = df.copy()

    # For no current coordinfo
    if coordinfo is None:
        if add_dist:
            new_df["dist"] = np.nan
        if all_rows:
            new_df["selected_"] = False
        else:
            new_df = new_df.loc[[]]
        return new_df

    # Try to extract vars from coordinfo object
    coordinfo_mapping = coordinfo["mapping"]
    if xvar is None and "x" in coordinfo_mapping:
        xvar = coordinfo_mapping["x"]
    if yvar is None and "y" in coordinfo_mapping:
        yvar = coordinfo_mapping["y"]

    if xvar is None:
        raise ValueError(
            "near_points: not able to automatically infer `xvar` from coordinfo. You must supply `xvar` to near_points()"
        )
    if yvar is None:
        raise ValueError(
            "near_points: not able to automatically infer `yvar` from coordinfo. You must supply `yvar` to near_points()"
        )

    if xvar not in new_df.columns:
        raise ValueError(f"near_points: `xvar` ('{xvar}') not in names of input.")
    if yvar not in new_df.columns:
        raise ValueError(f"near_points: `yvar` ('{yvar}') not in names of input.")

    x: pd.Series[float] = to_float(new_df[xvar])
    y: pd.Series[float] = to_float(new_df[yvar])

    # Get the coordinates of the point (in img pixel coordinates)
    point_img: CoordXY = coordinfo["coords_img"]

    # Get coordinates of data points (in img pixel coordinates)
    data_img: SeriesFloatXY = scale_coords(x, y, coordinfo)

    # Get x/y distances (in css coordinates)
    dist_css: SeriesFloatXY = {
        "x": (data_img["x"] - point_img["x"]) / coordinfo["img_css_ratio"]["x"],
        "y": (data_img["y"] - point_img["y"]) / coordinfo["img_css_ratio"]["y"],
    }

    # Distances of data points to the target point, in css pixels.
    dists: pd.Series[float] = (dist_css["x"] ** 2 + dist_css["y"] ** 2) ** 0.5

    if add_dist:
        # NOTE: the column is named "dist" (no trailing underscore, unlike
        # "selected_"); kept as-is so existing callers keep working.
        new_df["dist"] = dists

    keep_rows = dists <= threshold

    # Find which rows are matches for the panel vars (if present)
    for key, panelvar in (("panelvar1", panelvar1), ("panelvar2", panelvar2)):
        panel_mask = _panel_match(new_df, coordinfo, panelvar, key, "near_points")
        if panel_mask is not None:
            keep_rows &= panel_mask

    # Track the row indices to keep (note this is the row position, 0, 1, 2, not the
    # pandas index column, which can have arbitrary values).
    keep_idx: npt.NDArray[np.intp] = np.where(keep_rows)[0]

    # Order by distance
    kept_dists = dists.iloc[keep_idx]
    keep_idx = keep_idx[np.asarray(kept_dists.argsort())]

    # Keep max number of rows
    if max_points is not None and len(keep_idx) > max_points:
        keep_idx = keep_idx[:max_points]

    if all_rows:
        # Add selected_ column if needed
        new_df["selected_"] = False
        new_df.iloc[  # pyright: ignore[reportArgumentType]
            keep_idx,
            new_df.columns.get_loc(  # pyright: ignore[reportUnknownMemberType]
                "selected_"
            ),
        ] = True
    else:
        new_df = new_df.iloc[keep_idx]

    return new_df


# ===============================================================================
# Helper functions
# ===============================================================================


def _panel_match(
    df: pd.DataFrame,
    info: Union[BrushInfo, CoordInfo],
    panelvar: Optional[str],
    key: Literal["panelvar1", "panelvar2"],
    caller: str,
) -> Optional[pd.Series[bool]]:
    """Build the row mask for one panel (facet) variable.

    The column name is `panelvar` if given, otherwise `info["mapping"][key]` when
    present. Returns `None` when there is no column name, or when `info` carries no
    value under `key` to compare against. Raises `ValueError` (prefixed with
    `caller`) if the column name is not in `df`.
    """
    if panelvar is None and key in info["mapping"]:
        panelvar = info["mapping"][key]  # pyright: ignore
    if panelvar is None:
        return None
    if panelvar not in df:
        raise ValueError(f"{caller}: `{key}` ({panelvar}) not in dataframe")
    if key not in info:
        # A column name was given but the event has no panel value to match.
        return None
    return df[panelvar] == info[key]  # pyright: ignore


# Helper to determine if data values are within the limits of
# an input brush.
def within_brush(
    vals: DataFrameColumn,
    brush: BrushInfo,
    var: Literal["x", "y"] = "x",
) -> pd.Series[bool]:
    """Return a boolean mask of the values lying inside the brush along `var`.

    `vals` is converted with `to_float()` and compared, inclusively, against
    `brush[var + "min"]` and `brush[var + "max"]`.
    """
    vals = to_float(vals)
    return (vals >= brush[var + "min"]) & (vals <= brush[var + "max"])


def to_float(x: DataFrameColumn) -> pd.Series[float]:
    """Convert int/float/str/categorical/datetime data to a numeric Series.

    Numeric input is returned as-is (not copied). Categorical and string data are
    mapped to their category codes plus one. Datetime data is converted from
    nanoseconds since the epoch to days since the epoch.

    Raises
    ------
    ValueError
        If the dtype of `x` is none of the supported kinds.
    """
    import pandas as pd
    import pandas.api.types as ptypes

    if ptypes.is_numeric_dtype(x):  # pyright: ignore[reportUnknownMemberType]
        return cast("pd.Series[float]", x)
    elif isinstance(getattr(x, "dtype", None), ptypes.CategoricalDtype):
        # The dtype (not the container) tells us the data is categorical. A Series
        # exposes the codes via `.cat`; a bare `pd.Categorical` via `.codes`.
        codes = x.cat.codes if hasattr(x, "cat") else x.codes  # pyright: ignore
        return cast("pd.Series[float]", pd.Series(codes) + 1)  # pyright: ignore
    elif ptypes.is_string_dtype(x):  # pyright: ignore[reportUnknownMemberType]
        return cast(
            "pd.Series[float]", x.astype("category").cat.codes + 1  # pyright: ignore
        )
    elif ptypes.is_datetime64_any_dtype(x):  # pyright: ignore[reportUnknownMemberType]
        # We need to convert the pandas datetimes, which are in nanoseconds since epoch,
        # to matplotlib datetimes, which are in days since epoch.
        return (
            cast(
                "pd.Series[int]",
                pd.Series(x.astype("int64")),  # pyright: ignore
            )
            / (24 * 60 * 60)
            / 1e9
        )

    raise ValueError("to_float: unsupported dtype for x")


# ===============================================================================
# Scaling functions
# ===============================================================================
# These functions have direct analogs in Javascript code, except these are
# vectorized for x and y.


# Map a value x from a domain to a range. If clip is true, clip it to the
# range.
def map_linear(
    x: pd.Series[float],
    domain_min: float,
    domain_max: float,
    range_min: float,
    range_max: float,
    clip: bool = True,
) -> pd.Series[float]:
    """Linearly map `x` from [domain_min, domain_max] onto [range_min, range_max].

    When `clip` is true, results outside the range are replaced by the nearest range
    bound. The range may be given in either order.
    """
    factor = (range_max - range_min) / (domain_max - domain_min)
    val: pd.Series[float] = x - domain_min
    newval: pd.Series[float] = (val * factor) + range_min

    if clip:
        maxval = max(range_max, range_min)
        minval = min(range_max, range_min)
        # `newval` is a fresh Series, so in-place masking cannot touch `x`.
        newval[newval > maxval] = maxval
        newval[newval < minval] = minval

    return newval


# Scale val from domain to range. If logbase is present, use log scaling.
def scale_1d(
    val: pd.Series[float],
    domain_min: float,
    domain_max: float,
    range_min: float,
    range_max: float,
    logbase: Optional[float] = None,
    clip: bool = True,
) -> pd.Series[float]:
    """Scale `val` from the domain to the range, optionally on a log scale.

    If `logbase` is not `None`, `val` is first replaced by its logarithm in that
    base; the result is then passed through `map_linear()`.
    """
    import numpy as np

    if logbase is not None:
        val = np.log(val) / np.log(logbase)
    return map_linear(val, domain_min, domain_max, range_min, range_max, clip)


# Scale x and y coordinates from domain to range, using information in scaleinfo.
# scaleinfo must contain items $domain, $range, and $log. The scaleinfo object
# corresponds to one element from the coordmap object generated by getPrevPlotCoordmap
# or getGgplotCoordmap; it is the scaling information for one panel in a plot.
def scale_coords(
    x: pd.Series[float],
    y: pd.Series[float],
    coordinfo: CoordInfo,
) -> SeriesFloatXY:
    """Scale x and y data values using the scaling information in `coordinfo`.

    Parameters
    ----------
    x
        The x values to scale.
    y
        The y values to scale.
    coordinfo
        Must provide `domain` and `range` entries (each with `left`, `right`,
        `bottom` and `top`) and a `log` entry (with `x` and `y`, used as the log
        base for that axis).

    Returns
    -------
    :
        A dict with keys `x` and `y` holding the scaled values. `scale_1d()` is
        called with its default `clip` setting.
    """
    domain = coordinfo["domain"]
    # Named `img_range` rather than `range` so the builtin is not shadowed.
    img_range = coordinfo["range"]
    log = coordinfo["log"]

    return {
        "x": scale_1d(
            x,
            domain["left"],
            domain["right"],
            img_range["left"],
            img_range["right"],
            log["x"],
        ),
        "y": scale_1d(
            y,
            domain["bottom"],
            domain["top"],
            img_range["bottom"],
            img_range["top"],
            log["y"],
        ),
    }