Skip to content

Commit d46b1a3

Browse files
committed
Add more features
1 parent 70846c9 commit d46b1a3

File tree

8 files changed

+485
-151
lines changed

8 files changed

+485
-151
lines changed

docs/tutorials/pandas_accessor_tutorial.py

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@
3535
import numpy as np
3636
import openscm_units
3737
import pandas as pd
38+
import pandas_indexing as pix
3839
import pint
40+
import seaborn as sns
3941

4042
import continuous_timeseries as ct
4143
import continuous_timeseries.pandas_accessors
@@ -115,19 +117,21 @@ def create_df(
115117
(
116118
(s, v, r, units)
117119
for s, v, r in itertools.product(
118-
[f"variable_{i}" for i in range(n_variables)],
119120
[f"scenario_{i}" for i in range(n_scenarios)],
121+
[f"variable_{i}" for i in range(n_variables)],
120122
[i for i in range(n_runs)],
121123
)
122124
),
123-
columns=["scenario", "variable", "region", "units"],
124-
# This makes updates later way way faster
125+
columns=["scenario", "variable", "run", "units"],
126+
# This makes updates and general handling later way way faster.
127+
# TODO: make this tip clearer.
125128
dtype="category",
126129
)
127130
)
128131

132+
n_ts = n_scenarios * n_variables * n_runs
129133
df = pd.DataFrame(
130-
np.random.random((n_variables * n_runs * n_scenarios, timepoints.size)),
134+
50.0 * np.linspace(0.3, 1, n_ts)[:, np.newaxis] * np.linspace(0, 1, timepoints.size)[np.newaxis, :] + np.random.random((n_ts, timepoints.size)),
131135
columns=timepoints,
132136
index=idx,
133137
)
@@ -150,9 +154,9 @@ def create_df(
150154

151155
# %%
152156
small_df = create_df(
153-
n_scenarios=25,
154-
n_variables=10,
155-
n_runs=30,
157+
n_scenarios=3,
158+
n_variables=2,
159+
n_runs=5,
156160
timepoints=np.arange(250) + 1850.0,
157161
)
158162
small_df
@@ -161,21 +165,36 @@ def create_df(
161165
# Then we convert it time series.
162166

163167
# %%
164-
small_df.ct.to_timeseries(
168+
small_ts = small_df.ct.to_timeseries(
165169
time_units="yr",
166170
interpolation=ct.InterpolationOption.PiecewiseConstantPreviousLeftClosed,
167171
)
172+
small_ts
168173

169174
# %% [markdown]
170175
# Then we can use standard Continuous timeseries APIs,
171176
# e.g. plotting.
172177

173178
# %%
179+
small_ts.ct.plot(continuous_plot_kwargs=dict(alpha=0.3))
180+
# # TODO: move this to plotting
181+
# small_ts.ct.plot(continuous_plot_kwargs=dict(alpha=0.3), progress=True)
174182

175183
# %% [markdown]
176-
# If we have a bigger `pd.DataFrame`, this process can be much slower.
177-
# If you're not sure what's happening, you can activate the progress bar if you have
178-
# [`tdqm`](https://tqdm.github.io/) installed.
184+
# When combined with [pandas-indexing](https://pandas-indexing.readthedocs.io/en/latest/index.html),
185+
# this can be quite powerful for quick plots.
186+
187+
# %%
188+
ax = small_ts.loc[pix.isin(variable="variable_0")].ct.plot(continuous_plot_kwargs=dict(alpha=0.3))
189+
ax.legend(ncols=3, loc="upper center", bbox_to_anchor=(0.5, -0.15))
190+
191+
# %%
192+
# TODO: move this to plotting section
193+
ax = small_ts.loc[pix.isin(variable="variable_0", run=0)].ct.plot(label="scenario", continuous_plot_kwargs=dict(alpha=0.9))
194+
ax.legend()
195+
196+
# %% [markdown]
197+
# If we have a bigger `pd.DataFrame`, the conversion process can be much slower.
179198

180199
# %%
181200
bigger_df = create_df(
@@ -184,7 +203,12 @@ def create_df(
184203
n_runs=300,
185204
timepoints=np.arange(351) + 1850.0,
186205
)
187-
bigger_df
206+
bigger_df.shape
207+
208+
# %% [markdown]
209+
# If you want to see the conversion's progress,
210+
# you can activate the progress bar if you have
211+
# [`tqdm`](https://tqdm.github.io/) installed.
188212

189213
# %%
190214
bigger_df.ct.to_timeseries(
@@ -224,7 +248,8 @@ def create_df(
224248
# If you want nested progress bars in parallel,
225249
# we support that too
226250
# (although we're not sure if this works on windows
227-
# because of the need for forking...).
251+
# because of the need for forking, for details see
252+
# [here](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)).
228253

229254
# %%
230255
bigger_df.ct.to_timeseries(
@@ -241,11 +266,55 @@ def create_df(
241266
)
242267

243268
# %% [markdown]
244-
# - filtering with pandas-indexing
245-
# - bigger df
246-
# - convert more rows (progress, parallel, parallel with progress)
269+
# On big `pd.DataFrame`'s the combination with
270+
# [pandas indexing](https://pandas-indexing.readthedocs.io/)
271+
# becomes particularly powerful.
272+
273+
# %%
274+
ax = (
275+
bigger_df
276+
.loc[pix.isin(variable="variable_1")]
277+
.groupby(["scenario", "variable", "units"], observed=True)
278+
.median()
279+
.loc[pix.ismatch(scenario="scenario_1*")]
280+
.ct.to_timeseries(
281+
time_units="yr",
282+
interpolation=ct.InterpolationOption.Quadratic,
283+
)
284+
.ct.plot()
285+
)
286+
ax.legend()
287+
288+
# %%
289+
# # Units don't round trip
290+
# pd.testing.assert_frame_equal(
291+
# small_df,
292+
# small_ts.ct.to_df()
293+
# )
294+
small_ts.ct.to_df()
295+
296+
# %%
297+
small_ts.ct.to_df(increase_resolution=3)
298+
299+
# %%
300+
sns_df = small_ts.loc[pix.isin(scenario=[f"scenario_{i}" for i in range(2)])].ct.to_sns_df(increase_resolution=100)
301+
sns_df
302+
303+
# %%
304+
sns.lineplot(
305+
data=sns_df[sns_df["time"] <= 1855],
306+
x="time",
307+
y="value",
308+
hue="scenario",
309+
style="variable",
310+
estimator=None,
311+
units="run",
312+
)
313+
314+
# %% [markdown]
247315
# - other operations, also with progress, parallel, parallel with progress
248-
# - convert to seaborn df for more fine-grained plotting control
249-
# - also requires adding a `increase_resolution` method to `Timeseries`
316+
# - plot with basic control over labels
317+
# - plot with grouping and plumes for ranges
250318
# - convert with more fine-grained control over interpolation
319+
# (e.g. interpolation being passed as pd.Series)
251320
# - unit conversion

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ docs = [
9393
"jupyterlab==4.3.4",
9494
"jupytext==1.16.6",
9595
"mkdocs-jupyter==0.25.1",
96+
"pandas-indexing>=0.6.1",
9697
"seaborn>=0.13.2",
9798
]
9899
tests = [

src/continuous_timeseries/pandas_accessors.py

Lines changed: 118 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from continuous_timeseries.timeseries import Timeseries
1818

1919
if TYPE_CHECKING:
20+
import matplotlib
2021
import pandas as pd
2122

2223
P = TypeVar("P", bound=pd.DataFrame | pd.Series[Any])
@@ -50,6 +51,110 @@ def metadata(self) -> pd.DataFrame:
5051
"""
5152
return self._series.index.to_frame(index=False)
5253

54+
def to_df(self, increase_resolution: int | None = None) -> pd.DataFrame:
    """
    Convert the wrapped `pd.Series` of timeseries to a wide `pd.DataFrame`.

    Each row is one timeseries, discretised via its `to_pandas_series`
    method.  The name of each discretised series (which carries its units)
    is appended to the output index as a "units" level.

    Parameters
    ----------
    increase_resolution
        If not `None`, increase the resolution of each timeseries
        by this factor before discretising.

    Returns
    -------
    :
        Wide `pd.DataFrame` with one row per timeseries
        and one column per time point.
    """
    # Late import to avoid hard dependency on pandas
    try:
        import pandas as pd
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            # Name the API the user actually called
            # (was a copy-paste of "to_pandas_series").
            "to_df", requirement="pandas"
        ) from exc

    series_l = []
    indexes_l = []
    # TODO: progress bar and parallelisation
    for idx, value in self._series.items():
        # TODO: time_units and out_units passing
        if increase_resolution is not None:
            value_use = value.increase_resolution(increase_resolution)
        else:
            value_use = value

        pd_series = value_use.to_pandas_series()
        series_l.append(pd_series)
        # The discretised series' name carries its units
        indexes_l.append((*idx, pd_series.name))

    # Use a distinct name so we don't shadow the loop variable `idx` above
    out_index = pd.MultiIndex.from_frame(
        pd.DataFrame(
            indexes_l,
            columns=[*self._series.index.names, "units"],
            # Categoricals make later updates and general handling much faster
            dtype="category",
        )
    )
    df = pd.DataFrame(
        series_l,
        index=out_index,
    )

    return df
90+
91+
# TODO: add this to DataFrame accessor to allow for time filtering in the middle
92+
def to_sns_df(self, increase_resolution: int = 100):
    """
    Convert to a long-form `pd.DataFrame` suitable for plotting with seaborn.

    Parameters
    ----------
    increase_resolution
        Factor by which to increase the resolution of each timeseries
        before discretising (passed through to `to_df`).

    Returns
    -------
    :
        Long-form data: the index metadata as columns,
        plus "time" and "value" columns.
    """
    # TODO: progress bar and parallelisation
    # TODO: time_units and out_units passing
    wide = self.to_df(increase_resolution=increase_resolution)
    # Will become `.ct.to_sns_df`
    long_form = wide.melt(
        var_name="time",
        ignore_index=False,
    )

    return long_form.reset_index()
104+
105+
def plot(
    self,
    label: str | tuple[str, ...] | None = None,
    show_continuous: bool = True,
    continuous_plot_kwargs: dict[str, Any] | None = None,
    show_discrete: bool = False,
    discrete_plot_kwargs: dict[str, Any] | None = None,
    ax: matplotlib.axes.Axes | None = None,
    progress: bool = False,
) -> matplotlib.axes.Axes:
    """
    Plot each timeseries in the wrapped `pd.Series` onto a single set of axes.

    Parameters
    ----------
    label
        Index level whose value is used as each line's label.
        Tuples (multi-level labels) are not supported yet.

    show_continuous
        Whether to plot the continuous representation.

    continuous_plot_kwargs
        Passed through to each timeseries' plot call
        (with "label" injected when `label` is given).

    show_discrete
        Whether to plot the discrete points.

    discrete_plot_kwargs
        Passed through to each timeseries' plot call.

    ax
        Axes to plot on.  If not supplied, the first timeseries'
        plot call is responsible for creating them.

    progress
        Show a progress bar while plotting (requires `tqdm`).

    Returns
    -------
    :
        Axes on which the timeseries were plotted.

    Raises
    ------
    KeyError
        `label` is given and `continuous_plot_kwargs` already
        contains a "label" entry.
    """
    iterator = self._series.items()
    if progress:
        try:
            from tqdm.auto import tqdm
        except ImportError as exc:
            raise MissingOptionalDependencyError(  # noqa: TRY003
                # Report the entry point the user actually called
                # (was a copy-paste of get_timeseries_parallel_helper's message)
                "plot(..., progress=True)",
                # The package is "tqdm", not "tdqm"
                requirement="tqdm",
            ) from exc

        iterator = tqdm(iterator, desc="Timeseries to plot")

    if label is not None:
        if isinstance(label, tuple):
            raise NotImplementedError()

        label_idx: int | None = get_index_level_idx(self._series, index_level=label)

    else:
        label_idx = None

    for idx, ts in iterator:
        if label_idx is not None:
            # Use a local so we don't clobber the `label` parameter
            row_label = idx[label_idx]
            # `continuous_plot_kwargs` defaults to None:
            # fall back to an empty dict so the merge below cannot fail.
            base_kwargs = (
                continuous_plot_kwargs if continuous_plot_kwargs is not None else {}
            )
            if "label" in base_kwargs:
                # clash (could just warn here instead)
                msg = "continuous_plot_kwargs already contains 'label'"
                raise KeyError(msg)

            continuous_plot_kwargs_use = base_kwargs | dict(label=row_label)

        else:
            continuous_plot_kwargs_use = continuous_plot_kwargs

        ax = ts.plot(
            show_continuous=show_continuous,
            continuous_plot_kwargs=continuous_plot_kwargs_use,
            show_discrete=show_discrete,
            discrete_plot_kwargs=discrete_plot_kwargs,
            ax=ax,
        )

    return ax
157+
53158

54159
def get_chunks(pd_obj: P, n_chunks: int) -> Iterator[P]:
55160
# Late import to avoid hard dependency on pandas
@@ -79,6 +184,16 @@ def get_chunks(pd_obj: P, n_chunks: int) -> Iterator[P]:
79184
yield pd_obj.iloc[start:end]
80185

81186

187+
def get_index_level_idx(obj: pd.DataFrame | pd.Series, index_level: str) -> int:
    """
    Get the position of an index level within a pandas object's index names.

    Parameters
    ----------
    obj
        Pandas object whose (multi-)index is searched.

    index_level
        Name of the index level to locate.

    Returns
    -------
    :
        Zero-based position of `index_level` in `obj.index.names`.

    Raises
    ------
    KeyError
        `index_level` is not one of `obj.index.names`.
    """
    names = obj.index.names
    try:
        return names.index(index_level)
    except ValueError as exc:
        msg = f"{index_level} not available. {obj.index.names=}"
        raise KeyError(msg) from exc
195+
196+
82197
def get_timeseries_parallel_helper(
83198
df: pd.DataFrame,
84199
interpolation: InterpolationOption,
@@ -107,12 +222,7 @@ def get_timeseries_parallel_helper(
107222
else:
108223
meth_to_call = "apply"
109224

110-
try:
111-
units_idx = df.index.names.index(units_col)
112-
except ValueError as exc:
113-
msg = f"{units_col} not available. {df.index.names=}"
114-
115-
raise KeyError(msg) from exc
225+
units_idx = get_index_level_idx(df, index_level=units_col)
116226

117227
res = getattr(df, meth_to_call)(
118228
# TODO: make this injectable too
@@ -125,6 +235,8 @@ def get_timeseries_parallel_helper(
125235
idx_separator=idx_separator,
126236
ur=ur,
127237
)
238+
# Units now handled by timeseries
239+
res = res.reset_index(units_col, drop=True)
128240

129241
return res
130242

0 commit comments

Comments
 (0)