Commit 2141063

WIP: Split out apply pandas parallel function
1 parent b18ef75 commit 2141063
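
In brief: this pulls the process-pool plumbing that previously lived inline in
`DataFrameCTAccessor.to_timeseries` out into a reusable `apply_pandas_op_parallel`
function, then reuses it to add a (still WIP) `differentiate` method to the
Series accessor. The call pattern the tutorial exercises after this change
(names as in the diff below):

    # Serial, with a progress bar
    small_ts.ct.differentiate(progress=True)

    # Parallel over n_processes chunks, with nested per-chunk progress bars
    diff_ts.ct.differentiate(
        n_processes=n_processes,
        progress=True,
        progress_nested=True,
        mp_context=multiprocessing.get_context("fork"),
    )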

2 files changed (+196, -60 lines)

docs/tutorials/pandas_accessor_tutorial.py

Lines changed: 63 additions & 7 deletions
@@ -175,6 +175,9 @@ def create_df(
 )
 small_ts
 
+# %%
+small_ts.ct.differentiate(progress=True)
+
 # %% [markdown]
 # Then we can use standard Continuous timeseries APIs,
 # e.g. plotting.
@@ -201,6 +204,15 @@ def create_df(
 )
 ax.legend()
 
+# %%
+# TODO: move this to ops section
+ax = (
+    small_ts.loc[pix.isin(variable="variable_0", run=0)]
+    .ct.differentiate()
+    .ct.plot(label="scenario", continuous_plot_kwargs=dict(alpha=0.9))
+)
+ax.legend()
+
 # %% [markdown]
 # If we have a bigger `pd.DataFrame`, the conversion process can be much slower.
 
@@ -260,7 +272,7 @@ def create_df(
 # [here](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)).
 
 # %%
-bigger_df.ct.to_timeseries(
+bigger_ts = bigger_df.ct.to_timeseries(
     time_units="yr",
     interpolation=ct.InterpolationOption.Linear,
     n_processes=n_processes,
@@ -272,6 +284,46 @@ def create_df(
     progress_nested=True,
     mp_context=multiprocessing.get_context("fork"),
 )
+bigger_ts
+
+# %% [markdown]
+# The same logic can be applied to other operations.
+
+# %%
+diff_ts = create_df(
+    n_scenarios=50,
+    n_variables=1,
+    n_runs=600,
+    timepoints=np.arange(75) + 2025.0,
+).ct.to_timeseries(
+    time_units="yr",
+    interpolation=ct.InterpolationOption.Linear,
+    n_processes=n_processes,
+    progress=True,
+    progress_nested=True,
+    mp_context=multiprocessing.get_context("fork"),
+)
+diff_ts
+
+# %%
+diff_ts.ct.differentiate(progress=True)
+
+# %%
+diff_ts.ct.differentiate(n_processes=n_processes)
+
+# %%
+diff_ts.ct.differentiate(n_processes=n_processes, progress=True)
+
+# %%
+diff_ts.ct.differentiate(
+    n_processes=n_processes,
+    progress=True,
+    progress_nested=True,
+    mp_context=multiprocessing.get_context("fork"),
+)
+
+# %% [markdown]
+# Demonstrate how to control parallel etc. with global config.
 
 # %% [markdown]
 # On big `pd.DataFrame`'s the combination with
@@ -306,7 +358,7 @@ def create_df(
 # %%
 sns_df = small_ts.loc[
     pix.isin(scenario=[f"scenario_{i}" for i in range(2)])
-# Rename to `to_tidy_df`
+    # Rename to `to_tidy_df`
 ].ct.to_sns_df(increase_resolution=100)
 sns_df
 
@@ -330,9 +382,15 @@ def create_df(
 )
 
 fig, ax = plt.subplots()
-for scenario, s_ts in small_ts.loc[pix.isin(variable="variable_0")].groupby("scenario", observed=True):
+for scenario, s_ts in small_ts.loc[pix.isin(variable="variable_0")].groupby(
+    "scenario", observed=True
+):
     for quantiles, alpha in quantiles_plumes:
-        s_quants = s_ts.ct.to_df(increase_resolution=increase_resolution).groupby(small_ts.index.names.difference(plumes_over), observed=True).quantile(quantiles)
+        s_quants = (
+            s_ts.ct.to_df(increase_resolution=increase_resolution)
+            .groupby(small_ts.index.names.difference(plumes_over), observed=True)
+            .quantile(quantiles)
+        )
         if isinstance(quantiles, tuple):
             ax.fill_between(
                 s_quants.columns.values.squeeze(),
@@ -355,14 +413,12 @@ def create_df(
 
 # %%
 (
-    small_ts
-    .ct.to_df(increase_resolution=5)
+    small_ts.ct.to_df(increase_resolution=5)
     .groupby(small_ts.index.names.difference(["run"]), observed=True)
     .quantile([0.05, 0.5, 0.95])
 )
 
 # %% [markdown]
-# - other operations, also with progress, parallel, parallel with progress
 # - plot with basic control over labels
 # - plot with grouping and plumes for ranges (basically reproduce scmdata API)
 # - convert with more fine-grained control over interpolation
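
Aside: the quantile pipelines reflowed above all follow the same pandas pattern:
group a wide frame (index = metadata levels, columns = time) over every index
level except the one being collapsed, then take quantiles. A minimal
self-contained sketch of that pattern, with made-up data standing in for the
tutorial's `small_ts.ct.to_df(...)` output (only numpy and pandas required):

    import numpy as np
    import pandas as pd

    # Wide frame: (scenario, run) index, time columns
    index = pd.MultiIndex.from_product(
        [["scenario_0", "scenario_1"], range(3)], names=["scenario", "run"]
    )
    df = pd.DataFrame(
        np.random.default_rng(0).normal(size=(6, 4)),
        index=index,
        columns=np.arange(2025.0, 2029.0),
    )

    # Collapse the "run" level to quantiles, keeping all other levels,
    # mirroring `groupby(small_ts.index.names.difference(["run"]), observed=True)`
    quantiles = df.groupby(
        df.index.names.difference(["run"]), observed=True
    ).quantile([0.05, 0.5, 0.95])
    print(quantiles)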

src/continuous_timeseries/pandas_accessors.py

Lines changed: 133 additions & 53 deletions
@@ -6,6 +6,7 @@
 
 import concurrent.futures
 from collections.abc import Iterator
+from functools import partial
 from multiprocessing.context import BaseContext
 from typing import TYPE_CHECKING, Any, TypeVar
 
@@ -23,6 +24,94 @@
 P = TypeVar("P", bound=pd.DataFrame | pd.Series[Any])
 
 
+def apply_pandas_op_parallel(
+    obj,
+    op,
+    n_processes: int,
+    progress: bool = False,
+    progress_nested: bool = False,
+    mp_context: BaseContext | None = None,
+):
+    iterator = get_chunks(obj, n_chunks=n_processes)
+    if progress:
+        try:
+            from tqdm.auto import tqdm
+        except ImportError as exc:
+            raise MissingOptionalDependencyError(  # noqa: TRY003
+                "apply_pandas_op_parallel(..., progress=True)", requirement="tqdm"
+            ) from exc
+
+        iterator = tqdm(iterator, desc="submitting to pool")
+
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=n_processes, mp_context=mp_context
+    ) as pool:
+        futures = [
+            pool.submit(
+                op,
+                chunk,
+                progress=progress_nested,
+                progress_bar_position=i,
+            )
+            for i, chunk in enumerate(iterator)
+        ]
+
+        iterator_results = concurrent.futures.as_completed(futures)
+        if progress:
+            iterator_results = tqdm(
+                iterator_results,
+                desc="Retrieving parallel results",
+                total=len(futures),
+            )
+
+        res_l = [future.result() for future in iterator_results]
+
+    # Late import to avoid hard dependency on pandas
+    try:
+        import pandas as pd
+    except ImportError as exc:
+        raise MissingOptionalDependencyError(
+            "apply_pandas_op_parallel", requirement="pandas"
+        ) from exc
+
+    # This assumes that the index isn't mangled.
+    # Using pix.concat might be safer,
+    # or we make the concatenation injectable.
+    res = pd.concat(res_l)
+
+    return res
+
+
+def differentiate_parallel_helper(
+    series: pd.Series[Timeseries],
+    progress: bool = False,
+    progress_bar_position: int = 0,
+) -> pd.Series[Timeseries]:
+    if progress:
+        try:
+            from tqdm.auto import tqdm
+        except ImportError as exc:
+            raise MissingOptionalDependencyError(  # noqa: TRY003
+                "differentiate(..., progress=True)", requirement="tqdm"
+            ) from exc
+
+        tqdm_kwargs = dict(position=progress_bar_position)
+        tqdm.pandas(**tqdm_kwargs)
+        meth_to_call = "progress_map"
+        # No-one knows why this is needed, but it is in jupyter notebooks
+        print(end=" ")
+
+    else:
+        meth_to_call = "map"
+
+    res = getattr(series, meth_to_call)(
+        lambda x: x.differentiate(),
+        # name="injectable?",
+    )
+
+    return res
+
+
 class SeriesCTAccessor:
     """
     [`pd.Series`][pandas.Series] accessors
@@ -89,7 +178,7 @@ def to_df(self, increase_resolution: int | None = None) -> pd.DataFrame:
         return df
 
     # TODO: add this to DataFrame accessor to allow for time filtering in the middle
-    def to_sns_df(self, increase_resolution: int = 100):
+    def to_sns_df(self, increase_resolution: int = 100) -> pd.DataFrame:
         # TODO: progress bar and parallelisation
         # TODO: time_units and out_units passing
         return (
@@ -102,6 +191,33 @@ def to_sns_df(self, increase_resolution: int = 100):
             .reset_index()
         )
 
+    def differentiate(
+        self,
+        # res_name: str = "ts",
+        progress: bool = False,
+        progress_nested: bool = False,
+        n_processes: int = 1,
+        mp_context: BaseContext | None = None,
+    ) -> pd.Series[Timeseries]:  # type: ignore
+        if n_processes == 1:
+            res = differentiate_parallel_helper(
+                self._series,
+                progress=progress,
+            )
+
+            return res
+
+        res = apply_pandas_op_parallel(
+            self._series,
+            op=differentiate_parallel_helper,
+            n_processes=n_processes,
+            progress=progress,
+            progress_nested=progress_nested,
+            mp_context=mp_context,
+        )
+
+        return res
+
     def plot(
         self,
         label: str | tuple[str, ...] | None = None,
@@ -215,8 +331,7 @@ def get_timeseries_parallel_helper(
         tqdm_kwargs = dict(position=progress_bar_position)
         tqdm.pandas(**tqdm_kwargs)
         meth_to_call = "progress_apply"
-        # No-one knows why this is needed, but it is
-        # jupyter notebooks
+        # No-one knows why this is needed, but it is in jupyter notebooks
         print(end=" ")
 
     else:
@@ -288,56 +403,21 @@ def to_timeseries(  # noqa: PLR0913
 
             return res
 
-        # I think it should be possible to split out a
-        # `apply_pandas_op_parallel` or similar function.
-        iterator = get_chunks(self._df, n_chunks=n_processes)
-        if progress:
-            try:
-                from tqdm.auto import tqdm
-            except ImportError as exc:
-                raise MissingOptionalDependencyError(  # noqa: TRY003
-                    "to_timeseries(..., progress=True)", requirement="tdqm"
-                ) from exc
-
-            iterator = tqdm(iterator, desc="submitting to pool")
-
-        with concurrent.futures.ProcessPoolExecutor(
-            max_workers=n_processes, mp_context=mp_context
-        ) as pool:
-            futures = [
-                pool.submit(
-                    get_timeseries_parallel_helper,
-                    chunk,
-                    interpolation=interpolation,
-                    time_units=time_units,
-                    units_col=units_col,
-                    idx_separator=idx_separator,
-                    ur=ur,
-                    progress=progress_nested,
-                    progress_bar_position=i,
-                )
-                for i, chunk in enumerate(iterator)
-            ]
-
-            iterator_results = concurrent.futures.as_completed(futures)
-            if progress:
-                iterator_results = tqdm(
-                    iterator_results,
-                    desc="Retrieving parallel results",
-                    total=len(futures),
-                )
-
-            res_l = [future.result() for future in iterator_results]
-
-        # Late import to avoid hard dependency on pandas
-        try:
-            import pandas as pd
-        except ImportError as exc:
-            raise MissingOptionalDependencyError(
-                "interpolate", requirement="pandas"
-            ) from exc
-
-        res = pd.concat(res_l)
+        res = apply_pandas_op_parallel(
+            self._df,
+            op=partial(
+                get_timeseries_parallel_helper,
+                interpolation=interpolation,
+                time_units=time_units,
+                units_col=units_col,
+                idx_separator=idx_separator,
+                ur=ur,
+            ),
+            n_processes=n_processes,
+            progress=progress,
+            progress_nested=progress_nested,
+            mp_context=mp_context,
+        )
 
         return res
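
At its core, the new `apply_pandas_op_parallel` is a chunk/submit/collect loop
over a `concurrent.futures.ProcessPoolExecutor`. A stripped-down, runnable
sketch of the same technique (the `get_chunks` stand-in and the `square_chunk`
op here are hypothetical; the real function also threads tqdm progress bars
through, as in the diff above):

    import concurrent.futures

    import numpy as np
    import pandas as pd

    def get_chunks(obj, n_chunks):
        # Hypothetical stand-in for the library's get_chunks helper:
        # split the pandas object into roughly equal positional chunks.
        return np.array_split(obj, n_chunks)

    def square_chunk(chunk, progress=False, progress_bar_position=0):
        # Example per-chunk op; must be a picklable, top-level function.
        # The progress arguments mirror the signature the pool submission expects.
        return chunk.map(lambda x: x**2)

    def apply_pandas_op_parallel(obj, op, n_processes, mp_context=None):
        chunks = get_chunks(obj, n_chunks=n_processes)
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=n_processes, mp_context=mp_context
        ) as pool:
            futures = [
                pool.submit(op, chunk, progress=False, progress_bar_position=i)
                for i, chunk in enumerate(chunks)
            ]
            # as_completed yields out of submission order, hence the commit's
            # note that concatenating assumes the index isn't mangled
            res_l = [f.result() for f in concurrent.futures.as_completed(futures)]

        return pd.concat(res_l)

    if __name__ == "__main__":
        print(apply_pandas_op_parallel(pd.Series(range(8)), square_chunk, n_processes=2))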