Skip to content

Commit d46b1a3

Browse files
committed
Add more features
1 parent 70846c9 commit d46b1a3

File tree

8 files changed

+485
-151
lines changed

8 files changed

+485
-151
lines changed

docs/tutorials/pandas_accessor_tutorial.py

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@
3535
import numpy as np
3636
import openscm_units
3737
import pandas as pd
38+
import pandas_indexing as pix
3839
import pint
40+
import seaborn as sns
3941

4042
import continuous_timeseries as ct
4143
import continuous_timeseries.pandas_accessors
@@ -115,19 +117,21 @@ def create_df(
115117
(
116118
(s, v, r, units)
117119
for s, v, r in itertools.product(
118-
[f"variable_{i}" for i in range(n_variables)],
119120
[f"scenario_{i}" for i in range(n_scenarios)],
121+
[f"variable_{i}" for i in range(n_variables)],
120122
[i for i in range(n_runs)],
121123
)
122124
),
123-
columns=["scenario", "variable", "region", "units"],
124-
# This makes updates later way way faster
125+
columns=["scenario", "variable", "run", "units"],
126+
# This makes updates and general handling later way way faster.
127+
# TODO: make this tip clearer.
125128
dtype="category",
126129
)
127130
)
128131

132+
n_ts = n_scenarios * n_variables * n_runs
129133
df = pd.DataFrame(
130-
np.random.random((n_variables * n_runs * n_scenarios, timepoints.size)),
134+
50.0 * np.linspace(0.3, 1, n_ts)[:, np.newaxis] * np.linspace(0, 1, timepoints.size)[np.newaxis, :] + np.random.random((n_ts, timepoints.size)),
131135
columns=timepoints,
132136
index=idx,
133137
)
@@ -150,9 +154,9 @@ def create_df(
150154

151155
# %%
152156
small_df = create_df(
153-
n_scenarios=25,
154-
n_variables=10,
155-
n_runs=30,
157+
n_scenarios=3,
158+
n_variables=2,
159+
n_runs=5,
156160
timepoints=np.arange(250) + 1850.0,
157161
)
158162
small_df
@@ -161,21 +165,36 @@ def create_df(
161165
# Then we convert it time series.
162166

163167
# %%
164-
small_df.ct.to_timeseries(
168+
small_ts = small_df.ct.to_timeseries(
165169
time_units="yr",
166170
interpolation=ct.InterpolationOption.PiecewiseConstantPreviousLeftClosed,
167171
)
172+
small_ts
168173

169174
# %% [markdown]
170175
# Then we can use standard Continuous timeseries APIs,
171176
# e.g. plotting.
172177

173178
# %%
179+
small_ts.ct.plot(continuous_plot_kwargs=dict(alpha=0.3))
180+
# # TODO: move this to plotting
181+
# small_ts.ct.plot(continuous_plot_kwargs=dict(alpha=0.3), progress=True)
174182

175183
# %% [markdown]
176-
# If we have a bigger `pd.DataFrame`, this process can be much slower.
177-
# If you're not sure what's happening, you can activate the progress bar if you have
178-
# [`tdqm`](https://tqdm.github.io/) installed.
184+
# When combined with [pandas-indexing](https://pandas-indexing.readthedocs.io/en/latest/index.html),
185+
# this can be quite powerful for quick plots.
186+
187+
# %%
188+
ax = small_ts.loc[pix.isin(variable="variable_0")].ct.plot(continuous_plot_kwargs=dict(alpha=0.3))
189+
ax.legend(ncols=3, loc="upper center", bbox_to_anchor=(0.5, -0.15))
190+
191+
# %%
192+
# TODO: move this to plotting section
193+
ax = small_ts.loc[pix.isin(variable="variable_0", run=0)].ct.plot(label="scenario", continuous_plot_kwargs=dict(alpha=0.9))
194+
ax.legend()
195+
196+
# %% [markdown]
197+
# If we have a bigger `pd.DataFrame`, the conversion process can be much slower.
179198

180199
# %%
181200
bigger_df = create_df(
@@ -184,7 +203,12 @@ def create_df(
184203
n_runs=300,
185204
timepoints=np.arange(351) + 1850.0,
186205
)
187-
bigger_df
206+
bigger_df.shape
207+
208+
# %% [markdown]
209+
# If you want to see the conversion's progress,
210+
# you can activate the progress bar if you have
211+
# [`tqdm`](https://tqdm.github.io/) installed.
188212

189213
# %%
190214
bigger_df.ct.to_timeseries(
@@ -224,7 +248,8 @@ def create_df(
224248
# If you want nested progress bars in parallel,
225249
# we support that too
226250
# (although we're not sure if this works on windows
227-
# because of the need for forking...).
251+
# because of the need for forking, for details see
252+
# [here](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)).
228253

229254
# %%
230255
bigger_df.ct.to_timeseries(
@@ -241,11 +266,55 @@ def create_df(
241266
)
242267

243268
# %% [markdown]
244-
# - filtering with pandas-indexing
245-
# - bigger df
246-
# - convert more rows (progress, parallel, parallel with progress)
269+
# On big `pd.DataFrame`'s the combination with
270+
# [pandas indexing](https://pandas-indexing.readthedocs.io/)
271+
# becomes particularly powerful.
272+
273+
# %%
274+
ax = (
275+
bigger_df
276+
.loc[pix.isin(variable="variable_1")]
277+
.groupby(["scenario", "variable", "units"], observed=True)
278+
.median()
279+
.loc[pix.ismatch(scenario="scenario_1*")]
280+
.ct.to_timeseries(
281+
time_units="yr",
282+
interpolation=ct.InterpolationOption.Quadratic,
283+
)
284+
.ct.plot()
285+
)
286+
ax.legend()
287+
288+
# %%
289+
# # Units don't round trip
290+
# pd.testing.assert_frame_equal(
291+
# small_df,
292+
# small_ts.ct.to_df()
293+
# )
294+
small_ts.ct.to_df()
295+
296+
# %%
297+
small_ts.ct.to_df(increase_resolution=3)
298+
299+
# %%
300+
sns_df = small_ts.loc[pix.isin(scenario=[f"scenario_{i}" for i in range(2)])].ct.to_sns_df(increase_resolution=100)
301+
sns_df
302+
303+
# %%
304+
sns.lineplot(
305+
data=sns_df[sns_df["time"] <= 1855],
306+
x="time",
307+
y="value",
308+
hue="scenario",
309+
style="variable",
310+
estimator=None,
311+
units="run",
312+
)
313+
314+
# %% [markdown]
247315
# - other operations, also with progress, parallel, parallel with progress
248-
# - convert to seaborn df for more fine-grained plotting control
249-
# - also requires adding a `increase_resolution` method to `Timeseries`
316+
# - plot with basic control over labels
317+
# - plot with grouping and plumes for ranges
250318
# - convert with more fine-grained control over interpolation
319+
# (e.g. interpolation being passed as pd.Series)
251320
# - unit conversion

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ docs = [
9393
"jupyterlab==4.3.4",
9494
"jupytext==1.16.6",
9595
"mkdocs-jupyter==0.25.1",
96+
"pandas-indexing>=0.6.1",
9697
"seaborn>=0.13.2",
9798
]
9899
tests = [

src/continuous_timeseries/pandas_accessors.py

Lines changed: 118 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from continuous_timeseries.timeseries import Timeseries
1818

1919
if TYPE_CHECKING:
20+
import matplotlib
2021
import pandas as pd
2122

2223
P = TypeVar("P", bound=pd.DataFrame | pd.Series[Any])
@@ -50,6 +51,110 @@ def metadata(self) -> pd.DataFrame:
5051
"""
5152
return self._series.index.to_frame(index=False)
5253

54+
def to_df(self, increase_resolution: int | None = None) -> pd.DataFrame:
    """
    Convert the wrapped `pd.Series` of timeseries to a wide `pd.DataFrame`.

    Each row is one timeseries, discretised via its `to_pandas_series`
    method.  The name of each discretised series (which carries its units)
    is appended to the output index as a "units" level.

    Parameters
    ----------
    increase_resolution
        If not `None`, increase the resolution of each timeseries
        by this factor before discretising.

    Returns
    -------
    :
        Wide `pd.DataFrame` with one row per timeseries
        and one column per time point.
    """
    # Late import to avoid hard dependency on pandas
    try:
        import pandas as pd
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            # Name the API the user actually called
            # (was a copy-paste of "to_pandas_series").
            "to_df", requirement="pandas"
        ) from exc

    series_l = []
    indexes_l = []
    # TODO: progress bar and parallelisation
    for idx, value in self._series.items():
        # TODO: time_units and out_units passing
        if increase_resolution is not None:
            value_use = value.increase_resolution(increase_resolution)
        else:
            value_use = value

        pd_series = value_use.to_pandas_series()
        series_l.append(pd_series)
        # The discretised series' name carries its units
        indexes_l.append((*idx, pd_series.name))

    # Use a distinct name so we don't shadow the loop variable `idx` above
    out_index = pd.MultiIndex.from_frame(
        pd.DataFrame(
            indexes_l,
            columns=[*self._series.index.names, "units"],
            # Categoricals make later updates and general handling much faster
            dtype="category",
        )
    )
    df = pd.DataFrame(
        series_l,
        index=out_index,
    )

    return df
90+
91+
# TODO: add this to DataFrame accessor to allow for time filtering in the middle
92+
def to_sns_df(self, increase_resolution: int = 100):
    """
    Convert to a long-form `pd.DataFrame` suitable for plotting with seaborn.

    Parameters
    ----------
    increase_resolution
        Factor by which to increase the resolution of each timeseries
        before discretising (passed through to `to_df`).

    Returns
    -------
    :
        Long-form data: the index metadata as columns,
        plus "time" and "value" columns.
    """
    # TODO: progress bar and parallelisation
    # TODO: time_units and out_units passing
    wide = self.to_df(increase_resolution=increase_resolution)
    # Will become `.ct.to_sns_df`
    long_form = wide.melt(
        var_name="time",
        ignore_index=False,
    )

    return long_form.reset_index()
104+
105+
def plot(
    self,
    label: str | tuple[str, ...] | None = None,
    show_continuous: bool = True,
    continuous_plot_kwargs: dict[str, Any] | None = None,
    show_discrete: bool = False,
    discrete_plot_kwargs: dict[str, Any] | None = None,
    ax: matplotlib.axes.Axes | None = None,
    progress: bool = False,
) -> matplotlib.axes.Axes:
    """
    Plot each timeseries in the wrapped `pd.Series` onto a single set of axes.

    Parameters
    ----------
    label
        Index level whose value is used as each line's label.
        Tuples (multi-level labels) are not supported yet.

    show_continuous
        Whether to plot the continuous representation.

    continuous_plot_kwargs
        Passed through to each timeseries' plot call
        (with "label" injected when `label` is given).

    show_discrete
        Whether to plot the discrete points.

    discrete_plot_kwargs
        Passed through to each timeseries' plot call.

    ax
        Axes to plot on.  If not supplied, the first timeseries'
        plot call is responsible for creating them.

    progress
        Show a progress bar while plotting (requires `tqdm`).

    Returns
    -------
    :
        Axes on which the timeseries were plotted.

    Raises
    ------
    KeyError
        `label` is given and `continuous_plot_kwargs` already
        contains a "label" entry.
    """
    iterator = self._series.items()
    if progress:
        try:
            from tqdm.auto import tqdm
        except ImportError as exc:
            raise MissingOptionalDependencyError(  # noqa: TRY003
                # Report the entry point the user actually called
                # (was a copy-paste of get_timeseries_parallel_helper's message)
                "plot(..., progress=True)",
                # The package is "tqdm", not "tdqm"
                requirement="tqdm",
            ) from exc

        iterator = tqdm(iterator, desc="Timeseries to plot")

    if label is not None:
        if isinstance(label, tuple):
            raise NotImplementedError()

        label_idx: int | None = get_index_level_idx(self._series, index_level=label)

    else:
        label_idx = None

    for idx, ts in iterator:
        if label_idx is not None:
            # Use a local so we don't clobber the `label` parameter
            row_label = idx[label_idx]
            # `continuous_plot_kwargs` defaults to None:
            # fall back to an empty dict so the merge below cannot fail.
            base_kwargs = (
                continuous_plot_kwargs if continuous_plot_kwargs is not None else {}
            )
            if "label" in base_kwargs:
                # clash (could just warn here instead)
                msg = "continuous_plot_kwargs already contains 'label'"
                raise KeyError(msg)

            continuous_plot_kwargs_use = base_kwargs | dict(label=row_label)

        else:
            continuous_plot_kwargs_use = continuous_plot_kwargs

        ax = ts.plot(
            show_continuous=show_continuous,
            continuous_plot_kwargs=continuous_plot_kwargs_use,
            show_discrete=show_discrete,
            discrete_plot_kwargs=discrete_plot_kwargs,
            ax=ax,
        )

    return ax
157+
53158

54159
def get_chunks(pd_obj: P, n_chunks: int) -> Iterator[P]:
55160
# Late import to avoid hard dependency on pandas
@@ -79,6 +184,16 @@ def get_chunks(pd_obj: P, n_chunks: int) -> Iterator[P]:
79184
yield pd_obj.iloc[start:end]
80185

81186

187+
def get_index_level_idx(obj: pd.DataFrame | pd.Series, index_level: str) -> int:
    """
    Get the position of an index level within a pandas object's index names.

    Parameters
    ----------
    obj
        Pandas object whose (multi-)index is searched.

    index_level
        Name of the index level to locate.

    Returns
    -------
    :
        Zero-based position of `index_level` in `obj.index.names`.

    Raises
    ------
    KeyError
        `index_level` is not one of `obj.index.names`.
    """
    names = obj.index.names
    try:
        return names.index(index_level)
    except ValueError as exc:
        msg = f"{index_level} not available. {obj.index.names=}"
        raise KeyError(msg) from exc
195+
196+
82197
def get_timeseries_parallel_helper(
83198
df: pd.DataFrame,
84199
interpolation: InterpolationOption,
@@ -107,12 +222,7 @@ def get_timeseries_parallel_helper(
107222
else:
108223
meth_to_call = "apply"
109224

110-
try:
111-
units_idx = df.index.names.index(units_col)
112-
except ValueError as exc:
113-
msg = f"{units_col} not available. {df.index.names=}"
114-
115-
raise KeyError(msg) from exc
225+
units_idx = get_index_level_idx(df, index_level=units_col)
116226

117227
res = getattr(df, meth_to_call)(
118228
# TODO: make this injectable too
@@ -125,6 +235,8 @@ def get_timeseries_parallel_helper(
125235
idx_separator=idx_separator,
126236
ur=ur,
127237
)
238+
# Units now handled by timeseries
239+
res = res.reset_index(units_col, drop=True)
128240

129241
return res
130242

0 commit comments

Comments
 (0)