Skip to content

Commit d371ff9

Browse files
committed
fix(singling-out): make singling out evaluator robust to weird column names in input data.
1 parent 063837a commit d371ff9

File tree

2 files changed

+87
-92
lines changed

2 files changed

+87
-92
lines changed

src/anonymeter/evaluators/singling_out_evaluator.py

Lines changed: 68 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import operator
88
from collections.abc import Sequence
99
from functools import reduce
10+
from keyword import iskeyword
1011
from typing import Any, Callable, Optional, Union, cast
1112

1213
import numpy as np
@@ -20,6 +21,40 @@
2021
logger = logging.getLogger(__name__)
2122

2223

24+
def _safe_column_names(df: pd.DataFrame) -> pd.DataFrame:
25+
"""Modify column names in dataframes so that we can use it to build queries.
26+
27+
Mathematical symbols like `-` or other python keywords (or 'datetime')
28+
in column names are replaced.
29+
30+
Parameters
31+
----------
32+
df : pd.DataFrame
33+
Input dataframe
34+
35+
Returns
36+
-------
37+
pd.DataFrame
38+
Dataframe with safe column names
39+
40+
"""
41+
symbols = ["-", "*", "/", "+"]
42+
replace_with = "_"
43+
replacements = {}
44+
for old_column in df.columns:
45+
new_column = old_column
46+
for symbol in symbols:
47+
if symbol in new_column:
48+
new_column = new_column.replace(symbol, replace_with)
49+
50+
if iskeyword(new_column) or new_column == "datetime":
51+
old_column = "_anonymeter_" + new_column
52+
53+
replacements[old_column] = old_column
54+
55+
return df.rename(columns=replacements)
56+
57+
2358
def _escape_quotes(string: str) -> str:
2459
return string.replace('"', '\\"').replace("'", "\\'")
2560

@@ -66,15 +101,14 @@ def _query_from_record(
66101
expr = reduce(operator.and_, expr_components)
67102
return expr
68103

104+
69105
def _operator_choice(
70-
operators: Sequence[Callable[[Any, Any], bool]],
71-
rng: np.random.Generator
106+
operators: Sequence[Callable[[Any, Any], bool]], rng: np.random.Generator
72107
) -> Callable[[Any, Any], bool]:
73-
return rng.choice(operators) #type: ignore[arg-type] # signature of "choice" does not accept a list of callables but works fine in practice
108+
return rng.choice(operators) # type: ignore[arg-type] # signature of "choice" does not accept a list of callables but works fine in practice
109+
74110

75-
def _random_operator(
76-
data_type: str, rng: np.random.Generator
77-
) -> Callable[[Any, Any], Union[bool, pl.Expr]]:
111+
def _random_operator(data_type: str, rng: np.random.Generator) -> Callable[[Any, Any], Union[bool, pl.Expr]]:
78112
if data_type in ["categorical", "boolean"]:
79113
ops: Sequence[Callable[[Any, Any], bool]] = [operator.eq, operator.ne]
80114
elif data_type == "numerical":
@@ -143,32 +177,20 @@ def _random_queries(
143177
rng: np.random.Generator,
144178
) -> list[pl.Expr]:
145179
unique_values = {col: df[col].unique().to_list() for col in df.columns}
146-
column_types = {
147-
col: _convert_polars_dtype(df[col].dtype)
148-
for col in df.columns
149-
}
180+
column_types = {col: _convert_polars_dtype(df[col].dtype) for col in df.columns}
150181

151182
queries = []
152183
for _ in range(n_queries):
153-
selected_cols = rng.choice(
154-
df.columns, size=n_cols, replace=False
155-
).tolist()
184+
selected_cols = rng.choice(df.columns, size=n_cols, replace=False).tolist()
156185

157186
queries.append(
158-
_random_query(
159-
unique_values=unique_values,
160-
cols=selected_cols,
161-
column_types=column_types,
162-
rng=rng
163-
)
187+
_random_query(unique_values=unique_values, cols=selected_cols, column_types=column_types, rng=rng)
164188
)
165189

166190
return queries
167191

168192

169-
def singling_out_probability_integral(
170-
n: int, w_min: float, w_max: float
171-
) -> float:
193+
def singling_out_probability_integral(n: int, w_min: float, w_max: float) -> float:
172194
"""Integral of the singling out probability within a given range.
173195
174196
The probability that a query singles out in a population of size
@@ -198,18 +220,14 @@ def singling_out_probability_integral(
198220
199221
"""
200222
if w_min < 0 or w_min > 1:
201-
raise ValueError(
202-
f"Parameter `w_min` must be between 0 and 1. Got {w_min} instead."
203-
)
223+
raise ValueError(f"Parameter `w_min` must be between 0 and 1. Got {w_min} instead.")
204224

205225
if w_max < w_min or w_max > 1:
206226
raise ValueError(
207227
f"Parameter `w_max` must be greater than w_min ({w_min}) and smaller than 1. Got {w_max} instead."
208228
)
209229

210-
return (
211-
(n * w_min + 1) * (1 - w_min) ** n - (n * w_max + 1) * (1 - w_max) ** n
212-
) / (n + 1)
230+
return ((n * w_min + 1) * (1 - w_min) ** n - (n * w_max + 1) * (1 - w_max) ** n) / (n + 1)
213231

214232

215233
def _measure_queries_success(
@@ -233,9 +251,7 @@ def _model(x, w_eff, norm):
233251
def _fit_model(sizes: npt.NDArray, successes: npt.NDArray) -> Callable:
234252
# initial guesses
235253
w_eff_guess = 1 / np.max(sizes)
236-
norm_guess = 1 / singling_out_probability_integral(
237-
n=np.max(sizes), w_min=0, w_max=w_eff_guess
238-
)
254+
norm_guess = 1 / singling_out_probability_integral(n=np.max(sizes), w_min=0, w_max=w_eff_guess)
239255

240256
popt, _ = curve_fit(
241257
_model,
@@ -265,9 +281,7 @@ def fit_correction_term(df: pl.DataFrame, queries: list[pl.Expr]) -> Callable:
265281
depends on the size of the dataset.
266282
267283
"""
268-
sizes, successes = _measure_queries_success(
269-
df=df, queries=queries, n_repeat=5, n_meas=10
270-
)
284+
sizes, successes = _measure_queries_success(df=df, queries=queries, n_repeat=5, n_meas=10)
271285
return _fit_model(sizes=sizes, successes=successes)
272286

273287

@@ -323,9 +337,7 @@ def queries(self) -> list[pl.Expr]:
323337
return self._list
324338

325339

326-
def univariate_singling_out_queries(
327-
df: pl.DataFrame, n_queries: int, rng: np.random.Generator
328-
) -> list[pl.Expr]:
340+
def univariate_singling_out_queries(df: pl.DataFrame, n_queries: int, rng: np.random.Generator) -> list[pl.Expr]:
329341
"""Generate singling out queries from rare attributes.
330342
331343
Parameters
@@ -374,7 +386,7 @@ def univariate_singling_out_queries(
374386
if len(rare_values) > 0:
375387
queries.extend([pl.col(col) == val for val in rare_values])
376388

377-
rng.shuffle(queries) #type: ignore[arg-type] # signature of "shuffle" does not accept a list of expressions but works fine in practice
389+
rng.shuffle(queries) # type: ignore[arg-type] # signature of "shuffle" does not accept a list of expressions but works fine in practice
378390

379391
unique_so_queries = UniqueSinglingOutQueries(max_size=n_queries)
380392
unique_so_queries.check_and_extend(queries, df)
@@ -444,18 +456,13 @@ def multivariate_singling_out_queries(
444456
# Generate a batch of queries
445457

446458
# Pre-sample all random row indices
447-
random_indices = rng.integers(
448-
low=0, high=df.shape[0], size=batch_size
449-
)
459+
random_indices = rng.integers(low=0, high=df.shape[0], size=batch_size)
450460

451461
# Extract all records in bulk
452462
records = df[random_indices].to_dicts()
453463

454464
# Pre-sample all column choices
455-
selected_columns = [
456-
rng.choice(df.columns, size=n_cols, replace=False).tolist()
457-
for _ in range(batch_size)
458-
]
465+
selected_columns = [rng.choice(df.columns, size=n_cols, replace=False).tolist() for _ in range(batch_size)]
459466

460467
queries_batch = [
461468
_query_from_record(
@@ -478,25 +485,16 @@ def multivariate_singling_out_queries(
478485
return unique_so_queries.queries
479486

480487

481-
def _evaluate_queries(
482-
df: pl.DataFrame, queries: list[pl.Expr]
483-
) -> tuple[int, ...]:
488+
def _evaluate_queries(df: pl.DataFrame, queries: list[pl.Expr]) -> tuple[int, ...]:
484489
if len(queries) == 0:
485490
return ()
486491

487-
result_df = df.select(
488-
[
489-
q.cast(pl.Int64).sum().alias(f"count_{i}")
490-
for i, q in enumerate(queries)
491-
]
492-
)
492+
result_df = df.select([q.cast(pl.Int64).sum().alias(f"count_{i}") for i, q in enumerate(queries)])
493493
counts = result_df.row(0)
494494
return counts
495495

496496

497-
def _evaluate_queries_and_return_successful(
498-
df: pl.DataFrame, queries: list[pl.Expr]
499-
) -> list[pl.Expr]:
497+
def _evaluate_queries_and_return_successful(df: pl.DataFrame, queries: list[pl.Expr]) -> list[pl.Expr]:
500498
counts = _evaluate_queries(df=df, queries=queries)
501499

502500
counts_np = np.array(counts, dtype=float)
@@ -520,9 +518,7 @@ def _generate_singling_out_queries(
520518
rng: np.random.Generator,
521519
) -> list[pl.Expr]:
522520
if mode == "univariate":
523-
queries = univariate_singling_out_queries(
524-
df=df, n_queries=n_attacks, rng=rng
525-
)
521+
queries = univariate_singling_out_queries(df=df, n_queries=n_attacks, rng=rng)
526522

527523
elif mode == "multivariate":
528524
queries = multivariate_singling_out_queries(
@@ -534,9 +530,7 @@ def _generate_singling_out_queries(
534530
)
535531

536532
else:
537-
raise RuntimeError(
538-
f"Parameter `mode` can be either `univariate` or `multivariate`. Got {mode} instead."
539-
)
533+
raise RuntimeError(f"Parameter `mode` can be either `univariate` or `multivariate`. Got {mode} instead.")
540534

541535
if len(queries) < n_attacks:
542536
logger.warning(
@@ -604,16 +598,16 @@ def __init__(
604598
max_attempts: Optional[int] = 10000000,
605599
seed: Optional[int] = None,
606600
):
607-
ori = pl.DataFrame(ori)
608-
syn = pl.DataFrame(syn)
601+
ori = pl.DataFrame(_safe_column_names(ori))
602+
syn = pl.DataFrame(_safe_column_names(syn))
609603
self._ori = ori.unique(maintain_order=True)
610604
self._syn = syn.unique(maintain_order=True)
611605
self._n_attacks = n_attacks
612606
self._n_cols = n_cols
613607
if control is None:
614608
self._control = None
615609
else:
616-
control = pl.DataFrame(control)
610+
control = pl.DataFrame(_safe_column_names(control))
617611
self._control = control.unique(maintain_order=True)
618612
self._max_attempts = max_attempts
619613
self._queries: list[pl.Expr] = []
@@ -659,9 +653,7 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
659653
elif mode == "univariate":
660654
n_cols = 1
661655
else:
662-
raise ValueError(
663-
f"mode must be either 'multivariate' or 'univariate', got {mode} instead."
664-
)
656+
raise ValueError(f"mode must be either 'multivariate' or 'univariate', got {mode} instead.")
665657

666658
queries = _generate_singling_out_queries(
667659
df=self._syn,
@@ -671,9 +663,7 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
671663
max_attempts=self._max_attempts,
672664
rng=self._rng,
673665
)
674-
self._queries = _evaluate_queries_and_return_successful(
675-
df=self._ori, queries=queries
676-
)
666+
self._queries = _evaluate_queries_and_return_successful(df=self._ori, queries=queries)
677667
self._n_success = len(self._queries)
678668

679669
baseline_queries = _random_queries(
@@ -682,31 +672,21 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
682672
n_cols=n_cols,
683673
rng=self._rng,
684674
)
685-
self._baseline_queries = _evaluate_queries_and_return_successful(
686-
df=self._ori, queries=baseline_queries
687-
)
675+
self._baseline_queries = _evaluate_queries_and_return_successful(df=self._ori, queries=baseline_queries)
688676
self._n_baseline = len(self._baseline_queries)
689677

690678
if self._control is None:
691679
self._n_control = None
692680
else:
693-
self._n_control = len(
694-
_evaluate_queries_and_return_successful(
695-
df=self._control, queries=queries
696-
)
697-
)
681+
self._n_control = len(_evaluate_queries_and_return_successful(df=self._control, queries=queries))
698682

699683
# correct the number of success against the control set
700684
# to account for different dataset sizes.
701685
if len(self._control) != len(self._ori):
702686
# fit the model to the data:
703-
fitted_model = fit_correction_term(
704-
df=self._control, queries=queries
705-
)
687+
fitted_model = fit_correction_term(df=self._control, queries=queries)
706688

707-
correction = fitted_model(len(self._ori)) / fitted_model(
708-
len(self._control)
709-
)
689+
correction = fitted_model(len(self._ori)) / fitted_model(len(self._control))
710690
self._n_control *= correction
711691

712692
self._evaluated = True
@@ -727,9 +707,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
727707
728708
"""
729709
if not self._evaluated:
730-
raise RuntimeError(
731-
"The singling out evaluator wasn't evaluated yet. Please, run `evaluate()` first."
732-
)
710+
raise RuntimeError("The singling out evaluator wasn't evaluated yet. Please, run `evaluate()` first.")
733711

734712
return EvaluationResults(
735713
n_attacks=self._n_attacks,
@@ -739,9 +717,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
739717
confidence_level=confidence_level,
740718
)
741719

742-
def risk(
743-
self, confidence_level: float = 0.95, baseline: bool = False
744-
) -> PrivacyRisk:
720+
def risk(self, confidence_level: float = 0.95, baseline: bool = False) -> PrivacyRisk:
745721
"""Estimate the singling out risk.
746722
747723
The risk is estimated comparing the number of successful singling out

tests/test_singling_out_evaluator.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,22 @@ def test_so_evaluator_max_attempts(max_attempts: int) -> None:
181181
soe.evaluate(mode="multivariate")
182182

183183
assert len(soe.queries()) <= max_attempts
184+
185+
186+
@pytest.mark.parametrize("mode", ["univariate", "multivariate"])
187+
def test_so_weird_column_names(mode: str) -> None:
188+
ori = pd.DataFrame(
189+
{
190+
"capital-gain": [100321.23, -2341.2, 4552.343],
191+
"hr/week": [32, 48, 38],
192+
"datetime": ["11:52", "06:00", "11:11"],
193+
}
194+
)
195+
196+
soe = SinglingOutEvaluator(
197+
ori=ori,
198+
syn=ori,
199+
n_attacks=3,
200+
n_cols=3,
201+
)
202+
soe.evaluate(mode=mode)

0 commit comments

Comments
 (0)