7
7
import operator
8
8
from collections .abc import Sequence
9
9
from functools import reduce
10
+ from keyword import iskeyword
10
11
from typing import Any , Callable , Optional , Union , cast
11
12
12
13
import numpy as np
20
21
logger = logging .getLogger (__name__ )
21
22
22
23
24
+ def _safe_column_names (df : pd .DataFrame ) -> pd .DataFrame :
25
+ """Modify column names in dataframes so that we can use it to build queries.
26
+
27
+ Mathematical symbols like `-` or other python keywords (or 'datetime')
28
+ in column names are replaced.
29
+
30
+ Parameters
31
+ ----------
32
+ df : pd.DataFrame
33
+ Input dataframe
34
+
35
+ Returns
36
+ -------
37
+ pd.DataFrame
38
+ Dataframe with safe column names
39
+
40
+ """
41
+ symbols = ["-" , "*" , "/" , "+" ]
42
+ replace_with = "_"
43
+ replacements = {}
44
+ for old_column in df .columns :
45
+ new_column = old_column
46
+ for symbol in symbols :
47
+ if symbol in new_column :
48
+ new_column = new_column .replace (symbol , replace_with )
49
+
50
+ if iskeyword (new_column ) or new_column == "datetime" :
51
+ old_column = "_anonymeter_" + new_column
52
+
53
+ replacements [old_column ] = old_column
54
+
55
+ return df .rename (columns = replacements )
56
+
57
+
23
58
def _escape_quotes (string : str ) -> str :
24
59
return string .replace ('"' , '\\ "' ).replace ("'" , "\\ '" )
25
60
@@ -66,15 +101,14 @@ def _query_from_record(
66
101
expr = reduce (operator .and_ , expr_components )
67
102
return expr
68
103
104
+
69
105
def _operator_choice (
70
- operators : Sequence [Callable [[Any , Any ], bool ]],
71
- rng : np .random .Generator
106
+ operators : Sequence [Callable [[Any , Any ], bool ]], rng : np .random .Generator
72
107
) -> Callable [[Any , Any ], bool ]:
73
- return rng .choice (operators ) #type: ignore[arg-type] # signature of "choice" does not accept a list of callables but works fine in practice
108
+ return rng .choice (operators ) # type: ignore[arg-type] # signature of "choice" does not accept a list of callables but works fine in practice
109
+
74
110
75
- def _random_operator (
76
- data_type : str , rng : np .random .Generator
77
- ) -> Callable [[Any , Any ], Union [bool , pl .Expr ]]:
111
+ def _random_operator (data_type : str , rng : np .random .Generator ) -> Callable [[Any , Any ], Union [bool , pl .Expr ]]:
78
112
if data_type in ["categorical" , "boolean" ]:
79
113
ops : Sequence [Callable [[Any , Any ], bool ]] = [operator .eq , operator .ne ]
80
114
elif data_type == "numerical" :
@@ -143,32 +177,20 @@ def _random_queries(
143
177
rng : np .random .Generator ,
144
178
) -> list [pl .Expr ]:
145
179
unique_values = {col : df [col ].unique ().to_list () for col in df .columns }
146
- column_types = {
147
- col : _convert_polars_dtype (df [col ].dtype )
148
- for col in df .columns
149
- }
180
+ column_types = {col : _convert_polars_dtype (df [col ].dtype ) for col in df .columns }
150
181
151
182
queries = []
152
183
for _ in range (n_queries ):
153
- selected_cols = rng .choice (
154
- df .columns , size = n_cols , replace = False
155
- ).tolist ()
184
+ selected_cols = rng .choice (df .columns , size = n_cols , replace = False ).tolist ()
156
185
157
186
queries .append (
158
- _random_query (
159
- unique_values = unique_values ,
160
- cols = selected_cols ,
161
- column_types = column_types ,
162
- rng = rng
163
- )
187
+ _random_query (unique_values = unique_values , cols = selected_cols , column_types = column_types , rng = rng )
164
188
)
165
189
166
190
return queries
167
191
168
192
169
- def singling_out_probability_integral (
170
- n : int , w_min : float , w_max : float
171
- ) -> float :
193
+ def singling_out_probability_integral (n : int , w_min : float , w_max : float ) -> float :
172
194
"""Integral of the singling out probability within a given range.
173
195
174
196
The probability that a query singles out in a population of size
@@ -198,18 +220,14 @@ def singling_out_probability_integral(
198
220
199
221
"""
200
222
if w_min < 0 or w_min > 1 :
201
- raise ValueError (
202
- f"Parameter `w_min` must be between 0 and 1. Got { w_min } instead."
203
- )
223
+ raise ValueError (f"Parameter `w_min` must be between 0 and 1. Got { w_min } instead." )
204
224
205
225
if w_max < w_min or w_max > 1 :
206
226
raise ValueError (
207
227
f"Parameter `w_max` must be greater than w_min ({ w_min } ) and smaller than 1. Got { w_max } instead."
208
228
)
209
229
210
- return (
211
- (n * w_min + 1 ) * (1 - w_min ) ** n - (n * w_max + 1 ) * (1 - w_max ) ** n
212
- ) / (n + 1 )
230
+ return ((n * w_min + 1 ) * (1 - w_min ) ** n - (n * w_max + 1 ) * (1 - w_max ) ** n ) / (n + 1 )
213
231
214
232
215
233
def _measure_queries_success (
@@ -233,9 +251,7 @@ def _model(x, w_eff, norm):
233
251
def _fit_model (sizes : npt .NDArray , successes : npt .NDArray ) -> Callable :
234
252
# initial guesses
235
253
w_eff_guess = 1 / np .max (sizes )
236
- norm_guess = 1 / singling_out_probability_integral (
237
- n = np .max (sizes ), w_min = 0 , w_max = w_eff_guess
238
- )
254
+ norm_guess = 1 / singling_out_probability_integral (n = np .max (sizes ), w_min = 0 , w_max = w_eff_guess )
239
255
240
256
popt , _ = curve_fit (
241
257
_model ,
@@ -265,9 +281,7 @@ def fit_correction_term(df: pl.DataFrame, queries: list[pl.Expr]) -> Callable:
265
281
depends on the size of the dataset.
266
282
267
283
"""
268
- sizes , successes = _measure_queries_success (
269
- df = df , queries = queries , n_repeat = 5 , n_meas = 10
270
- )
284
+ sizes , successes = _measure_queries_success (df = df , queries = queries , n_repeat = 5 , n_meas = 10 )
271
285
return _fit_model (sizes = sizes , successes = successes )
272
286
273
287
@@ -323,9 +337,7 @@ def queries(self) -> list[pl.Expr]:
323
337
return self ._list
324
338
325
339
326
- def univariate_singling_out_queries (
327
- df : pl .DataFrame , n_queries : int , rng : np .random .Generator
328
- ) -> list [pl .Expr ]:
340
+ def univariate_singling_out_queries (df : pl .DataFrame , n_queries : int , rng : np .random .Generator ) -> list [pl .Expr ]:
329
341
"""Generate singling out queries from rare attributes.
330
342
331
343
Parameters
@@ -374,7 +386,7 @@ def univariate_singling_out_queries(
374
386
if len (rare_values ) > 0 :
375
387
queries .extend ([pl .col (col ) == val for val in rare_values ])
376
388
377
- rng .shuffle (queries ) # type: ignore[arg-type] # signature of "shuffle" does not accept a list of expressions but works fine in practice
389
+ rng .shuffle (queries ) # type: ignore[arg-type] # signature of "shuffle" does not accept a list of expressions but works fine in practice
378
390
379
391
unique_so_queries = UniqueSinglingOutQueries (max_size = n_queries )
380
392
unique_so_queries .check_and_extend (queries , df )
@@ -444,18 +456,13 @@ def multivariate_singling_out_queries(
444
456
# Generate a batch of queries
445
457
446
458
# Pre-sample all random row indices
447
- random_indices = rng .integers (
448
- low = 0 , high = df .shape [0 ], size = batch_size
449
- )
459
+ random_indices = rng .integers (low = 0 , high = df .shape [0 ], size = batch_size )
450
460
451
461
# Extract all records in bulk
452
462
records = df [random_indices ].to_dicts ()
453
463
454
464
# Pre-sample all column choices
455
- selected_columns = [
456
- rng .choice (df .columns , size = n_cols , replace = False ).tolist ()
457
- for _ in range (batch_size )
458
- ]
465
+ selected_columns = [rng .choice (df .columns , size = n_cols , replace = False ).tolist () for _ in range (batch_size )]
459
466
460
467
queries_batch = [
461
468
_query_from_record (
@@ -478,25 +485,16 @@ def multivariate_singling_out_queries(
478
485
return unique_so_queries .queries
479
486
480
487
481
def _evaluate_queries(df: pl.DataFrame, queries: list[pl.Expr]) -> tuple[int, ...]:
    """Count, for every query, how many rows of ``df`` satisfy it.

    All queries are evaluated in a single ``select``: each expression is
    cast to ``Int64`` and summed, giving one count column per query.

    Parameters
    ----------
    df : pl.DataFrame
        Dataframe on which the queries are evaluated.
    queries : list of pl.Expr
        Query expressions to evaluate.

    Returns
    -------
    tuple of int
        One count per query, in the same order as ``queries``. An empty
        tuple is returned when ``queries`` is empty.

    """
    if not queries:
        return ()

    # The alias only keeps the output column names distinct; it is not returned.
    count_exprs = [query.cast(pl.Int64).sum().alias(f"count_{i}") for i, query in enumerate(queries)]

    # The counts are read from the first row of the aggregated result.
    counts = df.select(count_exprs).row(0)
    return counts
495
495
496
496
497
- def _evaluate_queries_and_return_successful (
498
- df : pl .DataFrame , queries : list [pl .Expr ]
499
- ) -> list [pl .Expr ]:
497
+ def _evaluate_queries_and_return_successful (df : pl .DataFrame , queries : list [pl .Expr ]) -> list [pl .Expr ]:
500
498
counts = _evaluate_queries (df = df , queries = queries )
501
499
502
500
counts_np = np .array (counts , dtype = float )
@@ -520,9 +518,7 @@ def _generate_singling_out_queries(
520
518
rng : np .random .Generator ,
521
519
) -> list [pl .Expr ]:
522
520
if mode == "univariate" :
523
- queries = univariate_singling_out_queries (
524
- df = df , n_queries = n_attacks , rng = rng
525
- )
521
+ queries = univariate_singling_out_queries (df = df , n_queries = n_attacks , rng = rng )
526
522
527
523
elif mode == "multivariate" :
528
524
queries = multivariate_singling_out_queries (
@@ -534,9 +530,7 @@ def _generate_singling_out_queries(
534
530
)
535
531
536
532
else :
537
- raise RuntimeError (
538
- f"Parameter `mode` can be either `univariate` or `multivariate`. Got { mode } instead."
539
- )
533
+ raise RuntimeError (f"Parameter `mode` can be either `univariate` or `multivariate`. Got { mode } instead." )
540
534
541
535
if len (queries ) < n_attacks :
542
536
logger .warning (
@@ -604,16 +598,16 @@ def __init__(
604
598
max_attempts : Optional [int ] = 10000000 ,
605
599
seed : Optional [int ] = None ,
606
600
):
607
- ori = pl .DataFrame (ori )
608
- syn = pl .DataFrame (syn )
601
+ ori = pl .DataFrame (_safe_column_names ( ori ) )
602
+ syn = pl .DataFrame (_safe_column_names ( syn ) )
609
603
self ._ori = ori .unique (maintain_order = True )
610
604
self ._syn = syn .unique (maintain_order = True )
611
605
self ._n_attacks = n_attacks
612
606
self ._n_cols = n_cols
613
607
if control is None :
614
608
self ._control = None
615
609
else :
616
- control = pl .DataFrame (control )
610
+ control = pl .DataFrame (_safe_column_names ( control ) )
617
611
self ._control = control .unique (maintain_order = True )
618
612
self ._max_attempts = max_attempts
619
613
self ._queries : list [pl .Expr ] = []
@@ -659,9 +653,7 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
659
653
elif mode == "univariate" :
660
654
n_cols = 1
661
655
else :
662
- raise ValueError (
663
- f"mode must be either 'multivariate' or 'univariate', got { mode } instead."
664
- )
656
+ raise ValueError (f"mode must be either 'multivariate' or 'univariate', got { mode } instead." )
665
657
666
658
queries = _generate_singling_out_queries (
667
659
df = self ._syn ,
@@ -671,9 +663,7 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
671
663
max_attempts = self ._max_attempts ,
672
664
rng = self ._rng ,
673
665
)
674
- self ._queries = _evaluate_queries_and_return_successful (
675
- df = self ._ori , queries = queries
676
- )
666
+ self ._queries = _evaluate_queries_and_return_successful (df = self ._ori , queries = queries )
677
667
self ._n_success = len (self ._queries )
678
668
679
669
baseline_queries = _random_queries (
@@ -682,31 +672,21 @@ def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
682
672
n_cols = n_cols ,
683
673
rng = self ._rng ,
684
674
)
685
- self ._baseline_queries = _evaluate_queries_and_return_successful (
686
- df = self ._ori , queries = baseline_queries
687
- )
675
+ self ._baseline_queries = _evaluate_queries_and_return_successful (df = self ._ori , queries = baseline_queries )
688
676
self ._n_baseline = len (self ._baseline_queries )
689
677
690
678
if self ._control is None :
691
679
self ._n_control = None
692
680
else :
693
- self ._n_control = len (
694
- _evaluate_queries_and_return_successful (
695
- df = self ._control , queries = queries
696
- )
697
- )
681
+ self ._n_control = len (_evaluate_queries_and_return_successful (df = self ._control , queries = queries ))
698
682
699
683
# correct the number of success against the control set
700
684
# to account for different dataset sizes.
701
685
if len (self ._control ) != len (self ._ori ):
702
686
# fit the model to the data:
703
- fitted_model = fit_correction_term (
704
- df = self ._control , queries = queries
705
- )
687
+ fitted_model = fit_correction_term (df = self ._control , queries = queries )
706
688
707
- correction = fitted_model (len (self ._ori )) / fitted_model (
708
- len (self ._control )
709
- )
689
+ correction = fitted_model (len (self ._ori )) / fitted_model (len (self ._control ))
710
690
self ._n_control *= correction
711
691
712
692
self ._evaluated = True
@@ -727,9 +707,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
727
707
728
708
"""
729
709
if not self ._evaluated :
730
- raise RuntimeError (
731
- "The singling out evaluator wasn't evaluated yet. Please, run `evaluate()` first."
732
- )
710
+ raise RuntimeError ("The singling out evaluator wasn't evaluated yet. Please, run `evaluate()` first." )
733
711
734
712
return EvaluationResults (
735
713
n_attacks = self ._n_attacks ,
@@ -739,9 +717,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
739
717
confidence_level = confidence_level ,
740
718
)
741
719
742
- def risk (
743
- self , confidence_level : float = 0.95 , baseline : bool = False
744
- ) -> PrivacyRisk :
720
+ def risk (self , confidence_level : float = 0.95 , baseline : bool = False ) -> PrivacyRisk :
745
721
"""Estimate the singling out risk.
746
722
747
723
The risk is estimated comparing the number of successful singling out
0 commit comments