Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions pipeline_dp/histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ def quantiles(self, q: List[float]) -> List[int]:
return result[::-1]


def histogram_precomputed_tails(hist: "Histogram") -> List[tuple]:
    """Computes running totals over the tail of a histogram, one per bin.

    The tail of a bin is the bin itself plus every bin that follows it in
    hist.bins.

    NOTE(review): during review it was suggested to rename this function to
    compute_percentage_dropped_for_threshold and to return
    (bin.lower, ratio_data_dropped) pairs, optionally together with the total
    amount of data. That proposal is not applied here; absolute totals are
    returned.

    Args:
      hist: histogram whose bins expose `lower` and `count` attributes.

    Returns:
      A list with one element per bin, in the same order as hist.bins. The
      element for a bin is (bin.lower, (accumulated_weight, accumulated_count)),
      where accumulated_weight is the sum of lower * count and
      accumulated_count is the sum of count over the tail of that bin. The list
      is empty when the histogram has no bins.
    """
    tails = []
    accumulated_weight = 0
    accumulated_count = 0
    # Walk from the last bin to the first so that every tail is just the
    # running total seen so far.
    for b in reversed(hist.bins):
        accumulated_weight += b.lower * b.count
        accumulated_count += b.count
        tails.append((b.lower, (accumulated_weight, accumulated_count)))
    # Entries were collected back to front; restore the order of hist.bins.
    tails.reverse()
    return tails


@dataclass
class DatasetHistograms:
"""Contains histograms useful for parameter tuning."""
Expand Down
25 changes: 10 additions & 15 deletions pipeline_dp/private_contribution_bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import pipeline_dp
from pipeline_dp import dp_computations

from pipeline_dp.histograms import Histogram
from pipeline_dp import histograms
from pipeline_dp import pipeline_functions


Expand All @@ -46,7 +46,7 @@ def __init__(self,

@dataclass
class Inputs:
    """Groups an L0 histogram with the number of partitions."""
    # The histogram type is referenced through the `histograms` module import.
    l0_histogram: histograms.Histogram
    number_of_partitions: int

@lru_cache(maxsize=None)
Expand Down Expand Up @@ -94,11 +94,14 @@ class L0ScoringFunction(dp_computations.ExponentialMechanism.ScoringFunction):

def __init__(self,
             params: pipeline_dp.CalculatePrivateContributionBoundsParams,
             number_of_partitions: int,
             l0_histogram: histograms.Histogram):
    """Stores the inputs and precomputes the tail totals of the histogram.

    Args:
      params: parameters of the private contribution bounds calculation.
      number_of_partitions: number of partitions; stored unchanged.
      l0_histogram: histogram that is stored and also passed to
        histograms.histogram_precomputed_tails.
    """
    super().__init__()
    self._params = params
    self._number_of_partitions = number_of_partitions
    self._l0_histogram = l0_histogram
    # One (bin lower, tail totals) pair per histogram bin.
    tails = histograms.histogram_precomputed_tails(l0_histogram)
    # The last pair belongs to the last bin; 0 is used when there are no bins.
    # NOTE(review): treating it as the maximum assumes the bins are ordered by
    # ascending lower bound - confirm against Histogram.
    self._max_bin_lower = tails[-1][0] if tails else 0
    # Maps a bin lower bound to its tail totals for direct lookup.
    self._precomputed_tails = dict(tails)

def score(self, k: int) -> float:
"""Computes score of a given parameter k.
Expand Down Expand Up @@ -162,18 +165,10 @@ def _l0_impact_noise(self, k: int):
dp_computations.compute_dp_count_noise_std(noise_params))

def _l0_impact_dropped(self, k: int):
# TODO: precalculate it and make it work in O(1) time.
capped_contributions = map(
lambda bin: max(
min(
bin.lower,
self._max_partitions_contributed_best_upper_bound(),
) - k,
0,
) * bin.count,
self._l0_histogram.bins,
)
return sum(capped_contributions)
if k > self._max_bin_lower:
return 0
tail = self._precomputed_tails[k]
return tail[0] - k * tail[1]


def _generate_possible_contribution_bounds(upper_bound: int) -> List[int]:
Expand Down