
Commit 7850375

Merge pull request #1 from OpenMined/main
Fetch upstream
2 parents: 41a7e7e + f905c0d

8 files changed: +420 -29 lines


.gitignore

Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
+
+# Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+# git rm -r .ipynb_checkpoints/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+pytestdebug.log
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+doc/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+
+# IPython
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+# .env
+.env/
+.venv/
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pythonenv*
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# operating system-related files
+# file properties cache/storage on macOS
+*.DS_Store
+# thumbnail cache on Windows
+Thumbs.db
+
+# profiling data
+.prof
+
+
+# End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks
+
+poetry.lock

README.md

Lines changed: 2 additions & 2 deletions
@@ -12,11 +12,11 @@ Google Python Style Guide https://google.github.io/styleguide/pyguide.html

 ### Installing dependencies

-This project depends on numpy apache-beam absl-py dataclasses
+This project depends on numpy apache-beam pyspark absl-py dataclasses

 For installing with pip please run:

-1. `pip install numpy apache-beam absl-py`
+1. `pip install numpy apache-beam pyspark absl-py`

 2. (for Python 3.6) `pip install dataclasses`

pipeline_dp/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,4 +3,5 @@
 from pipeline_dp.dp_engine import DataExtractors
 from pipeline_dp.dp_engine import Metrics
 from pipeline_dp.dp_engine import DPEngine
-from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import SparkRDDOperations
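After this change both adapters are importable directly from the package. A minimal import sketch (assuming the apache-beam and pyspark dependencies from the README above are installed):

from pipeline_dp import BeamOperations, SparkRDDOperations

ops = SparkRDDOperations()  # or BeamOperations() for a Beam pipeline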

pipeline_dp/budget_accounting.py

Lines changed: 96 additions & 14 deletions
@@ -1,20 +1,102 @@
 """Privacy budget accounting for DP pipelines."""

+import logging
+from dataclasses import dataclass
+
+@dataclass
+class Budget:
+    """Manages the budget allocated for an operation.
+
+    The values for eps and delta are computed when the method compute_budgets
+    of the corresponding BudgetAccountant is called.
+    """
+    _eps: float = None
+    _delta: float = None
+
+    @property
+    def eps(self):
+        """Parameter of (eps, delta)-differential privacy.
+
+        Raises:
+            AssertionError: The privacy budget is not calculated yet.
+        """
+        if self._eps is None:
+            raise AssertionError("Privacy budget is not calculated yet.")
+        return self._eps
+
+    @property
+    def delta(self):
+        """Parameter of (eps, delta)-differential privacy.
+
+        Raises:
+            AssertionError: The privacy budget is not calculated yet.
+        """
+        if self._delta is None:
+            raise AssertionError("Privacy budget is not calculated yet.")
+        return self._delta
+
+    def set_eps_delta(self, eps, delta):
+        self._eps = eps
+        self._delta = delta
+
+@dataclass
+class RequestedBudget:
+    """Manages the budget requested for an operation."""
+    budget: Budget
+    weight: float
+    use_eps: bool
+    use_delta: bool

 class BudgetAccountant:
-    """Manages privacy budget."""
+    """Manages the privacy budget."""

-    def __init__(self, epsilon, delta):
-        """Construct a BudgetAccountant
+    def __init__(self, epsilon: float, delta: float):
+        """Constructs a BudgetAccountant.

-        Args:
-            epsilon, delta: parameters of (epsilon, delta)-differential privacy.
-        """
-        if epsilon <= 0:
-            raise ValueError(f"Epsilon must be positive, not {epsilon}")
-        if delta < 0:
-            raise ValueError(f"Delta must non-negative, not {delta}")
-        self._eps = epsilon
-        self._delta = delta
-
-    # TODO: implement BudgetAccountant functionality.
+        Args:
+            epsilon, delta: Parameters of (epsilon, delta)-differential privacy.
+        """
+        if epsilon <= 0:
+            raise ValueError(f"Epsilon must be positive, not {epsilon}.")
+        if delta < 0:
+            raise ValueError(f"Delta must be non-negative, not {delta}.")
+
+        self._eps = epsilon
+        self._delta = delta
+        self._requested_budgets = []
+
+    def request_budget(self, weight: float, *, use_eps: bool, use_delta: bool) -> Budget:
+        """Requests a budget.
+
+        Args:
+            weight: The weight used to compute epsilon and delta for the budget.
+            use_eps: A boolean that is False when the operation doesn't need epsilon.
+            use_delta: A boolean that is False when the operation doesn't need delta.
+
+        Returns:
+            A "lazy" budget object that doesn't contain epsilon/delta until the
+            method compute_budgets is called.
+        """
+        budget = Budget()
+        requested_budget = RequestedBudget(budget, weight, use_eps, use_delta)
+        self._requested_budgets.append(requested_budget)
+        return budget
+
+    def compute_budgets(self):
+        """All previously requested Budget objects are updated with corresponding budget values."""
+        if not self._requested_budgets:
+            logging.warning("No budgets were requested.")
+            return
+
+        total_weight_eps = total_weight_delta = 0
+        for requested_budget in self._requested_budgets:
+            total_weight_eps += requested_budget.use_eps * requested_budget.weight
+            total_weight_delta += requested_budget.use_delta * requested_budget.weight
+
+        for requested_budget in self._requested_budgets:
+            eps = delta = 0
+            if total_weight_eps:
+                eps = requested_budget.use_eps * self._eps * requested_budget.weight / total_weight_eps
+            if total_weight_delta:
+                delta = requested_budget.use_delta * self._delta * requested_budget.weight / total_weight_delta
+            requested_budget.budget.set_eps_delta(eps, delta)
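A minimal usage sketch of the BudgetAccountant API added above; the total budget and the request weights here are illustrative, not part of the commit:

from pipeline_dp.budget_accounting import BudgetAccountant

# Total (epsilon, delta) to be split across all requested operations.
accountant = BudgetAccountant(epsilon=1.0, delta=1e-6)

# Each operation requests a "lazy" Budget with a weight; eps/delta are not
# available until compute_budgets() is called.
count_budget = accountant.request_budget(weight=1, use_eps=True, use_delta=False)
sum_budget = accountant.request_budget(weight=3, use_eps=True, use_delta=True)

accountant.compute_budgets()

# Epsilon is split 1:3 (0.25 and 0.75); only the second request consumes delta.
print(count_budget.eps, sum_budget.eps, sum_budget.delta)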

pipeline_dp/pipeline_operations.py

Lines changed: 68 additions & 0 deletions
@@ -1,5 +1,7 @@
 """Adapters for working with pipeline frameworks."""

+import random
+
 import abc
 import apache_beam as beam
 import apache_beam.transforms.combiners as combiners
@@ -58,6 +60,16 @@ def map_values(self, col, fn, stage_name: str):
         return col | stage_name >> beam.MapTuple(lambda k, v: (k, fn(v)))

     def group_by_key(self, col, stage_name: str):
+        """Group the values for each key in the PCollection into a single sequence.
+
+        Args:
+            col: input collection
+            stage_name: name of the stage
+
+        Returns:
+            A PCollection of tuples in which the type of the second item is list.
+
+        """
         return col | stage_name >> beam.GroupByKey()

     def filter(self, col, fn, stage_name: str):
@@ -76,6 +88,62 @@ def count_per_element(self, col, stage_name: str):
         return col | stage_name >> combiners.Count.PerElement()


+class SparkRDDOperations(PipelineOperations):
+    """Apache Spark RDD adapter."""
+
+    def map(self, rdd, fn, stage_name: str = None):
+        return rdd.map(fn)
+
+    def map_tuple(self, rdd, fn, stage_name: str = None):
+        return rdd.map(fn)
+
+    def map_values(self, rdd, fn, stage_name: str = None):
+        return rdd.mapValues(fn)
+
+    def group_by_key(self, rdd, stage_name: str = None):
+        """Group the values for each key in the RDD into a single sequence.
+
+        Args:
+            rdd: input RDD
+            stage_name: not used
+
+        Returns:
+            An RDD of tuples in which the type of the second item
+            is the pyspark.resultiterable.ResultIterable.
+
+        """
+        return rdd.groupByKey()
+
+    def filter(self, rdd, fn, stage_name: str = None):
+        return rdd.filter(fn)
+
+    def keys(self, rdd, stage_name: str = None):
+        return rdd.keys()
+
+    def values(self, rdd, stage_name: str = None):
+        return rdd.values()
+
+    def sample_fixed_per_key(self, rdd, n: int, stage_name: str = None):
+        """Get fixed-size random samples for each unique key in an RDD of key-values.
+        Sampling is not guaranteed to be uniform across partitions.
+
+        Args:
+            rdd: input RDD
+            n: number of values to sample for each key
+            stage_name: not used
+
+        Returns:
+            An RDD of tuples.
+
+        """
+        return rdd.mapValues(lambda x: [x])\
+            .reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), n)))
+
+    def count_per_element(self, rdd, stage_name: str = None):
+        return rdd.map(lambda x: (x, 1))\
+            .reduceByKey(lambda x, y: (x + y))
+
+
 class LocalPipelineOperations(PipelineOperations):
     """Local Pipeline adapter."""

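A sketch of how the new SparkRDDOperations adapter might be driven with a local SparkContext (assuming the pyspark and apache-beam dependencies from the README are installed); the sample data and the local master URL are illustrative, not part of the commit:

import pyspark
from pipeline_dp import SparkRDDOperations

sc = pyspark.SparkContext(master="local[1]")
ops = SparkRDDOperations()

rdd = sc.parallelize([("a", 1), ("a", 2), ("a", 3), ("b", 4)])

# Keep at most 2 values per key; sampling happens inside reduceByKey, so it is
# not guaranteed to be uniform across partitions (as the docstring notes).
sampled = ops.sample_fixed_per_key(rdd, n=2)
print(sampled.collect())   # e.g. [('a', [1, 3]), ('b', [4])]

# count_per_element counts occurrences of each element.
counts = ops.count_per_element(sc.parallelize(["x", "y", "x"]))
print(counts.collect())    # [('x', 2), ('y', 1)] in some order

Note that the Spark methods accept stage_name only for interface compatibility and ignore it, whereas the Beam adapter uses it to label each transform.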