
Commit 7850375

Merge pull request #1 from OpenMined/main
Fetch upstream
2 parents: 41a7e7e + f905c0d

8 files changed: +420 -29 lines


.gitignore

Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
+
+# Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+# git rm -r .ipynb_checkpoints/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+pytestdebug.log
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+doc/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+
+# IPython
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#poetry.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+# .env
+.env/
+.venv/
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pythonenv*
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# operating system-related files
+# file properties cache/storage on macOS
+*.DS_Store
+# thumbnail cache on Windows
+Thumbs.db
+
+# profiling data
+.prof
+
+
+# End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks
+
+poetry.lock

README.md

Lines changed: 2 additions & 2 deletions
@@ -12,11 +12,11 @@ Google Python Style Guide https://google.github.io/styleguide/pyguide.html

 ### Installing dependencies

-This project depends on numpy apache-beam absl-py dataclasses
+This project depends on numpy apache-beam pyspark absl-py dataclasses

 For installing with pip please run:

-1. `pip install numpy apache-beam absl-py`
+1. `pip install numpy apache-beam pyspark absl-py`

 2. (for Python 3.6) `pip install dataclasses`

pipeline_dp/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,4 +3,5 @@
 from pipeline_dp.dp_engine import DataExtractors
 from pipeline_dp.dp_engine import Metrics
 from pipeline_dp.dp_engine import DPEngine
-from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import SparkRDDOperations
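After this change both adapters are importable directly from the package. A minimal import sketch (assuming the apache-beam and pyspark dependencies from the README above are installed):

from pipeline_dp import BeamOperations, SparkRDDOperations

ops = SparkRDDOperations()  # or BeamOperations() for a Beam pipeline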

pipeline_dp/budget_accounting.py

Lines changed: 96 additions & 14 deletions
@@ -1,20 +1,102 @@
 """Privacy budget accounting for DP pipelines."""

+import logging
+from dataclasses import dataclass
+
+@dataclass
+class Budget:
+    """Manages the budget allocated for an operation.
+
+    The values for eps and delta are computed when the method compute_budgets
+    of the corresponding BudgetAccountant is called.
+    """
+    _eps: float = None
+    _delta: float = None
+
+    @property
+    def eps(self):
+        """Parameter of (eps, delta)-differential privacy.
+
+        Raises:
+            AssertionError: The privacy budget is not calculated yet.
+        """
+        if self._eps is None:
+            raise AssertionError("Privacy budget is not calculated yet.")
+        return self._eps
+
+    @property
+    def delta(self):
+        """Parameter of (eps, delta)-differential privacy.
+
+        Raises:
+            AssertionError: The privacy budget is not calculated yet.
+        """
+        if self._delta is None:
+            raise AssertionError("Privacy budget is not calculated yet.")
+        return self._delta
+
+    def set_eps_delta(self, eps, delta):
+        self._eps = eps
+        self._delta = delta
+
+@dataclass
+class RequestedBudget:
+    """Manages the budget requested for an operation."""
+    budget: Budget
+    weight: float
+    use_eps: bool
+    use_delta: bool

 class BudgetAccountant:
-    """Manages privacy budget."""
+    """Manages the privacy budget."""

-    def __init__(self, epsilon, delta):
-        """Construct a BudgetAccountant
+    def __init__(self, epsilon: float, delta: float):
+        """Constructs a BudgetAccountant.

-        Args:
-            epsilon, delta: parameters of (epsilon, delta)-differential privacy.
-        """
-        if epsilon <= 0:
-            raise ValueError(f"Epsilon must be positive, not {epsilon}")
-        if delta < 0:
-            raise ValueError(f"Delta must non-negative, not {delta}")
-        self._eps = epsilon
-        self._delta = delta
-
-    # TODO: implement BudgetAccountant functionality.
+        Args:
+            epsilon, delta: Parameters of (epsilon, delta)-differential privacy.
+        """
+        if epsilon <= 0:
+            raise ValueError(f"Epsilon must be positive, not {epsilon}.")
+        if delta < 0:
+            raise ValueError(f"Delta must be non-negative, not {delta}.")
+
+        self._eps = epsilon
+        self._delta = delta
+        self._requested_budgets = []
+
+    def request_budget(self, weight: float, *, use_eps: bool, use_delta: bool) -> Budget:
+        """Requests a budget.
+
+        Args:
+            weight: The weight used to compute epsilon and delta for the budget.
+            use_eps: A boolean that is False when the operation doesn't need epsilon.
+            use_delta: A boolean that is False when the operation doesn't need delta.
+
+        Returns:
+            A "lazy" budget object that doesn't contain epsilon/delta until the
+            method compute_budgets is called.
+        """
+        budget = Budget()
+        requested_budget = RequestedBudget(budget, weight, use_eps, use_delta)
+        self._requested_budgets.append(requested_budget)
+        return budget
+
+    def compute_budgets(self):
+        """All previously requested Budget objects are updated with corresponding budget values."""
+        if not self._requested_budgets:
+            logging.warning("No budgets were requested.")
+            return
+
+        total_weight_eps = total_weight_delta = 0
+        for requested_budget in self._requested_budgets:
+            total_weight_eps += requested_budget.use_eps * requested_budget.weight
+            total_weight_delta += requested_budget.use_delta * requested_budget.weight
+
+        for requested_budget in self._requested_budgets:
+            eps = delta = 0
+            if total_weight_eps:
+                eps = requested_budget.use_eps * self._eps * requested_budget.weight / total_weight_eps
+            if total_weight_delta:
+                delta = requested_budget.use_delta * self._delta * requested_budget.weight / total_weight_delta
+            requested_budget.budget.set_eps_delta(eps, delta)
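A minimal usage sketch of the BudgetAccountant API added above; the total budget and the request weights here are illustrative, not part of the commit:

from pipeline_dp.budget_accounting import BudgetAccountant

# Total (epsilon, delta) to be split across all requested operations.
accountant = BudgetAccountant(epsilon=1.0, delta=1e-6)

# Each operation requests a "lazy" Budget with a weight; eps/delta are not
# available until compute_budgets() is called.
count_budget = accountant.request_budget(weight=1, use_eps=True, use_delta=False)
sum_budget = accountant.request_budget(weight=3, use_eps=True, use_delta=True)

accountant.compute_budgets()

# Epsilon is split 1:3 (0.25 and 0.75); only the second request consumes delta.
print(count_budget.eps, sum_budget.eps, sum_budget.delta)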

pipeline_dp/pipeline_operations.py

Lines changed: 68 additions & 0 deletions
@@ -1,5 +1,7 @@
 """Adapters for working with pipeline frameworks."""

+import random
+
 import abc
 import apache_beam as beam
 import apache_beam.transforms.combiners as combiners
@@ -58,6 +60,16 @@ def map_values(self, col, fn, stage_name: str):
         return col | stage_name >> beam.MapTuple(lambda k, v: (k, fn(v)))

     def group_by_key(self, col, stage_name: str):
+        """Group the values for each key in the PCollection into a single sequence.
+
+        Args:
+            col: input collection
+            stage_name: name of the stage
+
+        Returns:
+            A PCollection of tuples in which the type of the second item is list.
+
+        """
         return col | stage_name >> beam.GroupByKey()

     def filter(self, col, fn, stage_name: str):
@@ -76,6 +88,62 @@ def count_per_element(self, col, stage_name: str):
         return col | stage_name >> combiners.Count.PerElement()


+class SparkRDDOperations(PipelineOperations):
+    """Apache Spark RDD adapter."""
+
+    def map(self, rdd, fn, stage_name: str = None):
+        return rdd.map(fn)
+
+    def map_tuple(self, rdd, fn, stage_name: str = None):
+        return rdd.map(fn)
+
+    def map_values(self, rdd, fn, stage_name: str = None):
+        return rdd.mapValues(fn)
+
+    def group_by_key(self, rdd, stage_name: str = None):
+        """Group the values for each key in the RDD into a single sequence.
+
+        Args:
+            rdd: input RDD
+            stage_name: not used
+
+        Returns:
+            An RDD of tuples in which the type of the second item
+            is the pyspark.resultiterable.ResultIterable.
+
+        """
+        return rdd.groupByKey()
+
+    def filter(self, rdd, fn, stage_name: str = None):
+        return rdd.filter(fn)
+
+    def keys(self, rdd, stage_name: str = None):
+        return rdd.keys()
+
+    def values(self, rdd, stage_name: str = None):
+        return rdd.values()
+
+    def sample_fixed_per_key(self, rdd, n: int, stage_name: str = None):
+        """Get fixed-size random samples for each unique key in an RDD of key-values.
+        Sampling is not guaranteed to be uniform across partitions.
+
+        Args:
+            rdd: input RDD
+            n: number of values to sample for each key
+            stage_name: not used
+
+        Returns:
+            An RDD of tuples.
+
+        """
+        return rdd.mapValues(lambda x: [x])\
+            .reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), n)))
+
+    def count_per_element(self, rdd, stage_name: str = None):
+        return rdd.map(lambda x: (x, 1))\
+            .reduceByKey(lambda x, y: (x + y))
+
+
 class LocalPipelineOperations(PipelineOperations):
     """Local Pipeline adapter."""

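A sketch of how the new SparkRDDOperations adapter might be driven with a local SparkContext (assuming the pyspark and apache-beam dependencies from the README are installed); the sample data and the local master URL are illustrative, not part of the commit:

import pyspark
from pipeline_dp import SparkRDDOperations

sc = pyspark.SparkContext(master="local[1]")
ops = SparkRDDOperations()

rdd = sc.parallelize([("a", 1), ("a", 2), ("a", 3), ("b", 4)])

# Keep at most 2 values per key; sampling happens inside reduceByKey, so it is
# not guaranteed to be uniform across partitions (as the docstring notes).
sampled = ops.sample_fixed_per_key(rdd, n=2)
print(sampled.collect())   # e.g. [('a', [1, 3]), ('b', [4])]

# count_per_element counts occurrences of each element.
counts = ops.count_per_element(sc.parallelize(["x", "y", "x"]))
print(counts.collect())    # [('x', 2), ('y', 1)] in some order

Note that the Spark methods accept stage_name only for interface compatibility and ignore it, whereas the Beam adapter uses it to label each transform.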