From f0cf725a08e75c6bb0f1e89fb7dcc285b806f5d3 Mon Sep 17 00:00:00 2001 From: Kulankhina Date: Wed, 23 Feb 2022 17:18:47 +0100 Subject: [PATCH 1/2] Add conference colab --- examples/conference.ipynb | 295 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 examples/conference.ipynb diff --git a/examples/conference.ipynb b/examples/conference.ipynb new file mode 100644 index 00000000..407f1df7 --- /dev/null +++ b/examples/conference.ipynb @@ -0,0 +1,295 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Conference example", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "bW1gifIe0pUt" + }, + "source": [ + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "markdown", + "source": [ + "This is a simple example that shows how to calculate anonymized statistics using PipelineDP. The input data is a simulated dataset of participants in an imaginary conference, including their countries of origin. We use PipelineDP to calculate an anonymized count of participants aggregated by country." + ], + "metadata": { + "id": "ddrCVxp53UjV" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zxcPpZGuAPq8" + }, + "source": [ + "# Install dependencies and download data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "E8yzpKYNbHTF", + "cellView": "form" + }, + "outputs": [], + "source": [ + "#@markdown Install dependencies and download data\n", + "\n", + "import os\n", + "os.chdir('/content')\n", + "!pip install pipeline-dp apache_beam\n", + "\n", + "import sys\n", + "sys.path.insert(0,'/content/PipelineDP')\n", + "\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.runners.portability import fn_api_runner\n", + "from apache_beam.runners.interactive import interactive_runner\n", + "from apache_beam.runners.interactive.interactive_beam import *\n", + "from dataclasses import dataclass\n", + "import pipeline_dp\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oi-D38dUApM1" + }, + "source": [ + "# Construct and inspect the input data\n", + "\n", + "Below we construct the input dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Mimkjqt9h9gr", + "cellView": "form" + }, + "outputs": [], + "source": [ + "#@markdown Construct the input data\n", + "input = [(f\"{u}\", \"Germany\") for u in range(50)]\n", + "input += [(f\"{u + 50}\", \"Switzerland\") for u in range(75)]\n", + "input += [(f\"{u + 125}\", \"France\") for u in range(30)]\n", + "input += [(f\"{u + 155}\", \"Italy\") for u in range(40)]\n", + "input += [(f\"{u + 195}\", \"UK\") for u in range(100)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e2SOjo8qiNnw" + }, + "source": [ + "The goal of this Colab is to demonstrate how to compute the count of participants aggregated by country in a DP manner.\n", + "\n", + "The plot below demonstrates the non-private result." + ] + }, + { + "cell_type": "code", + "source": [ + "#@title Non-private statistics\n", + "countries = [\"Germany\", \"Switzerland\", \"France\", \"Italy\", \"UK\"]\n", + "non_dp_count = [0] * len(countries)\n", + "for participant_info in input:\n", + " country = participant_info[1]\n", + " index = countries.index(country)\n", + " non_dp_count[index] = non_dp_count[index] + 1\n", + "\n", + "plt.bar(countries, non_dp_count)\n", + "plt.suptitle('Count of participants')\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 294 + }, + "id": "qR1dBaCiqNAa", + "outputId": "fe6e7953-cd5c-49b3-cf4e-6fe577bcc040", + "cellView": "form" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Run DP pipeline\n", + "Below we compute the same statistics using differential privacy and PipelineDP." + ], + "metadata": { + "id": "IIjQrB3eFmvp" + } + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "eN9fu0NkSA6u", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "76a402a5-4ba0-459a-bc38-9b18ff97cca0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[('Germany', MetricsTuple(privacy_id_count=51.11806528184752)), ('Switzerland', MetricsTuple(privacy_id_count=75.05021202241187)), ('France', MetricsTuple(privacy_id_count=28.699075757142054)), ('Italy', MetricsTuple(privacy_id_count=39.81641489278627)), ('UK', MetricsTuple(privacy_id_count=100.014823352647))]\n" + ] + } + ], + "source": [ + "#@title DP statistics\n", + "\n", + "# Choose the backed: local, Beam or Spark\n", + "backend = pipeline_dp.LocalBackend()\n", + "\n", + "# Define the total privacy loss that can be introduced by this pipeline\n", + "budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)\n", + "\n", + "# Create DPEngine\n", + "dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)\n", + "\n", + "# Configure functions to extract partition key, privacy ID and aggregated value\n", + "# from the input data\n", + "data_extractors = pipeline_dp.DataExtractors(\n", + " partition_extractor=lambda row: row[1],\n", + " privacy_id_extractor=lambda row: row[0],\n", + " value_extractor=lambda row: 1)\n", + "\n", + "# Configure the aggregation parameters\n", + "params = pipeline_dp.AggregateParams(\n", + " noise_kind=pipeline_dp.NoiseKind.LAPLACE,\n", + " metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],\n", + " max_partitions_contributed=1,\n", + " max_contributions_per_partition=1,\n", + " min_value=0,\n", + " max_value=1)\n", + "\n", + "# Build computational 
graph for the aggregation\n", + "dp_result = dp_engine.aggregate(input, params, data_extractors)\n", + "\n", + "# Compute budget per each DP operation. \n", + "budget_accountant.compute_budgets()\n", + "\n", + "# Run computation.\n", + "dp_result = list(dp_result)\n", + "print(dp_result)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Inspect the result" + ], + "metadata": { + "id": "QsguG0DeF_8L" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown ##Inspect the result\n", + "#@markdown Below you can see the DP and non-DP results.\n", + "\n", + "dp_count = [0] * len(countries)\n", + "i = 0\n", + "for dp_count_per_country in dp_result:\n", + " dp_count[i] = dp_count_per_country[1][0]\n", + " i = i + 1\n", + "\n", + "x = np.arange(len(countries))\n", + "\n", + "width = 0.35\n", + "fig, ax = plt.subplots()\n", + "rects1 = ax.bar(x - width/2, non_dp_count, width, label='non-DP')\n", + "rects2 = ax.bar(x + width/2, dp_count, width, label='DP')\n", + "ax.set_title('Count participants per country')\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(countries)\n", + "ax.legend()\n", + "fig.tight_layout()\n", + "plt.savefig(\"chart.png\")\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 297 + }, + "id": "sTkYZ0wSbo3h", + "outputId": "7d2701b4-1c9f-4232-8cc2-2c067c4526d9", + "cellView": "form" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + } + ] +} From 66047353e782628cff4fa34519e68940cc9cee97 Mon Sep 17 00:00:00 2001 From: Kulankhina Date: Thu, 24 Feb 2022 12:06:05 +0100 Subject: [PATCH 2/2] Fix imports and comments --- examples/conference.ipynb | 52 +++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/examples/conference.ipynb b/examples/conference.ipynb index 407f1df7..213b80df 100644 --- a/examples/conference.ipynb +++ b/examples/conference.ipynb @@ -55,31 +55,17 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "id": "E8yzpKYNbHTF", - "cellView": "form" + "id": "E8yzpKYNbHTF" }, "outputs": [], "source": [ "#@markdown Install dependencies and download data\n", "\n", - "import os\n", - "os.chdir('/content')\n", "!pip install pipeline-dp apache_beam\n", "\n", - "import sys\n", - "sys.path.insert(0,'/content/PipelineDP')\n", - "\n", "from IPython.display import clear_output\n", "clear_output()\n", - "\n", - "import apache_beam as beam\n", - "from apache_beam.runners.portability import fn_api_runner\n", - "from apache_beam.runners.interactive import interactive_runner\n", - "from apache_beam.runners.interactive.interactive_beam import *\n", - "from dataclasses import dataclass\n", "import pipeline_dp\n", - "\n", - "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] @@ -105,6 +91,9 @@ "outputs": [], "source": [ "#@markdown Construct the input data\n", + "# The format of the input is: (participant_id, country).\n", + "# Participants u_0...u_49 come from Germany, participants u_50...u_124 come from\n", + "# Switzerland, etc.\n", "input = [(f\"{u}\", \"Germany\") for u in range(50)]\n", "input += [(f\"{u + 50}\", \"Switzerland\") for u in range(75)]\n", "input += [(f\"{u + 125}\", \"France\") for u in range(30)]\n", @@ -144,10 +133,10 @@ "height": 294 }, "id": "qR1dBaCiqNAa", - "outputId": "fe6e7953-cd5c-49b3-cf4e-6fe577bcc040", + 
"outputId": "9ccdf49d-a9f9-4fce-d7c7-6fc3d66f2fda", "cellView": "form" }, - "execution_count": 4, + "execution_count": 3, "outputs": [ { "output_type": "display_data", @@ -175,27 +164,27 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "id": "eN9fu0NkSA6u", "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "76a402a5-4ba0-459a-bc38-9b18ff97cca0" + "outputId": "15bf4d5c-6a2e-48a0-8830-81c1a5ecbab4" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "[('Germany', MetricsTuple(privacy_id_count=51.11806528184752)), ('Switzerland', MetricsTuple(privacy_id_count=75.05021202241187)), ('France', MetricsTuple(privacy_id_count=28.699075757142054)), ('Italy', MetricsTuple(privacy_id_count=39.81641489278627)), ('UK', MetricsTuple(privacy_id_count=100.014823352647))]\n" + "[('Germany', MetricsTuple(privacy_id_count=49.10272994384104)), ('Switzerland', MetricsTuple(privacy_id_count=75.26271629976691)), ('France', MetricsTuple(privacy_id_count=32.206397102141636)), ('Italy', MetricsTuple(privacy_id_count=37.134226348807715)), ('UK', MetricsTuple(privacy_id_count=97.95130274764233))]\n" ] } ], "source": [ "#@title DP statistics\n", "\n", - "# Choose the backed: local, Beam or Spark\n", + "# Choose the backend: local, Beam or Spark\n", "backend = pipeline_dp.LocalBackend()\n", "\n", "# Define the total privacy loss that can be introduced by this pipeline\n", @@ -220,13 +209,18 @@ " min_value=0,\n", " max_value=1)\n", "\n", - "# Build computational graph for the aggregation\n", + "# Create a computational graph for the aggregation.\n", + "# All computations are lazy. dp_result is iterable, but iterating it would\n", + "# fail until budget is computed (below).\n", + "# It’s possible to call DPEngine.aggregate multiple times with different\n", + "# metrics to compute.\n", "dp_result = dp_engine.aggregate(input, params, data_extractors)\n", "\n", "# Compute budget per each DP operation. 
\n", "budget_accountant.compute_budgets()\n", "\n", - "# Run computation.\n", + "# Here's where the lazy iterator initiates computations and gets transformed\n", + "# into actual results\n", "dp_result = list(dp_result)\n", "print(dp_result)" ] @@ -247,10 +241,9 @@ "#@markdown Below you can see the DP and non-DP results.\n", "\n", "dp_count = [0] * len(countries)\n", - "i = 0\n", - "for dp_count_per_country in dp_result:\n", + "for i, dp_count_per_country in enumerate(dp_result):\n", " dp_count[i] = dp_count_per_country[1][0]\n", - " i = i + 1\n", + "\n", "\n", "x = np.arange(len(countries))\n", "\n", @@ -272,15 +265,14 @@ "height": 297 }, "id": "sTkYZ0wSbo3h", - "outputId": "7d2701b4-1c9f-4232-8cc2-2c067c4526d9", - "cellView": "form" + "outputId": "82d59080-d00b-4c00-ff90-dde4838541d6" }, - "execution_count": 6, + "execution_count": 5, "outputs": [ { "output_type": "display_data", "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ]