
Commit ba2b12f

SparkOperations implementation (#14)

1 parent ee82c02 commit ba2b12f

File tree: .gitignore, README.md, pipeline_dp/__init__.py, pipeline_dp/pipeline_operations.py, tests/pipeline_operations_test.py

5 files changed: +119 -5 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+*.pyc
+/.idea

README.md

Lines changed: 2 additions & 2 deletions

@@ -12,11 +12,11 @@ Google Python Style Guide https://google.github.io/styleguide/pyguide.html
 
 ### Installing dependencies
 
-This project depends on numpy apache-beam absl-py dataclasses
+This project depends on numpy apache-beam pyspark absl-py dataclasses
 
 For installing with pip please run:
 
-1. `pip install numpy apache-beam absl-py`
+1. `pip install numpy apache-beam pyspark absl-py`
 
 2. (for Python 3.6) `pip install dataclasses`

pipeline_dp/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -3,4 +3,5 @@
 from pipeline_dp.dp_engine import DataExtractors
 from pipeline_dp.dp_engine import Metrics
 from pipeline_dp.dp_engine import DPEngine
-from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import BeamOperations
+from pipeline_dp.pipeline_operations import SparkRDDOperations
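The new export makes the Spark adapter importable from the package root alongside the Beam one. A minimal sketch of what that enables (the variable name `ops` is illustrative):

```python
# Both adapters are now exposed at the package root.
from pipeline_dp import BeamOperations, SparkRDDOperations

ops = SparkRDDOperations()  # or BeamOperations(), depending on the runner
```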

pipeline_dp/pipeline_operations.py

Lines changed: 69 additions & 1 deletion

@@ -1,5 +1,7 @@
 """Adapters for working with pipeline frameworks."""
 
+import random
+
 import abc
 import apache_beam as beam
 import apache_beam.transforms.combiners as combiners

@@ -58,6 +60,16 @@ def map_values(self, col, fn, stage_name: str):
         return col | stage_name >> beam.MapTuple(lambda k, v: (k, fn(v)))
 
     def group_by_key(self, col, stage_name: str):
+        """Group the values for each key in the PCollection into a single sequence.
+
+        Args:
+          col: input collection
+          stage_name: name of the stage
+
+        Returns:
+          A PCollection of tuples in which the type of the second item is list.
+
+        """
         return col | stage_name >> beam.GroupByKey()
 
     def filter(self, col, fn, stage_name: str):

@@ -76,13 +88,69 @@ def count_per_element(self, col, stage_name: str):
         return col | stage_name >> combiners.Count.PerElement()
 
 
+class SparkRDDOperations(PipelineOperations):
+    """Apache Spark RDD adapter."""
+
+    def map(self, rdd, fn, stage_name: str = None):
+        return rdd.map(fn)
+
+    def map_tuple(self, rdd, fn, stage_name: str = None):
+        # Unpack each (k, v) pair so fn receives two arguments,
+        # matching the Beam and local adapters.
+        return rdd.map(lambda x: fn(*x))
+
+    def map_values(self, rdd, fn, stage_name: str = None):
+        return rdd.mapValues(fn)
+
+    def group_by_key(self, rdd, stage_name: str = None):
+        """Group the values for each key in the RDD into a single sequence.
+
+        Args:
+          rdd: input RDD
+          stage_name: not used
+
+        Returns:
+          An RDD of tuples in which the type of the second item
+          is pyspark.resultiterable.ResultIterable.
+
+        """
+        return rdd.groupByKey()
+
+    def filter(self, rdd, fn, stage_name: str = None):
+        return rdd.filter(fn)
+
+    def keys(self, rdd, stage_name: str = None):
+        return rdd.keys()
+
+    def values(self, rdd, stage_name: str = None):
+        return rdd.values()
+
+    def sample_fixed_per_key(self, rdd, n: int, stage_name: str = None):
+        """Get fixed-size random samples for each unique key in an RDD of key-value pairs.
+
+        Sampling is not guaranteed to be uniform across partitions.
+
+        Args:
+          rdd: input RDD
+          n: number of values to sample for each key
+          stage_name: not used
+
+        Returns:
+          An RDD of tuples.
+
+        """
+        return rdd.mapValues(lambda x: [x])\
+            .reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), n)))
+
+    def count_per_element(self, rdd, stage_name: str = None):
+        return rdd.map(lambda x: (x, 1))\
+            .reduceByKey(lambda x, y: x + y)
+
+
 class LocalPipelineOperations(PipelineOperations):
     """Local Pipeline adapter."""
 
     def map(self, col, fn, stage_name: str = None):
         return map(fn, col)
 
-    def map_tuple(self, col, fn, stage_name: str = None):
+    def map_tuple(self, col, fn, stage_name: str):
         return (fn(k, v) for k, v in col)
 
     def map_values(self, col, fn, stage_name: str):
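To make the new adapter concrete, here is a minimal usage sketch of its two composite methods against a local SparkContext. The `local[1]` master string and the sample data are assumptions for illustration; which values survive `sample_fixed_per_key` is random by design and, per the docstring, not guaranteed to be uniform, since `reduceByKey` re-samples every time two partial lists merge.

```python
import pyspark

from pipeline_dp.pipeline_operations import SparkRDDOperations

# Assumption for illustration: a single-threaded local Spark master.
sc = pyspark.SparkContext(conf=pyspark.SparkConf().setMaster("local[1]"))
ops = SparkRDDOperations()

pairs = sc.parallelize([(1, 11), (2, 22), (3, 33), (1, 14), (2, 25), (1, 16)])

# Keep at most n=2 values per key: each value starts as a singleton list,
# and reduceByKey re-samples down to n as the lists are merged.
print(ops.sample_fixed_per_key(pairs, 2).collect())
# e.g. [(1, [11, 16]), (2, [22, 25]), (3, [33])]

# Classic word-count shape: pair each element with 1, then sum per key.
print(ops.count_per_element(sc.parallelize(["a", "b", "a"])).collect())
# [('a', 2), ('b', 1)]  (order may vary)

sc.stop()
```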

tests/pipeline_operations_test.py

Lines changed: 44 additions & 1 deletion

@@ -1,12 +1,45 @@
 import unittest
+import pyspark
 
+from pipeline_dp.pipeline_operations import SparkRDDOperations
 from pipeline_dp.pipeline_operations import LocalPipelineOperations
 
 
 class PipelineOperationsTest(unittest.TestCase):
     pass
 
 
+class SparkRDDOperationsTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        conf = pyspark.SparkConf()
+        cls.sc = pyspark.SparkContext(conf=conf)
+
+    def test_sample_fixed_per_key(self):
+        spark_operations = SparkRDDOperations()
+        data = [(1, 11), (2, 22), (3, 33), (1, 14), (2, 25), (1, 16)]
+        dist_data = SparkRDDOperationsTest.sc.parallelize(data)
+        rdd = spark_operations.sample_fixed_per_key(dist_data, 2)
+        result = dict(rdd.collect())
+        self.assertEqual(len(result[1]), 2)
+        self.assertTrue(set(result[1]).issubset({11, 14, 16}))
+        self.assertSetEqual(set(result[2]), {22, 25})
+        self.assertSetEqual(set(result[3]), {33})
+
+    def test_count_per_element(self):
+        spark_operations = SparkRDDOperations()
+        data = ['a', 'b', 'a']
+        dist_data = SparkRDDOperationsTest.sc.parallelize(data)
+        rdd = spark_operations.count_per_element(dist_data)
+        result = dict(rdd.collect())
+        self.assertDictEqual(result, {'a': 2, 'b': 1})
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.sc.stop()
+
+
 class LocalPipelineOperationsTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):

@@ -22,10 +55,20 @@ def test_local_map(self):
         self.assertEqual(list(self.ops.map(range(5), lambda x: x ** 2)),
                          [0, 1, 4, 9, 16])
 
+    def test_local_map_returns_iterator(self):
+        some_map = self.ops.map([1, 2, 3], lambda x: x)
+        # some_map is its own consumable iterator
+        self.assertIs(some_map, iter(some_map))
+
+        self.assertEqual(list(self.ops.map([1, 2, 3], str)),
+                         ["1", "2", "3"])
+        self.assertEqual(list(self.ops.map(range(5), lambda x: x ** 2)),
+                         [0, 1, 4, 9, 16])
+
     def test_local_map_tuple(self):
         tuple_list = [(1, 2), (2, 3), (3, 4)]
 
-        self.assertEqual(list(self.ops.map_tuple(tuple_list, lambda k, v: k+v)),
+        self.assertEqual(list(self.ops.map_tuple(tuple_list, lambda k, v: k + v)),
                          [3, 5, 7])
 
         self.assertEqual(list(self.ops.map_tuple(tuple_list, lambda k, v: (
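The `assertIs(some_map, iter(some_map))` check above leans on a general Python fact: an iterator's `__iter__` returns the iterator itself, so the lazy result of `LocalPipelineOperations.map` can be consumed exactly once. A standalone sketch:

```python
# map() returns a lazy iterator; iter() on an iterator returns the same
# object, and a second pass over it yields nothing.
m = map(str, [1, 2, 3])
assert m is iter(m)
assert list(m) == ["1", "2", "3"]
assert list(m) == []  # already exhausted
```

The generator returned by `map_tuple` behaves the same way: it supports a single pass.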
