"""Adapters for working with pipeline frameworks."""

+ import random
+
import abc
import apache_beam as beam
import apache_beam.transforms.combiners as combiners
@@ -58,6 +60,16 @@ def map_values(self, col, fn, stage_name: str):
        return col | stage_name >> beam.MapTuple(lambda k, v: (k, fn(v)))

    def group_by_key(self, col, stage_name: str):
+         """Group the values for each key in the PCollection into a single sequence.
+
+         Args:
+           col: input collection
+           stage_name: name of the stage
+
+         Returns:
+           A PCollection of tuples in which the type of the second item is list.
+
+         """
        return col | stage_name >> beam.GroupByKey()

    def filter(self, col, fn, stage_name: str):
@@ -76,13 +88,69 @@ def count_per_element(self, col, stage_name: str):
        return col | stage_name >> combiners.Count.PerElement()


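+ # Illustrative sketch (not part of this change): driving the Beam adapter
+ # above. `BeamOperations` is an assumed name for the enclosing class, whose
+ # definition falls outside this hunk.
+ #
+ #   with beam.Pipeline() as p:
+ #       ops = BeamOperations()
+ #       col = p | beam.Create([("a", 1), ("a", 2), ("b", 3)])
+ #       grouped = ops.group_by_key(col, "Group by key")
+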
+ class SparkRDDOperations(PipelineOperations):
+     """Apache Spark RDD adapter."""
+
+     def map(self, rdd, fn, stage_name: str = None):
+         return rdd.map(fn)
+
+     def map_tuple(self, rdd, fn, stage_name: str = None):
+         # Unpack each (k, v) pair so that fn receives two arguments,
+         # matching LocalPipelineOperations.map_tuple below.
+         return rdd.map(lambda x: fn(*x))
+
+     def map_values(self, rdd, fn, stage_name: str = None):
+         return rdd.mapValues(fn)
+
+     def group_by_key(self, rdd, stage_name: str = None):
+         """Group the values for each key in the RDD into a single sequence.
+
+         Args:
+           rdd: input RDD
+           stage_name: not used
+
+         Returns:
+           An RDD of tuples in which the type of the second item
+           is pyspark.resultiterable.ResultIterable.
+
+         """
+         return rdd.groupByKey()
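+
+     # Note: groupByKey returns each group as a lazy ResultIterable; apply
+     # .mapValues(list) to the result if a concrete list is required.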
+
+     def filter(self, rdd, fn, stage_name: str = None):
+         return rdd.filter(fn)
+
+     def keys(self, rdd, stage_name: str = None):
+         return rdd.keys()
+
+     def values(self, rdd, stage_name: str = None):
+         return rdd.values()
+
+     def sample_fixed_per_key(self, rdd, n: int, stage_name: str = None):
+         """Get fixed-size random samples for each unique key in an RDD of key-values.
+
+         Sampling is not guaranteed to be uniform across partitions.
+
+         Args:
+           rdd: input RDD
+           n: number of values to sample for each key
+           stage_name: not used
+
+         Returns:
+           An RDD of tuples.
+
+         """
+         # Wrap each value in a singleton list, then merge lists pairwise with
+         # reduceByKey, downsampling to at most n elements at each merge so
+         # that no more than 2n values per key are ever held at once.
+         return rdd.mapValues(lambda x: [x])\
+             .reduceByKey(lambda x, y: random.sample(x + y, min(len(x) + len(y), n)))
+
+     def count_per_element(self, rdd, stage_name: str = None):
+         return rdd.map(lambda x: (x, 1))\
+             .reduceByKey(lambda x, y: x + y)
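+
+     # Illustrative usage (a sketch, not part of this change; `sc` is an
+     # assumed SparkContext, e.g. from the pyspark shell):
+     #   ops = SparkRDDOperations()
+     #   rdd = sc.parallelize([("a", 1), ("a", 2), ("b", 3)])
+     #   ops.count_per_element(ops.values(rdd)).collect()
+     #   # e.g. [(1, 1), (2, 1), (3, 1)]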
+
+
class LocalPipelineOperations(PipelineOperations):
    """Local Pipeline adapter."""

    def map(self, col, fn, stage_name: str = None):
        return map(fn, col)

-     def map_tuple(self, col, fn, stage_name: str = None):
+     def map_tuple(self, col, fn, stage_name: str):
        return (fn(k, v) for k, v in col)

    def map_values(self, col, fn, stage_name: str):