Skip to content

Commit 86bbd5c

Browse files
committed
[NeoML] Remove excess CUDA syncs in layers (neoml-lib#1070)
Signed-off-by: Kirill Golikov <[email protected]>
1 parent 6910163 commit 86bbd5c

File tree

81 files changed

+2080
-2600
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

81 files changed

+2080
-2600
lines changed

NeoML/include/NeoML/Dnn/DnnSolver.h

Lines changed: 31 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,13 @@ class CDnn;
2828
class NEOML_API CDnnSolver : virtual public IObject {
2929
public:
3030
// Stores the calculated values of layer parameters gradients for further use in Train method
31-
// forSharedWeightsLayer=true should only be used within layers that share weights with other layers.
31+
// sharedWeights=true should only be used within layers that share weights with other layers
3232
void AddDiff( CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramDiffBlobs,
3333
bool sharedWeights = false );
3434

3535
// Modifies the trainable parameters of the network layers,
3636
// using the accumulated gradients and previous steps' history (moment, etc.)
3737
void Train( float distributedCoeff = 1.f );
38-
3938
// Resets to the initial state
4039
void Reset();
4140

@@ -62,11 +61,17 @@ class NEOML_API CDnnSolver : virtual public IObject {
6261

6362
// Gets the reference to the math engine
6463
IMathEngine& MathEngine() const { return mathEngine; }
64+
// Get the intermediate result storing blob
65+
const CDnnBlob& TempBlob() const { return *temporaryBlob; }
66+
// Intermediate result storing blob
67+
// hide it to private, its allocated size may > actual
68+
CFloatHandle TempData();
69+
// Reinitialize the intermediate result storing blob
70+
bool ReInitTempBlob( int dataSize );
6571

6672
// Called once on Reset method call
6773
// Resets the stats in the inheriting instances to the initial state
6874
virtual void OnReset() {}
69-
7075
// On each training step the method is called once, before the call to TrainLayer for all layers
7176
virtual void OnTrain() {}
7277

@@ -78,13 +83,20 @@ class NEOML_API CDnnSolver : virtual public IObject {
7883

7984
private:
8085
IMathEngine& mathEngine;
86+
CPtr<CDnnBlob> gradParams;
87+
88+
// MathEngine memory stored variables for calculations
8189
float learningRate;
8290
float regularizationL2;
8391
float regularizationL1;
8492
float maxGradientNorm;
8593
float clipGradientMin;
8694
float clipGradientMax;
8795

96+
// Intermediate result storing
97+
// hide it to private, its allocated size may > actual
98+
CPtr<CDnnBlob> temporaryBlob;
99+
88100
// The blobs sum
89101
struct CDiffBlobSum final {
90102
const CBaseLayer* LayerOwner{}; // for the given layer
@@ -141,7 +153,7 @@ void NEOML_API SerializeSolver( CArchive& archive, CDnn& dnn, CPtr<CDnnSolver>&
141153
//---------------------------------------------------------------------------------------------------------------------
142154

143155
template<class T>
144-
class CSolverClassRegistrar {
156+
class CSolverClassRegistrar final {
145157
public:
146158
explicit CSolverClassRegistrar( const char* solverName );
147159
~CSolverClassRegistrar();
@@ -168,40 +180,27 @@ inline CSolverClassRegistrar<T>::~CSolverClassRegistrar()
168180
class NEOML_API CDnnSimpleGradientSolver : public CDnnSolver {
169181
NEOML_DNN_SOLVER( CDnnSimpleGradientSolver )
170182
public:
171-
CDnnSimpleGradientSolver( IMathEngine& mathEngine );
183+
explicit CDnnSimpleGradientSolver( IMathEngine& mathEngine );
172184

173185
// Moment decay rate (moment is a weighted sum of previous gradients)
174186
float GetMomentDecayRate() const { return momentDecayRate; }
175187
void SetMomentDecayRate(float decayRate) { momentDecayRate = decayRate; }
176-
188+
// Backward compatibility mode
177189
bool IsInCompatibilityMode() const { return isInCompatibilityMode; }
178190
void SetCompatibilityMode( bool compatibilityMode ) { isInCompatibilityMode = compatibilityMode; }
179191

180192
void Serialize( CArchive& archive, const CDnn& dnn ) override;
181193

182194
protected:
195+
// Updates the trainable weights of the layer
183196
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
184197
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;
185198

186199
private:
187200
// Moment decay rate (moment is a weighted sum of previous gradients)
188201
float momentDecayRate;
189-
190202
// Backward compatibility mode
191203
bool isInCompatibilityMode;
192-
193-
// Temporary variables of Handle type, used for calculations
194-
enum TTempVariable {
195-
TV_MomentDecayRateVar = 0,
196-
TV_OpMomentDecayRateVar,
197-
TV_OpRegL2MomentDecayRateVar,
198-
TV_RateVar,
199-
TV_L1Threshold,
200-
TV_L1Mult,
201-
TV_Count
202-
};
203-
204-
CPtr<CDnnBlob> tempVariables;
205204
};
206205

207206
//---------------------------------------------------------------------------------------------------------------------
@@ -210,7 +209,7 @@ class NEOML_API CDnnSimpleGradientSolver : public CDnnSolver {
210209
class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
211210
NEOML_DNN_SOLVER( CDnnAdaptiveGradientSolver )
212211
public:
213-
CDnnAdaptiveGradientSolver( IMathEngine& mathEngine );
212+
explicit CDnnAdaptiveGradientSolver( IMathEngine& mathEngine );
214213

215214
// Retrieves and sets the moment decay rate (moment is a weighted sum of previous gradients)
216215
float GetMomentDecayRate() const { return momentDecayRate; }
@@ -222,7 +221,7 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
222221
// Retrieves and sets the espilon used to avoid division by zero when calculating second moment
223222
float GetEpsilon() const { return epsilon; }
224223
void SetEpsilon( float newEpsilon ) { epsilon = newEpsilon; }
225-
224+
// Backward compatibility mode
226225
bool IsInCompatibilityMode() const { return isInCompatibilityMode; }
227226
void SetCompatibilityMode( bool compatibilityMode ) { isInCompatibilityMode = compatibilityMode; }
228227

@@ -249,7 +248,7 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
249248
// Prepares for the next training step
250249
void OnTrain() override;
251250
// Updates the trainable weights of the layer
252-
virtual void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
251+
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
253252
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;
254253

255254
private:
@@ -284,27 +283,8 @@ class NEOML_API CDnnAdaptiveGradientSolver : public CDnnSolver {
284283
bool isAmsGradEnabled;
285284
// Perform weight decay after calculating the moving averages
286285
bool isDecoupledWeightDecay;
287-
288286
// Backward compatibility mode
289287
bool isInCompatibilityMode;
290-
291-
enum TTempVariable {
292-
TV_MomentDecayRateVar = 0,
293-
TV_SecondMomentDecayRateVar,
294-
TV_RegL2Var,
295-
TV_OpMomentDecayRateVar,
296-
TV_OpSecondMomentDecayRateVar,
297-
TV_RateVar,
298-
TV_L1Threshold,
299-
TV_L1Mult,
300-
TV_EpsilonVar,
301-
TV_Count
302-
};
303-
304-
// Temporary Handle variables for calculations
305-
CPtr<CDnnBlob> tempVariables;
306-
307-
CPtr<CDnnBlob> temporaryBlob;
308288
};
309289

310290
//---------------------------------------------------------------------------------------------------------------------
@@ -389,26 +369,6 @@ class NEOML_API CDnnNesterovGradientSolver : public CDnnSolver {
389369
float muTPlusOne; // the mu coefficient for the next step
390370
float productMuT; // the product of mu coefficient over all steps including the current one
391371

392-
enum TTempVariable {
393-
TV_MomentDecayRateVar = 0,
394-
TV_SecondMomentDecayRateVar,
395-
TV_RegL2Var,
396-
TV_OpMomentDecayRateVar,
397-
TV_OpSecondMomentDecayRateVar,
398-
TV_RateVar,
399-
TV_L1Threshold,
400-
TV_L1Mult,
401-
TV_EpsilonVar,
402-
TV_InvOpSecondMomentDecayRateNVar, // 1 / (1 - secondMomentDecay ^ N)
403-
TV_MBarGradMultVar, // the gradient coefficient in the total sum
404-
TV_MBarMomentMultVar, // the moment coefficient in the total sum
405-
TV_Count
406-
};
407-
408-
// Temporary blobs for calculations
409-
CPtr<CDnnBlob> tempVariables;
410-
411-
CPtr<CDnnBlob> temporaryBlob;
412372
// m with a stroke (from the paper referred to)
413373
// It is a weighted sum of the gradient and the first moment
414374
CPtr<CDnnBlob> mBarBlob;
@@ -492,11 +452,12 @@ class NEOML_API CDnnLambGradientSolver : public CDnnSolver {
492452
void Serialize( CArchive& archive, const CDnn& dnn ) override;
493453

494454
protected:
455+
// Prepares for the next training step
456+
void OnTrain() override;
457+
// Updates the trainable weights of the layer
495458
void TrainLayer( const CBaseLayer* layer, const CObjectArray<CDnnBlob>& paramBlobs,
496459
const CObjectArray<CDnnBlob>& paramDiffBlobs, CObjectArray<CDnnBlob>& gradientHistory ) override;
497460

498-
void OnTrain() override;
499-
500461
private:
501462
// The gradientHistory array stores the previous values of gradients of different types
502463
enum TGradientHistoryType {
@@ -519,48 +480,28 @@ class NEOML_API CDnnLambGradientSolver : public CDnnSolver {
519480
// Is NVLamb modification used
520481
bool useNvLamb;
521482

522-
enum TTempVariable {
523-
TV_MomentDecayRateVar,
524-
TV_SecondMomentDecayRateVar,
525-
TV_OpMomentDecayRateVar,
526-
TV_OpSecondMomentDecayRateVar,
527-
TV_RateVar,
528-
TV_EpsilonVar,
529-
TV_WeightDecayVar,
530-
TV_ClipMultiplierVar,
531-
TV_LayerNormVar,
532-
TV_TrustRatioVar,
533-
TV_L2NormVar,
534-
535-
TV_Count
536-
};
537-
538-
CPtr<CDnnBlob> tempVariables;
539-
540-
CPtr<CDnnBlob> tempBlob;
541-
483+
CPtr<CDnnBlob> normL2Var;
542484
CArray<float> layersGradientNormSquare;
543485
float totalGradientNorm;
544486

545487
// Layer excluded from optimization
546-
struct CExcludedLayer {
488+
struct CExcludedLayer final {
547489
// Layer name (or substring)
548490
CString LayerName;
549491
// Match type (exact or substring)
550-
TExcludeLayerNameMatchType MatchType;
492+
TExcludeLayerNameMatchType MatchType{ ELNMT_Exact };
551493
// Parameter number
552494
// -1 if all parameters
553-
int ParamIndex;
554-
555-
CExcludedLayer() : MatchType( ELNMT_Exact ), ParamIndex( NotFound ) {}
495+
int ParamIndex{ NotFound };
556496
};
557497
// Layers excluded from weight decay
558498
CArray<CExcludedLayer> excludedLayers;
499+
mutable CPtr<CDnnBlob> tempNormBlob;
559500

560501
float calcL2NormAverage( const CConstFloatHandle& data, int dataSize ) const;
561502
void getWeightDecayIndices( const CBaseLayer& layer, int paramsCount, CHashTable<int>& indexes ) const;
562503

563-
void calcNormalizeMultiplier( const CDnnBlob& weights, const CDnnBlob& update, const CFloatHandle& multiplier ) const;
504+
float calcNormalizeMultiplier( const CDnnBlob& weights, const CDnnBlob& update ) const;
564505
};
565506

566507
template<typename TLayer>

NeoML/include/NeoML/Dnn/Layers/ActivationLayers.h

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ class CActivationDesc;
2828
class NEOML_API CLinearLayer : public CBaseInPlaceLayer, public IActivationLayer {
2929
NEOML_DNN_LAYER( CLinearLayer )
3030
public:
31-
enum TParam { TP_Multiplier, TP_FreeTerm, /*...*/ TP_Count };
3231
using CParam = CLinearActivationParam;
3332
static constexpr float DefaultMultiplier = CParam::DefaultMultiplier;
3433
static constexpr float DefaultFreeTerm = CParam::DefaultFreeTerm;
@@ -38,23 +37,21 @@ class NEOML_API CLinearLayer : public CBaseInPlaceLayer, public IActivationLayer
3837
void Serialize( CArchive& archive ) override;
3938

4039
float GetMultiplier() const { return multiplier; }
41-
void SetMultiplier( float _multiplier ) { multiplier = _multiplier; ForceReshape(); }
40+
void SetMultiplier( float _multiplier ) { multiplier = _multiplier; }
4241
float GetFreeTerm() const { return freeTerm; }
43-
void SetFreeTerm( float _freeTerm ) { freeTerm = _freeTerm; ForceReshape(); }
42+
void SetFreeTerm( float _freeTerm ) { freeTerm = _freeTerm; }
4443

4544
void ApplyParam( CParam param ) { SetMultiplier( param.Multiplier ); SetFreeTerm( param.FreeTerm ); }
4645
CActivationDesc GetDesc() const override;
4746

4847
protected:
49-
void OnReshaped() override;
5048
void RunOnce() override;
5149
void BackwardOnce() override;
5250
int BlobsForBackward() const override { return 0; }
5351

5452
private:
5553
float multiplier = DefaultMultiplier;
5654
float freeTerm = DefaultFreeTerm;
57-
CPtr<CDnnBlob> vars;
5855
};
5956

6057
NEOML_API CLayerWrapper<CLinearLayer> Linear( float multiplier, float freeTerm );
@@ -84,6 +81,9 @@ class NEOML_API CELULayer : public CBaseInPlaceLayer, public IActivationLayer {
8481
void RunOnce() override;
8582
void BackwardOnce() override;
8683
int BlobsForBackward() const override { return TOutputBlobs; }
84+
85+
private:
86+
float alpha = DefaultAlpha;
8787
};
8888

8989
NEOML_API CLayerWrapper<CELULayer> Elu( float alpha = CELULayer::DefaultAlpha );
@@ -104,8 +104,8 @@ class NEOML_API CReLULayer : public CBaseInPlaceLayer, public IActivationLayer {
104104
// The upper cutoff for the function value. If you set it to a value > 0,
105105
// the function will be ReLU(x) = Upper_Threshold for x > Upper_Threshold
106106
// The default value is 0: no cutoff
107-
float GetUpperThreshold() const { return upperThreshold->GetData().GetValue(); }
108-
void SetUpperThreshold( float threshold ) { upperThreshold->GetData().SetValue( threshold ); }
107+
float GetUpperThreshold() const { return upperThreshold; }
108+
void SetUpperThreshold( float threshold ) { upperThreshold = threshold; }
109109

110110
void ApplyParam( CParam param ) { SetUpperThreshold( param.UpperThreshold ); }
111111
CActivationDesc GetDesc() const override;
@@ -116,7 +116,7 @@ class NEOML_API CReLULayer : public CBaseInPlaceLayer, public IActivationLayer {
116116
int BlobsForBackward() const override { return TOutputBlobs; }
117117

118118
private:
119-
CPtr<CDnnBlob> upperThreshold;
119+
float upperThreshold = DefaultUpperThreshold;
120120
};
121121

122122
NEOML_API CLayerWrapper<CReLULayer> Relu( float threshold = CReLULayer::DefaultUpperThreshold );
@@ -146,6 +146,9 @@ class NEOML_API CLeakyReLULayer : public CBaseInPlaceLayer, public IActivationLa
146146
void RunOnce() override;
147147
void BackwardOnce() override;
148148
int BlobsForBackward() const override { return TOutputBlobs; }
149+
150+
private:
151+
float alpha = DefaultAlpha;
149152
};
150153

151154
NEOML_API CLayerWrapper<CLeakyReLULayer> LeakyRelu( float alpha = CLeakyReLULayer::DefaultAlpha );
@@ -268,10 +271,10 @@ class NEOML_API CHardSigmoidLayer : public CBaseInPlaceLayer, public IActivation
268271

269272
void Serialize( CArchive& archive ) override;
270273

271-
float GetSlope() const { return paramBlobs[0]->GetData().GetValue(); }
272-
void SetSlope( float slope ) { paramBlobs[0]->GetData().SetValue( slope ); }
273-
float GetBias() const { return paramBlobs[1]->GetData().GetValue(); }
274-
void SetBias( float bias ) { paramBlobs[1]->GetData().SetValue( bias ); }
274+
float GetSlope() const { return slope; }
275+
void SetSlope( float _slope ) { slope = _slope; }
276+
float GetBias() const { return bias; }
277+
void SetBias( float _bias ) { bias = _bias; }
275278

276279
void ApplyParam( CParam param ) { SetSlope( param.Slope ); SetBias( param.Bias ); }
277280
CActivationDesc GetDesc() const override;
@@ -282,7 +285,8 @@ class NEOML_API CHardSigmoidLayer : public CBaseInPlaceLayer, public IActivation
282285
int BlobsForBackward() const override { return TOutputBlobs; }
283286

284287
private:
285-
void setDefaultParamBlobs( IMathEngine& mathEngine );
288+
float slope = DefaultSlope;
289+
float bias = DefaultBias;
286290
};
287291

288292
NEOML_API CLayerWrapper<CHardSigmoidLayer> HardSigmoid( float slope, float bias );
@@ -373,9 +377,6 @@ class NEOML_API CErfLayer : public CBaseLayer, public IActivationLayer {
373377
void RunOnce() override;
374378
void BackwardOnce() override;
375379
int BlobsForBackward() const override { return TInputBlobs; }
376-
377-
private:
378-
CPtr<CDnnBlob> mult;
379380
};
380381

381382
NEOML_API CLayerWrapper<CErfLayer> Erf();
@@ -395,7 +396,9 @@ class NEOML_API CGELULayer : public CBaseLayer, public IActivationLayer {
395396
static const TCalculationMode CM_Precise = CParam::TCalculationMode::CM_Precise;
396397
static const TCalculationMode CM_SigmoidApproximate = CParam::TCalculationMode::CM_SigmoidApproximate;
397398

398-
explicit CGELULayer( IMathEngine& mathEngine );
399+
explicit CGELULayer( IMathEngine& mathEngine ) :
400+
CBaseLayer( mathEngine, "CGELULayer", false )
401+
{}
399402

400403
void Serialize( CArchive& archive ) override;
401404

@@ -415,18 +418,6 @@ class NEOML_API CGELULayer : public CBaseLayer, public IActivationLayer {
415418

416419
private:
417420
TCalculationMode mode = DefaultCalculationMode;
418-
419-
// 1
420-
CFloatHandleVar oneVar;
421-
// 0.5
422-
CFloatHandleVar halfVar;
423-
// 1/sqrt(2)
424-
CFloatHandleVar sqrt2InvVar;
425-
// 1/sqrt(2pi)
426-
CFloatHandleVar sqrt2PiInvVar;
427-
// 1.702f
428-
CFloatHandleVar approxScaleVar;
429-
430421
CPtr<CDnnBlob> erfMemoization;
431422

432423
void runPrecise();

0 commit comments

Comments (0)