@@ -51,31 +51,70 @@ void CpuDeviceInterface::initialize(
     const VideoStreamOptions& videoStreamOptions,
     const std::vector<std::unique_ptr<Transform>>& transforms,
     const AVRational& timeBase,
-    const FrameDims& outputDims) {
+    [[maybe_unused]] const FrameDims& metadataDims,
+    const std::optional<FrameDims>& resizedOutputDims) {
   videoStreamOptions_ = videoStreamOptions;
   timeBase_ = timeBase;
-  outputDims_ = outputDims;
-
-  // We want to use swscale for color conversion if possible because it is
-  // faster than filtergraph. The following are the conditions we need to meet
-  // to use it.
+  resizedOutputDims_ = resizedOutputDims;
 
   // We can only use swscale when we have a single resize transform. Note that
   // this means swscale will not support the case of having several,
   // back-to-back resizes. There's no strong reason to even do that, but if
   // someone does, it's more correct to implement that with filtergraph.
-  bool areTransformsSwScaleCompatible = transforms.empty() ||
+  //
+  // We calculate this value during initialization, but we don't refer to it
+  // until getColorConversionLibrary() is called. Calculating this value during
+  // initialization saves us from having to save all of the transforms.
+  areTransformsSwScaleCompatible_ = transforms.empty() ||
       (transforms.size() == 1 && transforms[0]->isResize());
 
-  // swscale requires widths to be multiples of 32:
-  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
-  bool isWidthSwScaleCompatible = (outputDims_.width % 32) == 0;
-
   // Note that we do not expose this capability in the public API, only through
   // the core API.
-  bool userRequestedSwScale = videoStreamOptions_.colorConversionLibrary ==
+  //
+  // Same as above, we calculate this value during initialization and refer to
+  // it in getColorConversionLibrary().
+  userRequestedSwScale_ = videoStreamOptions_.colorConversionLibrary ==
       ColorConversionLibrary::SWSCALE;
 
+  // We can only use swscale when we have a single resize transform. Note that
+  // we decide whether or not to actually use swscale at the last possible
+  // moment, when we actually convert the frame. This is because we need to
+  // know the actual frame dimensions.
+  if (transforms.size() == 1 && transforms[0]->isResize()) {
+    auto resize = dynamic_cast<ResizeTransform*>(transforms[0].get());
+    TORCH_CHECK(resize != nullptr, "ResizeTransform expected but not found!");
+    swsFlags_ = resize->getSwsFlags();
+  }
+
+  // If we have any transforms, replace filters_ with the filter strings from
+  // the transforms. As noted above, we decide between swscale and filtergraph
+  // when we actually decode a frame.
+  std::stringstream filters;
+  bool first = true;
+  for (const auto& transform : transforms) {
+    if (!first) {
+      filters << ",";
+    }
+    filters << transform->getFilterGraphCpu();
+    first = false;
+  }
+  if (!transforms.empty()) {
+    filters_ = filters.str();
+  }
+
+  initialized_ = true;
+}
+
+ColorConversionLibrary CpuDeviceInterface::getColorConversionLibrary(
+    const FrameDims& outputDims) {
+  // swscale requires widths to be multiples of 32:
+  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
+  bool isWidthSwScaleCompatible = (outputDims.width % 32) == 0;
+
+  // We want to use swscale for color conversion if possible because it is
+  // faster than filtergraph. The following are the conditions we need to meet
+  // to use it.
+  //
   // Note that we treat the transform limitation differently from the width
   // limitation. That is, we consider the transforms being compatible with
   // swscale as a hard requirement. If the transforms are not compatible,
@@ -86,38 +125,12 @@ void CpuDeviceInterface::initialize(
   // behavior. Since we don't expose the ability to choose swscale or
   // filtergraph in our public API, this is probably okay. It's also the only
   // way that we can be certain we are testing one versus the other.
-  if (areTransformsSwScaleCompatible &&
-      (userRequestedSwScale || isWidthSwScaleCompatible)) {
-    colorConversionLibrary_ = ColorConversionLibrary::SWSCALE;
-
-    // We established above that if the transforms are swscale compatible and
-    // non-empty, then they must have only one transform, and that transform is
-    // ResizeTransform.
-    if (!transforms.empty()) {
-      auto resize = dynamic_cast<ResizeTransform*>(transforms[0].get());
-      TORCH_CHECK(resize != nullptr, "ResizeTransform expected but not found!");
-      swsFlags_ = resize->getSwsFlags();
-    }
+  if (areTransformsSwScaleCompatible_ &&
+      (userRequestedSwScale_ || isWidthSwScaleCompatible)) {
+    return ColorConversionLibrary::SWSCALE;
   } else {
-    colorConversionLibrary_ = ColorConversionLibrary::FILTERGRAPH;
-
-    // If we have any transforms, replace filters_ with the filter strings from
-    // the transforms.
-    std::stringstream filters;
-    bool first = true;
-    for (const auto& transform : transforms) {
-      if (!first) {
-        filters << ",";
-      }
-      filters << transform->getFilterGraphCpu();
-      first = false;
-    }
-    if (!transforms.empty()) {
-      filters_ = filters.str();
-    }
+    return ColorConversionLibrary::FILTERGRAPH;
   }
-
-  initialized_ = true;
 }
 
 // Note [preAllocatedOutputTensor with swscale and filtergraph]:
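
To make the control flow above easier to follow: both booleans are frozen at initialize() time, and only the width check depends on the per-frame output size. Below is a minimal, compilable sketch of the same decision with the member state passed in as plain parameters; chooseColorConversionLibrary is a hypothetical free function, not the actual TorchCodec API.

#include <cstdint>

enum class ColorConversionLibrary { SWSCALE, FILTERGRAPH };

// Sketch only: the real code reads areTransformsSwScaleCompatible_ and
// userRequestedSwScale_ from member state set once in initialize().
ColorConversionLibrary chooseColorConversionLibrary(
    bool areTransformsSwScaleCompatible, // at most one transform, a resize
    bool userRequestedSwScale,           // core-API-only override
    int64_t outputWidth) {               // known only at frame-conversion time
  // swscale requires 32-aligned widths; an explicit user request overrides
  // the width requirement, but never the transform requirement.
  bool isWidthSwScaleCompatible = (outputWidth % 32) == 0;
  if (areTransformsSwScaleCompatible &&
      (userRequestedSwScale || isWidthSwScaleCompatible)) {
    return ColorConversionLibrary::SWSCALE;
  }
  return ColorConversionLibrary::FILTERGRAPH;
}
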
@@ -134,24 +147,42 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   TORCH_CHECK(initialized_, "CpuDeviceInterface was not initialized.");
+
+  // Note that we ignore the dimensions from the metadata; we don't even bother
+  // storing them. The resized dimensions take priority. If we don't have any,
+  // then we use the dimensions from the actual decoded frame. We use the actual
+  // decoded frame and not the metadata for two reasons:
+  //
+  // 1. Metadata may be wrong. If we have access to more accurate information,
+  //    we should use it.
+  // 2. Video streams can have variable resolution. This fact is not captured
+  //    in the stream metadata.
+  //
+  // Both cases cause problems for our batch APIs, as we allocate
+  // FrameBatchOutputs based on the stream metadata. But single-frame APIs
+  // can still work in such situations, so they should.
+  auto outputDims =
+      resizedOutputDims_.value_or(FrameDims(avFrame->width, avFrame->height));
+
   if (preAllocatedOutputTensor.has_value()) {
     auto shape = preAllocatedOutputTensor.value().sizes();
     TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == outputDims_.height) &&
-            (shape[1] == outputDims_.width) && (shape[2] == 3),
+        (shape.size() == 3) && (shape[0] == outputDims.height) &&
+            (shape[1] == outputDims.width) && (shape[2] == 3),
         "Expected pre-allocated tensor of shape ",
-        outputDims_.height,
+        outputDims.height,
         "x",
-        outputDims_.width,
+        outputDims.width,
         "x3, got ",
         shape);
   }
 
+  auto colorConversionLibrary = getColorConversionLibrary(outputDims);
   torch::Tensor outputTensor;
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);
 
-  if (colorConversionLibrary_ == ColorConversionLibrary::SWSCALE) {
+  if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
     // We need to compare the current frame context with our previous frame
     // context. If they are different, then we need to re-create our colorspace
     // conversion objects. We create our colorspace conversion objects late so
@@ -163,11 +194,11 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
         avFrame->width,
         avFrame->height,
         frameFormat,
-        outputDims_.width,
-        outputDims_.height);
+        outputDims.width,
+        outputDims.height);
 
     outputTensor = preAllocatedOutputTensor.value_or(
-        allocateEmptyHWCTensor(outputDims_, torch::kCPU));
+        allocateEmptyHWCTensor(outputDims, torch::kCPU));
 
     if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
       createSwsContext(swsFrameContext, avFrame->colorspace);
@@ -180,42 +211,42 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     // the expected height.
     // TODO: Can we do the same check for width?
     TORCH_CHECK(
-        resultHeight == outputDims_.height,
-        "resultHeight != outputDims_.height: ",
+        resultHeight == outputDims.height,
+        "resultHeight != outputDims.height: ",
         resultHeight,
         " != ",
-        outputDims_.height);
+        outputDims.height);
 
     frameOutput.data = outputTensor;
-  } else if (colorConversionLibrary_ == ColorConversionLibrary::FILTERGRAPH) {
+  } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
     FiltersContext filtersContext(
         avFrame->width,
         avFrame->height,
         frameFormat,
         avFrame->sample_aspect_ratio,
-        outputDims_.width,
-        outputDims_.height,
+        outputDims.width,
+        outputDims.height,
         AV_PIX_FMT_RGB24,
         filters_,
         timeBase_);
 
-    if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
-      filterGraphContext_ =
+    if (!filterGraph_ || prevFiltersContext_ != filtersContext) {
+      filterGraph_ =
           std::make_unique<FilterGraph>(filtersContext, videoStreamOptions_);
       prevFiltersContext_ = std::move(filtersContext);
     }
-    outputTensor = rgbAVFrameToTensor(filterGraphContext_->convert(avFrame));
+    outputTensor = rgbAVFrameToTensor(filterGraph_->convert(avFrame));
 
     // Similarly to above, if this check fails it means the frame wasn't
     // reshaped to its expected dimensions by filtergraph.
     auto shape = outputTensor.sizes();
     TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == outputDims_.height) &&
-            (shape[1] == outputDims_.width) && (shape[2] == 3),
+        (shape.size() == 3) && (shape[0] == outputDims.height) &&
+            (shape[1] == outputDims.width) && (shape[2] == 3),
         "Expected output tensor of shape ",
-        outputDims_.height,
+        outputDims.height,
         "x",
-        outputDims_.width,
+        outputDims.width,
         "x3, got ",
         shape);
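
Both conversion paths above use the same recreate-on-change caching: the FFmpeg state (swscale context or filter graph) is rebuilt only when the incoming frame's properties no longer match the ones it was built for, which is what makes variable-resolution streams work. A simplified, self-contained sketch of that pattern follows; Context and Converter are hypothetical stand-ins, not the actual SwsFrameContext/FiltersContext classes.

#include <memory>

// Hypothetical stand-in for SwsFrameContext/FiltersContext: the frame
// properties the conversion state was built for.
struct Context {
  int inWidth, inHeight, inFormat, outWidth, outHeight;
  bool operator!=(const Context& other) const {
    return inWidth != other.inWidth || inHeight != other.inHeight ||
        inFormat != other.inFormat || outWidth != other.outWidth ||
        outHeight != other.outHeight;
  }
};

struct Converter {
  explicit Converter(const Context&) {} // allocate FFmpeg state here
};

class CachedConverter {
 public:
  Converter& get(const Context& current) {
    // Rebuild only on first use or when the frame properties change,
    // e.g. a mid-stream resolution switch.
    if (!converter_ || prev_ != current) {
      converter_ = std::make_unique<Converter>(current);
      prev_ = current;
    }
    return *converter_;
  }

 private:
  std::unique_ptr<Converter> converter_;
  Context prev_{};
};
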
@@ -231,7 +262,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     TORCH_CHECK(
         false,
         "Invalid color conversion library: ",
-        static_cast<int>(colorConversionLibrary_));
+        static_cast<int>(colorConversionLibrary));
   }
 }
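
The net effect of the dimension handling in this diff is a simple priority rule: an explicit resize wins, then the decoded frame's own dimensions; the stream metadata is never consulted. A minimal sketch of that rule, with a hypothetical pickOutputDims helper and FrameDims fields assumed to be width/height:

#include <optional>

struct FrameDims {
  int width;
  int height;
};

// Hypothetical helper mirroring the value_or() fallback in
// convertAVFrameToFrameOutput(): resize dims take priority; otherwise trust
// the decoded frame, because metadata may be wrong and resolution can change
// mid-stream.
FrameDims pickOutputDims(
    const std::optional<FrameDims>& resizedOutputDims,
    int decodedWidth,
    int decodedHeight) {
  return resizedOutputDims.value_or(FrameDims{decodedWidth, decodedHeight});
}
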