@@ -178,33 +178,13 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
 
   auto colorConversionLibrary = getColorConversionLibrary(outputDims);
   torch::Tensor outputTensor;
-  enum AVPixelFormat frameFormat =
-      static_cast<enum AVPixelFormat>(avFrame->format);
 
   if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-    // We need to compare the current frame context with our previous frame
-    // context. If they are different, then we need to re-create our colorspace
-    // conversion objects. We create our colorspace conversion objects late so
-    // that we don't have to depend on the unreliable metadata in the header.
-    // And we sometimes re-create them because it's possible for frame
-    // resolution to change mid-stream. Finally, we want to reuse the colorspace
-    // conversion objects as much as possible for performance reasons.
-    SwsFrameContext swsFrameContext(
-        avFrame->width,
-        avFrame->height,
-        frameFormat,
-        outputDims.width,
-        outputDims.height);
-
     outputTensor = preAllocatedOutputTensor.value_or(
         allocateEmptyHWCTensor(outputDims, torch::kCPU));
 
-    if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
-      createSwsContext(swsFrameContext, avFrame->colorspace);
-      prevSwsFrameContext_ = swsFrameContext;
-    }
     int resultHeight =
-        convertAVFrameToTensorUsingSwScale(avFrame, outputTensor);
+        convertAVFrameToTensorUsingSwScale(avFrame, outputTensor, outputDims);
 
     // If this check failed, it would mean that the frame wasn't reshaped to
     // the expected height.
@@ -218,23 +198,7 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
 
     frameOutput.data = outputTensor;
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-    FiltersContext filtersContext(
-        avFrame->width,
-        avFrame->height,
-        frameFormat,
-        avFrame->sample_aspect_ratio,
-        outputDims.width,
-        outputDims.height,
-        AV_PIX_FMT_RGB24,
-        filters_,
-        timeBase_);
-
-    if (!filterGraph_ || prevFiltersContext_ != filtersContext) {
-      filterGraph_ =
-          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions_);
-      prevFiltersContext_ = std::move(filtersContext);
-    }
-    outputTensor = rgbAVFrameToTensor(filterGraph_->convert(avFrame));
+    outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame, outputDims);
 
     // Similarly to above, if this check fails it means the frame wasn't
     // reshaped to its expected dimensions by filtergraph.
@@ -267,7 +231,30 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
 
 int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
     const UniqueAVFrame& avFrame,
-    torch::Tensor& outputTensor) {
+    torch::Tensor& outputTensor,
+    const FrameDims& outputDims) {
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+
+  // We need to compare the current frame context with our previous frame
+  // context. If they are different, then we need to re-create our colorspace
+  // conversion objects. We create our colorspace conversion objects late so
+  // that we don't have to depend on the unreliable metadata in the header.
+  // And we sometimes re-create them because it's possible for frame
+  // resolution to change mid-stream. Finally, we want to reuse the colorspace
+  // conversion objects as much as possible for performance reasons.
+  SwsFrameContext swsFrameContext(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      outputDims.width,
+      outputDims.height);
+
+  if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
+    createSwsContext(swsFrameContext, avFrame->colorspace);
+    prevSwsFrameContext_ = swsFrameContext;
+  }
+
   uint8_t* pointers[4] = {
       outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr};
   int expectedOutputWidth = outputTensor.sizes()[1];
@@ -293,7 +280,7 @@ void CpuDeviceInterface::createSwsContext(
       swsFrameContext.outputWidth,
       swsFrameContext.outputHeight,
       AV_PIX_FMT_RGB24,
-      SWS_BILINEAR,
+      swsFlags_,
       nullptr,
       nullptr,
       nullptr);
@@ -328,4 +315,29 @@ void CpuDeviceInterface::createSwsContext(
   swsContext_.reset(swsContext);
 }
 
+torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
+    const UniqueAVFrame& avFrame,
+    const FrameDims& outputDims) {
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+
+  FiltersContext filtersContext(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      avFrame->sample_aspect_ratio,
+      outputDims.width,
+      outputDims.height,
+      AV_PIX_FMT_RGB24,
+      filters_,
+      timeBase_);
+
+  if (!filterGraph_ || prevFiltersContext_ != filtersContext) {
+    filterGraph_ =
+        std::make_unique<FilterGraph>(filtersContext, videoStreamOptions_);
+    prevFiltersContext_ = std::move(filtersContext);
+  }
+  return rgbAVFrameToTensor(filterGraph_->convert(avFrame));
+}
+
 } // namespace facebook::torchcodec
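
Aside: both new helpers rely on the same lazy-recreation idiom described in the comment above: build the expensive conversion object only on first use or when the observed frame context changes, and reuse it otherwise. Below is a minimal standalone sketch of that idiom; FrameContext, Converter, and FrameConverterCache are hypothetical names used for illustration and are not part of torchcodec or FFmpeg.

#include <memory>

// Hypothetical stand-ins for the SwsFrameContext / FiltersContext state used
// in the diff above.
struct FrameContext {
  int srcWidth;
  int srcHeight;
  int dstWidth;
  int dstHeight;

  bool operator==(const FrameContext& other) const {
    return srcWidth == other.srcWidth && srcHeight == other.srcHeight &&
        dstWidth == other.dstWidth && dstHeight == other.dstHeight;
  }
  bool operator!=(const FrameContext& other) const {
    return !(*this == other);
  }
};

struct Converter {
  explicit Converter(const FrameContext& /*ctx*/) {
    // Expensive setup (e.g. allocating a scaler or filter graph) goes here.
  }
};

class FrameConverterCache {
 public:
  // Returns a converter valid for ctx, recreating it only when ctx differs
  // from the context seen last time (e.g. resolution changed mid-stream).
  // This mirrors the `!swsContext_ || prevSwsFrameContext_ != swsFrameContext`
  // and `!filterGraph_ || prevFiltersContext_ != filtersContext` checks above.
  Converter& get(const FrameContext& ctx) {
    if (!converter_ || prevContext_ != ctx) {
      converter_ = std::make_unique<Converter>(ctx);
      prevContext_ = ctx;
    }
    return *converter_;
  }

 private:
  std::unique_ptr<Converter> converter_;
  FrameContext prevContext_{};
};

int main() {
  FrameConverterCache cache;
  FrameContext hd{1920, 1080, 640, 480};
  FrameContext sd{1280, 720, 640, 480};

  cache.get(hd); // first call: creates the converter
  cache.get(hd); // same context: reuses it
  cache.get(sd); // context changed: recreates it
  return 0;
}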