From 1325603fd08144d6a6413079cbdbf2289f05544b Mon Sep 17 00:00:00 2001 From: mani Date: Thu, 26 Feb 2026 01:19:55 +0100 Subject: [PATCH] Fix CRT shader: rewrite to NV12, remove scale_opencl format conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scale_opencl does not support rgba output in this jellyfin-ffmpeg build. Rewrite the OpenCL kernel to accept and emit NV12 planes directly (src_y, src_uv, dst_y, dst_uv) doing YCbCr↔RGB conversion internally. Remove the scale_opencl=format=rgba and scale_opencl=format=nv12 wrappers from GetCrtShaderOclFilters — program_opencl alone is enough. VAAPI decoder path: hwdownload+hwupload to QSV (safe; program_opencl creates new output frames without a VAAPI reverse link). Co-Authored-By: Claude Sonnet 4.6 --- .../Resources/Shaders/crt_lottes.cl | 154 +++++++++++------- .../MediaEncoding/EncodingHelper.cs | 33 ++-- 2 files changed, 111 insertions(+), 76 deletions(-) diff --git a/Jellyfin.Server/Resources/Shaders/crt_lottes.cl b/Jellyfin.Server/Resources/Shaders/crt_lottes.cl index f0d8b7d8e2..c60e03f12f 100644 --- a/Jellyfin.Server/Resources/Shaders/crt_lottes.cl +++ b/Jellyfin.Server/Resources/Shaders/crt_lottes.cl @@ -8,8 +8,12 @@ // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. +// +// Input/output: NV12 (Y plane + interleaved CbCr plane). +// Kernel signature: (src_y, src_uv, dst_y, dst_uv) +// FFmpeg program_opencl passes one image2d_t per plane in plane order. -// ── Parameters (override via build_opts, e.g. -DSHADOW_MASK=2) ─────────────── +// ── Parameters ──────────────────────────────────────────────────────────────── #ifndef HARD_SCAN #define HARD_SCAN (-8.0f) #endif @@ -61,27 +65,29 @@ static float3 delinearize_rgb(float3 c) return c; } -// ── Texture helper ──────────────────────────────────────────────────────────── +// ── NV12 fetch helper ───────────────────────────────────────────────────────── +// Reads Y + CbCr from NV12 planes and returns linearised RGB. +// Both planes use the same normalised coordinates [0,1]; the UV plane is +// half-resolution but the sampler maps the same normalised position to the +// corresponding chroma sample automatically. -static float3 fetch_sample( - __read_only image2d_t src, - sampler_t smp, - float2 pos, // normalised (0..1) in SOURCE space - float2 off_texels, // offset in texel units - float2 src_size) -{ - float2 p = pos + off_texels / src_size; - return BRIGHTNESS_BOOST * read_imagef(src, smp, p).xyz; -} - -static float3 nearest_emulated_sample( - __read_only image2d_t src, +static float3 fetch_linear_nv12( + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, sampler_t smp, float2 pos, float2 off_texels, float2 src_size) { - return linearize_rgb(fetch_sample(src, smp, pos, off_texels, src_size)); + float2 p = pos + off_texels / src_size; + float y = BRIGHTNESS_BOOST * read_imagef(y_plane, smp, p).x; + float2 cbcr = read_imagef(uv_plane, smp, p).xy - 0.5f; // centre Cb/Cr + + // BT.709 full-range YCbCr → RGB + float r = clamp(y + 1.5748f * cbcr.y, 0.0f, 1.0f); + float g = clamp(y - 0.1873f * cbcr.x - 0.4681f * cbcr.y, 0.0f, 1.0f); + float b = clamp(y + 1.8556f * cbcr.x, 0.0f, 1.0f); + return linearize_rgb((float3)(r, g, b)); } // ── Gaussian kernel ─────────────────────────────────────────────────────────── @@ -91,7 +97,6 @@ static float gauss1d(float pos, float scale) return exp2(scale * pow(fabs(pos), SHAPE)); } -// distance from pos to its nearest texel centre (fractional part, −0.5..+0.5) static float2 distance_to_texel(float2 pos, float2 src_size) { return -1.0f * fract(pos * src_size - 0.5f); @@ -100,12 +105,14 @@ static float2 distance_to_texel(float2 pos, float2 src_size) // ── Horizontal reconstruction (3 / 5 / 7 tap) ──────────────────────────────── static float3 horz3( - __read_only image2d_t src, sampler_t smp, + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, + sampler_t smp, float2 pos, float off_y, float scale, float2 src_size) { - float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size); - float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size); - float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size); + float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size); + float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size); + float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size); float dst = distance_to_texel(pos, src_size).x; float wc = gauss1d(dst - 1.0f, scale); float wd = gauss1d(dst, scale); @@ -114,14 +121,16 @@ static float3 horz3( } static float3 horz5( - __read_only image2d_t src, sampler_t smp, + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, + sampler_t smp, float2 pos, float off_y, float scale, float2 src_size) { - float3 b = nearest_emulated_sample(src, smp, pos, (float2)(-2.0f, off_y), src_size); - float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size); - float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size); - float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size); - float3 f = nearest_emulated_sample(src, smp, pos, (float2)( 2.0f, off_y), src_size); + float3 b = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-2.0f, off_y), src_size); + float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size); + float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size); + float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size); + float3 f = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 2.0f, off_y), src_size); float dst = distance_to_texel(pos, src_size).x; float wb = gauss1d(dst - 2.0f, scale); float wc = gauss1d(dst - 1.0f, scale); @@ -132,16 +141,18 @@ static float3 horz5( } static float3 horz7( - __read_only image2d_t src, sampler_t smp, + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, + sampler_t smp, float2 pos, float off_y, float scale, float2 src_size) { - float3 a = nearest_emulated_sample(src, smp, pos, (float2)(-3.0f, off_y), src_size); - float3 b = nearest_emulated_sample(src, smp, pos, (float2)(-2.0f, off_y), src_size); - float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size); - float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size); - float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size); - float3 f = nearest_emulated_sample(src, smp, pos, (float2)( 2.0f, off_y), src_size); - float3 g = nearest_emulated_sample(src, smp, pos, (float2)( 3.0f, off_y), src_size); + float3 a = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-3.0f, off_y), src_size); + float3 b = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-2.0f, off_y), src_size); + float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size); + float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size); + float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size); + float3 f = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 2.0f, off_y), src_size); + float3 g = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 3.0f, off_y), src_size); float dst = distance_to_texel(pos, src_size).x; float wa = gauss1d(dst - 3.0f, scale); float wb = gauss1d(dst - 2.0f, scale); @@ -181,12 +192,14 @@ static float bloom_scan_weight(float2 pos, float off, float2 src_size) // ── Main CRT reconstruction ─────────────────────────────────────────────────── static float3 tri( - __read_only image2d_t src, sampler_t smp, + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, + sampler_t smp, float2 pos, float2 src_size) { - float3 a = horz3(src, smp, pos, -1.0f, -10.0f, src_size); - float3 b = horz5(src, smp, pos, 0.0f, -10.0f, src_size); - float3 c = horz3(src, smp, pos, 1.0f, -10.0f, src_size); + float3 a = horz3(y_plane, uv_plane, smp, pos, -1.0f, -10.0f, src_size); + float3 b = horz5(y_plane, uv_plane, smp, pos, 0.0f, -10.0f, src_size); + float3 c = horz3(y_plane, uv_plane, smp, pos, 1.0f, -10.0f, src_size); float wa = scan_weight(pos, -1.0f, src_size); float wb = scan_weight(pos, 0.0f, src_size); float wc = scan_weight(pos, 1.0f, src_size); @@ -194,14 +207,16 @@ static float3 tri( } static float3 bloom( - __read_only image2d_t src, sampler_t smp, + __read_only image2d_t y_plane, + __read_only image2d_t uv_plane, + sampler_t smp, float2 pos, float2 src_size) { - float3 a = horz5(src, smp, pos, -2.0f, -3.0f, src_size); - float3 b = horz7(src, smp, pos, -1.0f, -1.5f, src_size); - float3 c = horz7(src, smp, pos, 0.0f, -1.5f, src_size); - float3 d = horz7(src, smp, pos, 1.0f, -1.5f, src_size); - float3 e = horz5(src, smp, pos, 2.0f, -3.0f, src_size); + float3 a = horz5(y_plane, uv_plane, smp, pos, -2.0f, -3.0f, src_size); + float3 b = horz7(y_plane, uv_plane, smp, pos, -1.0f, -1.5f, src_size); + float3 c = horz7(y_plane, uv_plane, smp, pos, 0.0f, -1.5f, src_size); + float3 d = horz7(y_plane, uv_plane, smp, pos, 1.0f, -1.5f, src_size); + float3 e = horz5(y_plane, uv_plane, smp, pos, 2.0f, -3.0f, src_size); float wa = bloom_scan_weight(pos, -2.0f, src_size); float wb = bloom_scan_weight(pos, -1.0f, src_size); float wc = bloom_scan_weight(pos, 0.0f, src_size); @@ -252,44 +267,48 @@ static float3 apply_mask(float2 px) } // ── Entry point ─────────────────────────────────────────────────────────────── +// NV12: FFmpeg program_opencl passes planes in order, so for 2-plane NV12: +// arg 0 = src_y (input Y, R channel, full resolution) +// arg 1 = src_uv (input UV, RG channels, half resolution) +// arg 2 = dst_y (output Y, full resolution) +// arg 3 = dst_uv (output UV, half resolution) +// Global work size is set to dst_y dimensions (full resolution). __kernel void crt_lottes( - __read_only image2d_t src, - __write_only image2d_t dst) + __read_only image2d_t src_y, + __read_only image2d_t src_uv, + __write_only image2d_t dst_y, + __write_only image2d_t dst_uv) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); - const int dst_w = get_image_width(dst); - const int dst_h = get_image_height(dst); + const int dst_w = get_image_width(dst_y); + const int dst_h = get_image_height(dst_y); if (coord.x >= dst_w || coord.y >= dst_h) return; - const int src_w = get_image_width(src); - const int src_h = get_image_height(src); + const int src_w = get_image_width(src_y); + const int src_h = get_image_height(src_y); const float2 dst_size = (float2)(dst_w, dst_h); const float2 src_size = (float2)(src_w, src_h); - // Linear (normalised) position in output space + // Normalised position in output space const float2 out_pos = ((float2)(coord.x, coord.y) + 0.5f) / dst_size; - // Sampler: normalised coords + linear filter + clamp-to-edge + // Sampler: normalised coords, linear filter, clamp-to-edge const sampler_t smp = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; - // Map to source coords (src may differ from dst when upscaling) - // With FFmpeg program_opencl output resolution matches input (same frame size). - const float2 src_pos = out_pos; - // Apply CRT barrel-curvature - float2 bent = bend_screen(src_pos); + float2 bent = bend_screen(out_pos); // Main scanline reconstruction + bloom - float3 color = tri(src, smp, bent, src_size); - color += bloom(src, smp, bent, src_size) * BLOOM_AMOUNT; + float3 color = tri(src_y, src_uv, smp, bent, src_size); + color += bloom(src_y, src_uv, smp, bent, src_size) * BLOOM_AMOUNT; // Shadow mask #if SHADOW_MASK != 0 @@ -301,7 +320,18 @@ __kernel void crt_lottes( int in_bounds = (bent.x >= 0.0f && bent.x <= 1.0f && bent.y >= 0.0f && bent.y <= 1.0f) ? 1 : 0; - float3 result = in_bounds ? delinearize_rgb(color) : (float3)(0.0f); + float3 rgb = in_bounds ? delinearize_rgb(color) : (float3)(0.0f); - write_imagef(dst, coord, (float4)(result, 1.0f)); + // BT.709 full-range RGB → YCbCr + float y_out = 0.2126f * rgb.x + 0.7152f * rgb.y + 0.0722f * rgb.z; + float cb_out = -0.1146f * rgb.x - 0.3854f * rgb.y + 0.5000f * rgb.z + 0.5f; + float cr_out = 0.5000f * rgb.x - 0.4542f * rgb.y - 0.0458f * rgb.z + 0.5f; + + // Write Y at full resolution + write_imagef(dst_y, coord, (float4)(y_out, 0.0f, 0.0f, 1.0f)); + + // Write UV at half resolution (one thread per 2x2 Y block, no races) + if ((coord.x & 1) == 0 && (coord.y & 1) == 0) { + write_imagef(dst_uv, coord >> 1, (float4)(cb_out, cr_out, 0.0f, 1.0f)); + } } diff --git a/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs b/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs index f3fcf69511..759c9aaaa0 100644 --- a/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs +++ b/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs @@ -3764,12 +3764,12 @@ namespace MediaBrowser.Controller.MediaEncoding var escapedPath = GetCrtEscapedShaderPath(); var buildOpts = GetCrtBuildOpts(state); + // No scale_opencl format conversion needed: the shader reads and writes + // NV12 planes directly (src_y, src_uv, dst_y, dst_uv). return [ - "scale_opencl=format=rgba", FormattableString.Invariant( - $"program_opencl=source={escapedPath}:kernel=crt_lottes"), - "scale_opencl=format=nv12" + $"program_opencl=source={escapedPath}:kernel=crt_lottes") ]; } @@ -3789,8 +3789,9 @@ namespace MediaBrowser.Controller.MediaEncoding var escapedPath = GetCrtEscapedShaderPath(); var buildOpts = GetCrtBuildOpts(state); + // Shader works with NV12 planes directly; no format conversion needed. return FormattableString.Invariant( - $"format=rgba,hwupload=derive_device=opencl,program_opencl=source={escapedPath}:kernel=crt_lottes,hwdownload,format=yuv420p"); + $"hwupload=derive_device=opencl,program_opencl=source={escapedPath}:kernel=crt_lottes,hwdownload,format=nv12"); } /// @@ -4924,19 +4925,23 @@ namespace MediaBrowser.Controller.MediaEncoding { if (IsCrtShaderEnabled(state)) { - // VAAPI → OpenCL → CRT → VAAPI (reverse=1) → QSV. - // Mirrors the doOclTonemap+isVaInVaOut pattern: derive OpenCL - // from VAAPI, process, then reverse-map back to VAAPI, and - // finally derive QSV (zero-copy, same libva surface). + // VAAPI → OpenCL → CRT (NV12) → CPU → QSV. + // program_opencl outputs NV12 frames into its own pool; we cannot + // rely on hwmap reverse=1 back to VAAPI from those new frames. + // hwdownload then hwupload (-filter_hw_device qsv) is the safe path. mainFilters.Add("hwmap=derive_device=opencl:mode=read"); mainFilters.AddRange(GetCrtShaderOclFilters(state)); - mainFilters.Add("hwmap=derive_device=vaapi:mode=write:reverse=1"); - mainFilters.Add("format=vaapi"); + mainFilters.Add("hwdownload"); + mainFilters.Add("format=nv12"); + mainFilters.Add("hwupload"); + mainFilters.Add("format=qsv"); + } + else + { + // VAAPI → QSV (zero-copy, shared libva surface) + mainFilters.Add("hwmap=derive_device=qsv"); + mainFilters.Add("format=qsv"); } - - // VAAPI → QSV (zero-copy, shared libva surface) - mainFilters.Add("hwmap=derive_device=qsv"); - mainFilters.Add("format=qsv"); } else if (isQsvDecoder) {