Fix CRT shader: rewrite to NV12, remove scale_opencl format conversions

scale_opencl does not support rgba output in this jellyfin-ffmpeg build. Rewrite the OpenCL kernel to accept and emit NV12 planes directly (src_y, src_uv, dst_y, dst_uv) doing YCbCr↔RGB conversion internally. Remove the scale_opencl=format=rgba and scale_opencl=format=nv12 wrappers from GetCrtShaderOclFilters — program_opencl alone is enough. VAAPI decoder path: hwdownload+hwupload to QSV (safe; program_opencl creates new output frames without a VAAPI reverse link). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 01:19:55 +01:00
parent 26668426d4
commit 1325603fd0
2 changed files with 111 additions and 76 deletions
--- a/Jellyfin.Server/Resources/Shaders/crt_lottes.cl
+++ b/Jellyfin.Server/Resources/Shaders/crt_lottes.cl
@@ -8,8 +8,12 @@
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
+//
+// Input/output: NV12 (Y plane + interleaved CbCr plane).
+// Kernel signature: (src_y, src_uv, dst_y, dst_uv)
+// FFmpeg program_opencl passes one image2d_t per plane in plane order.

-// ── Parameters (override via build_opts, e.g. -DSHADOW_MASK=2) ───────────────
+// ── Parameters ────────────────────────────────────────────────────────────────
 #ifndef HARD_SCAN
 #define HARD_SCAN       (-8.0f)
 #endif
@@ -61,27 +65,29 @@ static float3 delinearize_rgb(float3 c)
    return c;
 }

-// ── Texture helper ────────────────────────────────────────────────────────────
+// ── NV12 fetch helper ─────────────────────────────────────────────────────────
+// Reads Y + CbCr from NV12 planes and returns linearised RGB.
+// Both planes use the same normalised coordinates [0,1]; the UV plane is
+// half-resolution but the sampler maps the same normalised position to the
+// corresponding chroma sample automatically.

-static float3 fetch_sample(
-    __read_only image2d_t src,
-    sampler_t             smp,
-    float2                pos,          // normalised (0..1) in SOURCE space
-    float2                off_texels,   // offset in texel units
-    float2                src_size)
-{
-    float2 p = pos + off_texels / src_size;
-    return BRIGHTNESS_BOOST * read_imagef(src, smp, p).xyz;
-}
-
-static float3 nearest_emulated_sample(
-    __read_only image2d_t src,
+static float3 fetch_linear_nv12(
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
    sampler_t             smp,
    float2                pos,
    float2                off_texels,
    float2                src_size)
 {
-    return linearize_rgb(fetch_sample(src, smp, pos, off_texels, src_size));
+    float2 p    = pos + off_texels / src_size;
+    float  y    = BRIGHTNESS_BOOST * read_imagef(y_plane,  smp, p).x;
+    float2 cbcr = read_imagef(uv_plane, smp, p).xy - 0.5f; // centre Cb/Cr
+
+    // BT.709 full-range YCbCr → RGB
+    float r = clamp(y + 1.5748f * cbcr.y,                    0.0f, 1.0f);
+    float g = clamp(y - 0.1873f * cbcr.x - 0.4681f * cbcr.y, 0.0f, 1.0f);
+    float b = clamp(y + 1.8556f * cbcr.x,                    0.0f, 1.0f);
+    return linearize_rgb((float3)(r, g, b));
 }

 // ── Gaussian kernel ───────────────────────────────────────────────────────────
@@ -91,7 +97,6 @@ static float gauss1d(float pos, float scale)
    return exp2(scale * pow(fabs(pos), SHAPE));
 }

-// distance from pos to its nearest texel centre (fractional part, −0.5..+0.5)
 static float2 distance_to_texel(float2 pos, float2 src_size)
 {
    return -1.0f * fract(pos * src_size - 0.5f);
@@ -100,12 +105,14 @@ static float2 distance_to_texel(float2 pos, float2 src_size)
 // ── Horizontal reconstruction (3 / 5 / 7 tap) ────────────────────────────────

 static float3 horz3(
-    __read_only image2d_t src, sampler_t smp,
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
+    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
 {
-    float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size);
-    float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size);
-    float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size);
+    float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size);
+    float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size);
+    float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size);
    float dst = distance_to_texel(pos, src_size).x;
    float wc = gauss1d(dst - 1.0f, scale);
    float wd = gauss1d(dst,         scale);
@@ -114,14 +121,16 @@ static float3 horz3(
 }

 static float3 horz5(
-    __read_only image2d_t src, sampler_t smp,
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
+    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
 {
-    float3 b = nearest_emulated_sample(src, smp, pos, (float2)(-2.0f, off_y), src_size);
-    float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size);
-    float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size);
-    float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size);
-    float3 f = nearest_emulated_sample(src, smp, pos, (float2)( 2.0f, off_y), src_size);
+    float3 b = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-2.0f, off_y), src_size);
+    float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size);
+    float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size);
+    float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size);
+    float3 f = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 2.0f, off_y), src_size);
    float dst = distance_to_texel(pos, src_size).x;
    float wb = gauss1d(dst - 2.0f, scale);
    float wc = gauss1d(dst - 1.0f, scale);
@@ -132,16 +141,18 @@ static float3 horz5(
 }

 static float3 horz7(
-    __read_only image2d_t src, sampler_t smp,
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
+    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
 {
-    float3 a = nearest_emulated_sample(src, smp, pos, (float2)(-3.0f, off_y), src_size);
-    float3 b = nearest_emulated_sample(src, smp, pos, (float2)(-2.0f, off_y), src_size);
-    float3 c = nearest_emulated_sample(src, smp, pos, (float2)(-1.0f, off_y), src_size);
-    float3 d = nearest_emulated_sample(src, smp, pos, (float2)( 0.0f, off_y), src_size);
-    float3 e = nearest_emulated_sample(src, smp, pos, (float2)( 1.0f, off_y), src_size);
-    float3 f = nearest_emulated_sample(src, smp, pos, (float2)( 2.0f, off_y), src_size);
-    float3 g = nearest_emulated_sample(src, smp, pos, (float2)( 3.0f, off_y), src_size);
+    float3 a = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-3.0f, off_y), src_size);
+    float3 b = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-2.0f, off_y), src_size);
+    float3 c = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)(-1.0f, off_y), src_size);
+    float3 d = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 0.0f, off_y), src_size);
+    float3 e = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 1.0f, off_y), src_size);
+    float3 f = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 2.0f, off_y), src_size);
+    float3 g = fetch_linear_nv12(y_plane, uv_plane, smp, pos, (float2)( 3.0f, off_y), src_size);
    float dst = distance_to_texel(pos, src_size).x;
    float wa = gauss1d(dst - 3.0f, scale);
    float wb = gauss1d(dst - 2.0f, scale);
@@ -181,12 +192,14 @@ static float bloom_scan_weight(float2 pos, float off, float2 src_size)
 // ── Main CRT reconstruction ───────────────────────────────────────────────────

 static float3 tri(
-    __read_only image2d_t src, sampler_t smp,
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
+    sampler_t smp,
    float2 pos, float2 src_size)
 {
-    float3 a = horz3(src, smp, pos, -1.0f, -10.0f, src_size);
-    float3 b = horz5(src, smp, pos,  0.0f, -10.0f, src_size);
-    float3 c = horz3(src, smp, pos,  1.0f, -10.0f, src_size);
+    float3 a = horz3(y_plane, uv_plane, smp, pos, -1.0f, -10.0f, src_size);
+    float3 b = horz5(y_plane, uv_plane, smp, pos,  0.0f, -10.0f, src_size);
+    float3 c = horz3(y_plane, uv_plane, smp, pos,  1.0f, -10.0f, src_size);
    float wa = scan_weight(pos, -1.0f, src_size);
    float wb = scan_weight(pos,  0.0f, src_size);
    float wc = scan_weight(pos,  1.0f, src_size);
@@ -194,14 +207,16 @@ static float3 tri(
 }

 static float3 bloom(
-    __read_only image2d_t src, sampler_t smp,
+    __read_only image2d_t y_plane,
+    __read_only image2d_t uv_plane,
+    sampler_t smp,
    float2 pos, float2 src_size)
 {
-    float3 a = horz5(src, smp, pos, -2.0f, -3.0f,  src_size);
-    float3 b = horz7(src, smp, pos, -1.0f, -1.5f,  src_size);
-    float3 c = horz7(src, smp, pos,  0.0f, -1.5f,  src_size);
-    float3 d = horz7(src, smp, pos,  1.0f, -1.5f,  src_size);
-    float3 e = horz5(src, smp, pos,  2.0f, -3.0f,  src_size);
+    float3 a = horz5(y_plane, uv_plane, smp, pos, -2.0f, -3.0f,  src_size);
+    float3 b = horz7(y_plane, uv_plane, smp, pos, -1.0f, -1.5f,  src_size);
+    float3 c = horz7(y_plane, uv_plane, smp, pos,  0.0f, -1.5f,  src_size);
+    float3 d = horz7(y_plane, uv_plane, smp, pos,  1.0f, -1.5f,  src_size);
+    float3 e = horz5(y_plane, uv_plane, smp, pos,  2.0f, -3.0f,  src_size);
    float wa = bloom_scan_weight(pos, -2.0f, src_size);
    float wb = bloom_scan_weight(pos, -1.0f, src_size);
    float wc = bloom_scan_weight(pos,  0.0f, src_size);
@@ -252,44 +267,48 @@ static float3 apply_mask(float2 px)
 }

 // ── Entry point ───────────────────────────────────────────────────────────────
+// NV12: FFmpeg program_opencl passes planes in order, so for 2-plane NV12:
+//   arg 0 = src_y  (input Y,  R channel, full resolution)
+//   arg 1 = src_uv (input UV, RG channels, half resolution)
+//   arg 2 = dst_y  (output Y,  full resolution)
+//   arg 3 = dst_uv (output UV, half resolution)
+// Global work size is set to dst_y dimensions (full resolution).

 __kernel void crt_lottes(
-    __read_only  image2d_t src,
-    __write_only image2d_t dst)
+    __read_only  image2d_t src_y,
+    __read_only  image2d_t src_uv,
+    __write_only image2d_t dst_y,
+    __write_only image2d_t dst_uv)
 {
    int2 coord = (int2)(get_global_id(0), get_global_id(1));

-    const int dst_w = get_image_width(dst);
-    const int dst_h = get_image_height(dst);
+    const int dst_w = get_image_width(dst_y);
+    const int dst_h = get_image_height(dst_y);

    if (coord.x >= dst_w || coord.y >= dst_h)
        return;

-    const int src_w = get_image_width(src);
-    const int src_h = get_image_height(src);
+    const int src_w = get_image_width(src_y);
+    const int src_h = get_image_height(src_y);

    const float2 dst_size = (float2)(dst_w, dst_h);
    const float2 src_size = (float2)(src_w, src_h);

-    // Linear (normalised) position in output space
+    // Normalised position in output space
    const float2 out_pos = ((float2)(coord.x, coord.y) + 0.5f) / dst_size;

-    // Sampler: normalised coords + linear filter + clamp-to-edge
+    // Sampler: normalised coords, linear filter, clamp-to-edge
    const sampler_t smp =
        CLK_NORMALIZED_COORDS_TRUE  |
        CLK_ADDRESS_CLAMP_TO_EDGE   |
        CLK_FILTER_LINEAR;

-    // Map to source coords (src may differ from dst when upscaling)
-    // With FFmpeg program_opencl output resolution matches input (same frame size).
-    const float2 src_pos = out_pos;
-
    // Apply CRT barrel-curvature
-    float2 bent = bend_screen(src_pos);
+    float2 bent = bend_screen(out_pos);

    // Main scanline reconstruction + bloom
-    float3 color = tri(src, smp, bent, src_size);
-    color += bloom(src, smp, bent, src_size) * BLOOM_AMOUNT;
+    float3 color = tri(src_y, src_uv, smp, bent, src_size);
+    color += bloom(src_y, src_uv, smp, bent, src_size) * BLOOM_AMOUNT;

    // Shadow mask
 #if SHADOW_MASK != 0
@@ -301,7 +320,18 @@ __kernel void crt_lottes(
    int in_bounds = (bent.x >= 0.0f && bent.x <= 1.0f &&
                     bent.y >= 0.0f && bent.y <= 1.0f) ? 1 : 0;

-    float3 result = in_bounds ? delinearize_rgb(color) : (float3)(0.0f);
+    float3 rgb = in_bounds ? delinearize_rgb(color) : (float3)(0.0f);

-    write_imagef(dst, coord, (float4)(result, 1.0f));
+    // BT.709 full-range RGB → YCbCr
+    float y_out  =  0.2126f * rgb.x + 0.7152f * rgb.y + 0.0722f * rgb.z;
+    float cb_out = -0.1146f * rgb.x - 0.3854f * rgb.y + 0.5000f * rgb.z + 0.5f;
+    float cr_out =  0.5000f * rgb.x - 0.4542f * rgb.y - 0.0458f * rgb.z + 0.5f;
+
+    // Write Y at full resolution
+    write_imagef(dst_y, coord, (float4)(y_out, 0.0f, 0.0f, 1.0f));
+
+    // Write UV at half resolution (one thread per 2x2 Y block, no races)
+    if ((coord.x & 1) == 0 && (coord.y & 1) == 0) {
+        write_imagef(dst_uv, coord >> 1, (float4)(cb_out, cr_out, 0.0f, 1.0f));
+    }
 }
--- a/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs
+++ b/MediaBrowser.Controller/MediaEncoding/EncodingHelper.cs
@@ -3764,12 +3764,12 @@ namespace MediaBrowser.Controller.MediaEncoding
            var escapedPath = GetCrtEscapedShaderPath();
            var buildOpts = GetCrtBuildOpts(state);

+            // No scale_opencl format conversion needed: the shader reads and writes
+            // NV12 planes directly (src_y, src_uv, dst_y, dst_uv).
            return
            [
-                "scale_opencl=format=rgba",
                FormattableString.Invariant(
-                    $"program_opencl=source={escapedPath}:kernel=crt_lottes"),
-                "scale_opencl=format=nv12"
+                    $"program_opencl=source={escapedPath}:kernel=crt_lottes")
            ];
        }

@@ -3789,8 +3789,9 @@ namespace MediaBrowser.Controller.MediaEncoding
            var escapedPath = GetCrtEscapedShaderPath();
            var buildOpts = GetCrtBuildOpts(state);

+            // Shader works with NV12 planes directly; no format conversion needed.
            return FormattableString.Invariant(
-                $"format=rgba,hwupload=derive_device=opencl,program_opencl=source={escapedPath}:kernel=crt_lottes,hwdownload,format=yuv420p");
+                $"hwupload=derive_device=opencl,program_opencl=source={escapedPath}:kernel=crt_lottes,hwdownload,format=nv12");
        }

        /// <summary>
@@ -4924,20 +4925,24 @@ namespace MediaBrowser.Controller.MediaEncoding
                {
                    if (IsCrtShaderEnabled(state))
                    {
-                        // VAAPI → OpenCL → CRT → VAAPI (reverse=1) → QSV.
-                        // Mirrors the doOclTonemap+isVaInVaOut pattern: derive OpenCL
-                        // from VAAPI, process, then reverse-map back to VAAPI, and
-                        // finally derive QSV (zero-copy, same libva surface).
+                        // VAAPI → OpenCL → CRT (NV12) → CPU → QSV.
+                        // program_opencl outputs NV12 frames into its own pool; we cannot
+                        // rely on hwmap reverse=1 back to VAAPI from those new frames.
+                        // hwdownload then hwupload (-filter_hw_device qsv) is the safe path.
                        mainFilters.Add("hwmap=derive_device=opencl:mode=read");
                        mainFilters.AddRange(GetCrtShaderOclFilters(state));
-                        mainFilters.Add("hwmap=derive_device=vaapi:mode=write:reverse=1");
-                        mainFilters.Add("format=vaapi");
+                        mainFilters.Add("hwdownload");
+                        mainFilters.Add("format=nv12");
+                        mainFilters.Add("hwupload");
+                        mainFilters.Add("format=qsv");
                    }
-
+                    else
+                    {
                        // VAAPI → QSV (zero-copy, shared libva surface)
                        mainFilters.Add("hwmap=derive_device=qsv");
                        mainFilters.Add("format=qsv");
                    }
+                }
                else if (isQsvDecoder)
                {
                    if (IsCrtShaderEnabled(state))