Files
jellyfin/Jellyfin.Server/Resources/Shaders/crt_lottes.cl
mani 1325603fd0 Fix CRT shader: rewrite to NV12, remove scale_opencl format conversions
scale_opencl does not support rgba output in this jellyfin-ffmpeg build.
Rewrite the OpenCL kernel to accept and emit NV12 planes directly
(src_y, src_uv, dst_y, dst_uv) doing YCbCr↔RGB conversion internally.
Remove the scale_opencl=format=rgba and scale_opencl=format=nv12
wrappers from GetCrtShaderOclFilters — program_opencl alone is enough.

VAAPI decoder path: hwdownload+hwupload to QSV (safe; program_opencl
creates new output frames without a VAAPI reverse link).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 01:19:55 +01:00

338 lines
13 KiB
OpenCL C

// CRT Lottes shader OpenCL implementation for FFmpeg program_opencl
// Port of Timothy Lottes' CRT shader (public domain).
// Adapted from the mpv-retro-shaders GLSL version.
//
// Copyright (c) 2022, The mpv-retro-shaders Contributors
// Copyright (c) 2024, Jellyfin Contributors
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// Input/output: NV12 (Y plane + interleaved CbCr plane).
// Kernel signature: (src_y, src_uv, dst_y, dst_uv)
// FFmpeg program_opencl passes one image2d_t per plane in plane order.
// Parameters
#ifndef HARD_SCAN
#define HARD_SCAN (-8.0f)
#endif
#ifndef CURVATURE_X
#define CURVATURE_X (0.031f)
#endif
#ifndef CURVATURE_Y
#define CURVATURE_Y (0.041f)
#endif
#ifndef MASK_DARK
#define MASK_DARK (0.5f)
#endif
#ifndef MASK_LIGHT
#define MASK_LIGHT (1.5f)
#endif
#ifndef SHADOW_MASK
#define SHADOW_MASK 2
#endif
#ifndef BRIGHTNESS_BOOST
#define BRIGHTNESS_BOOST (1.0f)
#endif
#ifndef HARD_BLOOM_SCAN
#define HARD_BLOOM_SCAN (-2.0f)
#endif
#ifndef BLOOM_AMOUNT
#define BLOOM_AMOUNT (1.0f / 16.0f)
#endif
#ifndef SHAPE
#define SHAPE (2.0f)
#endif
// sRGB <-> linear transfer helpers
// Approximate sRGB -> linear transfer: linear = k1 * (srgb + k0)^2.4.
// Negative inputs are clamped to 0 before the power to avoid NaNs.
static float3 linearize_rgb(float3 srgb)
{
    const float offset = 0.05958483740687370300f;
    const float gain = 0.87031054496765136718f;
    float3 v = max(srgb, 0.0f);
    return gain * pow(v + (float3)(offset), (float3)(2.4f));
}
// Approximate linear -> sRGB transfer (inverse of linearize_rgb):
// srgb = (k1 * linear)^(1/2.4) - k0, with negatives clamped first.
static float3 delinearize_rgb(float3 lin)
{
    const float offset = 0.05958483740687370300f;
    const float gain = 1.14901518821716308593f;
    float3 v = max(lin, 0.0f);
    return pow(gain * v, (float3)(1.0f / 2.4f)) - (float3)(offset);
}
// NV12 fetch helper
// Reads Y + CbCr from NV12 planes and returns linearised RGB.
// Both planes use the same normalised coordinates [0,1]; the UV plane is
// half-resolution but the sampler maps the same normalised position to the
// corresponding chroma sample automatically.
// Sample the NV12 source at `pos` plus an offset given in luma texels,
// convert YCbCr -> RGB, and return the result in linear light.
//
// y_plane:    full-resolution Y plane (value in .x)
// uv_plane:   half-resolution interleaved CbCr plane (values in .xy)
// smp:        normalised-coordinate linear sampler (shared by both planes)
// pos:        normalised sample position [0,1]^2
// off_texels: offset in luma-texel units, converted to normalised units here
// src_size:   luma plane dimensions in texels
static float3 fetch_linear_nv12(
__read_only image2d_t y_plane,
__read_only image2d_t uv_plane,
sampler_t smp,
float2 pos,
float2 off_texels,
float2 src_size)
{
// Texel offset -> normalised offset (relative to the full-res Y plane).
float2 p = pos + off_texels / src_size;
// BRIGHTNESS_BOOST pre-scales luma before the matrix; defaults to 1.0.
float y = BRIGHTNESS_BOOST * read_imagef(y_plane, smp, p).x;
float2 cbcr = read_imagef(uv_plane, smp, p).xy - 0.5f; // centre Cb/Cr
// BT.709 full-range YCbCr -> RGB (cbcr.x = Cb, cbcr.y = Cr).
float r = clamp(y + 1.5748f * cbcr.y, 0.0f, 1.0f);
float g = clamp(y - 0.1873f * cbcr.x - 0.4681f * cbcr.y, 0.0f, 1.0f);
float b = clamp(y + 1.8556f * cbcr.x, 0.0f, 1.0f);
// Clamped gamma-encoded RGB -> linear light for the CRT math.
return linearize_rgb((float3)(r, g, b));
}
// Gaussian kernel
// Generalised Gaussian falloff: exp2(scale * |pos|^SHAPE).
// `scale` is negative in all call sites, so weight decays with distance.
static float gauss1d(float pos, float scale)
{
    float shaped = pow(fabs(pos), SHAPE);
    return exp2(scale * shaped);
}
// Signed distance (in texels, per axis) from `pos` back to the nearest
// preceding texel centre; result lies in (-1, 0].
static float2 distance_to_texel(float2 pos, float2 src_size)
{
    float2 texel_space = pos * src_size - 0.5f;
    return -fract(texel_space);
}
// Horizontal reconstruction (3 / 5 / 7 tap)
// 3-tap horizontal reconstruction on the scanline `off_y` rows away.
// Taps at -1/0/+1 texels, Gaussian-weighted by distance to the texel grid.
static float3 horz3(
    __read_only image2d_t y_plane,
    __read_only image2d_t uv_plane,
    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
{
    float dst = distance_to_texel(pos, src_size).x;
    float3 acc = (float3)(0.0f);
    float wsum = 0.0f;
    for (int i = -1; i <= 1; i++) {
        float w = gauss1d(dst + (float)i, scale);
        acc += w * fetch_linear_nv12(y_plane, uv_plane, smp, pos,
                                     (float2)((float)i, off_y), src_size);
        wsum += w;
    }
    return acc / wsum;
}
// 5-tap horizontal reconstruction (taps -2..+2); see horz3 for the scheme.
static float3 horz5(
    __read_only image2d_t y_plane,
    __read_only image2d_t uv_plane,
    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
{
    float dst = distance_to_texel(pos, src_size).x;
    float3 acc = (float3)(0.0f);
    float wsum = 0.0f;
    for (int i = -2; i <= 2; i++) {
        float w = gauss1d(dst + (float)i, scale);
        acc += w * fetch_linear_nv12(y_plane, uv_plane, smp, pos,
                                     (float2)((float)i, off_y), src_size);
        wsum += w;
    }
    return acc / wsum;
}
// 7-tap horizontal reconstruction (taps -3..+3); see horz3 for the scheme.
static float3 horz7(
    __read_only image2d_t y_plane,
    __read_only image2d_t uv_plane,
    sampler_t smp,
    float2 pos, float off_y, float scale, float2 src_size)
{
    float dst = distance_to_texel(pos, src_size).x;
    float3 acc = (float3)(0.0f);
    float wsum = 0.0f;
    for (int i = -3; i <= 3; i++) {
        float w = gauss1d(dst + (float)i, scale);
        acc += w * fetch_linear_nv12(y_plane, uv_plane, smp, pos,
                                     (float2)((float)i, off_y), src_size);
        wsum += w;
    }
    return acc / wsum;
}
// Screen curvature
// Barrel-distort normalised coordinates to fake a curved CRT faceplate.
// Each axis is stretched by the square of the other axis' centred position.
static float2 bend_screen(float2 pos)
{
    float2 centred = pos * 2.0f - 1.0f;
    float2 warp = (float2)(1.0f + (centred.y * centred.y) * CURVATURE_X,
                           1.0f + (centred.x * centred.x) * CURVATURE_Y);
    centred *= warp;
    return centred * 0.5f + 0.5f;
}
// Scanline weights
// Vertical scanline weight for the row `off` lines away, sharpness HARD_SCAN.
static float scan_weight(float2 pos, float off, float2 src_size)
{
    float dy = distance_to_texel(pos, src_size).y;
    return gauss1d(dy + off, HARD_SCAN);
}
// Same as scan_weight but wider (HARD_BLOOM_SCAN) for the bloom pass.
static float bloom_scan_weight(float2 pos, float off, float2 src_size)
{
    float dy = distance_to_texel(pos, src_size).y;
    return gauss1d(dy + off, HARD_BLOOM_SCAN);
}
// Main CRT reconstruction
// Core scanline reconstruction: blend three horizontally-filtered rows
// (above / centre / below), weighted by their scanline profiles.
// The centre row gets the wider 5-tap kernel; -10.0f is the horizontal
// sharpness used by this pass.
static float3 tri(
    __read_only image2d_t y_plane,
    __read_only image2d_t uv_plane,
    sampler_t smp,
    float2 pos, float2 src_size)
{
    float3 above  = horz3(y_plane, uv_plane, smp, pos, -1.0f, -10.0f, src_size);
    float3 centre = horz5(y_plane, uv_plane, smp, pos,  0.0f, -10.0f, src_size);
    float3 below  = horz3(y_plane, uv_plane, smp, pos,  1.0f, -10.0f, src_size);
    return above  * scan_weight(pos, -1.0f, src_size)
         + centre * scan_weight(pos,  0.0f, src_size)
         + below  * scan_weight(pos,  1.0f, src_size);
}
// Bloom pass: a wider 5-row blend with softer horizontal kernels
// (outer rows 5-tap at -3.0, inner rows 7-tap at -1.5), weighted by
// the broader bloom scanline profile.
static float3 bloom(
    __read_only image2d_t y_plane,
    __read_only image2d_t uv_plane,
    sampler_t smp,
    float2 pos, float2 src_size)
{
    float3 r0 = horz5(y_plane, uv_plane, smp, pos, -2.0f, -3.0f, src_size);
    float3 r1 = horz7(y_plane, uv_plane, smp, pos, -1.0f, -1.5f, src_size);
    float3 r2 = horz7(y_plane, uv_plane, smp, pos,  0.0f, -1.5f, src_size);
    float3 r3 = horz7(y_plane, uv_plane, smp, pos,  1.0f, -1.5f, src_size);
    float3 r4 = horz5(y_plane, uv_plane, smp, pos,  2.0f, -3.0f, src_size);
    return r0 * bloom_scan_weight(pos, -2.0f, src_size)
         + r1 * bloom_scan_weight(pos, -1.0f, src_size)
         + r2 * bloom_scan_weight(pos,  0.0f, src_size)
         + r3 * bloom_scan_weight(pos,  1.0f, src_size)
         + r4 * bloom_scan_weight(pos,  2.0f, src_size);
}
// Shadow mask
// Per-pixel shadow-mask attenuation, selected at compile time by SHADOW_MASK.
// `px` is the output pixel position (integer coords + 0.5). Each channel is
// either MASK_DARK or MASK_LIGHT depending on the pixel's mask cell.
static float3 apply_mask(float2 px)
{
// Start fully dark; the branches below brighten one channel per stripe.
float3 m = (float3)(MASK_DARK);
#if SHADOW_MASK == 1
// Mask 1: RGB stripes with every other scanline pair darkened, the dark
// rows staggered by half a 6-pixel period between columns.
float line = MASK_LIGHT;
float odd = (fract(px.x / 6.0f) < 0.5f) ? 1.0f : 0.0f;
if (fract((px.y + odd) / 2.0f) < 0.5f) line = MASK_DARK;
float mx = fract(px.x / 3.0f);
if (mx < 1.0f / 3.0f) m.x = MASK_LIGHT;
else if (mx < 2.0f / 3.0f) m.y = MASK_LIGHT;
else m.z = MASK_LIGHT;
m *= line;
#elif SHADOW_MASK == 2
// Mask 2 (default): plain vertical RGB stripes, 3-pixel period.
float mx2 = fract(px.x / 3.0f);
if (mx2 < 1.0f / 3.0f) m.x = MASK_LIGHT;
else if (mx2 < 2.0f / 3.0f) m.y = MASK_LIGHT;
else m.z = MASK_LIGHT;
#elif SHADOW_MASK == 3
// Mask 3: stripes sheared by 3 pixels per row -> diagonal RGB pattern.
px.x += px.y * 3.0f;
float mx3 = fract(px.x / 6.0f);
if (mx3 < 1.0f / 3.0f) m.x = MASK_LIGHT;
else if (mx3 < 2.0f / 3.0f) m.y = MASK_LIGHT;
else m.z = MASK_LIGHT;
#elif SHADOW_MASK == 4
// Mask 4: like mask 3 but on 2-pixel-tall cells (y halved before shear).
px = floor(px * (float2)(1.0f, 0.5f));
px.x += px.y * 3.0f;
float mx4 = fract(px.x / 6.0f);
if (mx4 < 1.0f / 3.0f) m.x = MASK_LIGHT;
else if (mx4 < 2.0f / 3.0f) m.y = MASK_LIGHT;
else m.z = MASK_LIGHT;
#endif
// SHADOW_MASK == 0 is handled at the call site (mask skipped entirely).
return m;
}
// Entry point
// NV12: FFmpeg program_opencl passes planes in order, so for 2-plane NV12:
// arg 0 = src_y (input Y, R channel, full resolution)
// arg 1 = src_uv (input UV, RG channels, half resolution)
// arg 2 = dst_y (output Y, full resolution)
// arg 3 = dst_uv (output UV, half resolution)
// Global work size is set to dst_y dimensions (full resolution).
__kernel void crt_lottes(
    __read_only image2d_t src_y,
    __read_only image2d_t src_uv,
    __write_only image2d_t dst_y,
    __write_only image2d_t dst_uv)
{
    // Sampler: normalised coords, bilinear filter, clamp-to-edge.
    const sampler_t smp =
        CLK_NORMALIZED_COORDS_TRUE |
        CLK_ADDRESS_CLAMP_TO_EDGE |
        CLK_FILTER_LINEAR;

    int2 pix = (int2)(get_global_id(0), get_global_id(1));
    const int out_w = get_image_width(dst_y);
    const int out_h = get_image_height(dst_y);
    // Work size may be rounded up past the image; discard excess threads.
    if (pix.x >= out_w || pix.y >= out_h)
        return;

    const float2 out_dim = (float2)(out_w, out_h);
    const float2 in_dim = (float2)(get_image_width(src_y),
                                   get_image_height(src_y));

    // Normalised output position, then CRT barrel curvature.
    const float2 out_pos = ((float2)(pix.x, pix.y) + 0.5f) / out_dim;
    float2 bent = bend_screen(out_pos);

    // Scanline reconstruction plus scaled bloom, in linear light.
    float3 color = tri(src_y, src_uv, smp, bent, in_dim);
    color += bloom(src_y, src_uv, smp, bent, in_dim) * BLOOM_AMOUNT;

#if SHADOW_MASK != 0
    // Mask cells are addressed in output-pixel space (centre at +0.5).
    color *= apply_mask(floor(out_pos * out_dim) + 0.5f);
#endif

    // Pixels whose curved sample position fell off the source are black.
    int on_screen = (bent.x >= 0.0f && bent.x <= 1.0f &&
                     bent.y >= 0.0f && bent.y <= 1.0f) ? 1 : 0;
    float3 rgb = on_screen ? delinearize_rgb(color) : (float3)(0.0f);

    // BT.709 full-range RGB -> YCbCr (inverse of the fetch-side matrix).
    float luma = 0.2126f * rgb.x + 0.7152f * rgb.y + 0.0722f * rgb.z;
    float cb = -0.1146f * rgb.x - 0.3854f * rgb.y + 0.5000f * rgb.z + 0.5f;
    float cr = 0.5000f * rgb.x - 0.4542f * rgb.y - 0.0458f * rgb.z + 0.5f;

    // Luma at full resolution.
    write_imagef(dst_y, pix, (float4)(luma, 0.0f, 0.0f, 1.0f));
    // Chroma at half resolution: only the even/even thread of each 2x2
    // luma block writes, so no two threads hit the same UV texel.
    if ((pix.x & 1) == 0 && (pix.y & 1) == 0) {
        write_imagef(dst_uv, pix >> 1, (float4)(cb, cr, 0.0f, 1.0f));
    }
}