From 2f0dda747f0b5be771303f26b016b0dd428dcd19 Mon Sep 17 00:00:00 2001 From: Anthony Ryan Date: Sun, 21 Jun 2026 20:16:48 -0400 Subject: [PATCH] Update JSON::safeEncode to sanitize, rather than attempting to convert/repair UTF::utf8ize previously attempted to "repair" as many mangled strings as possible, but I've observed a number of examples over the years where it choked and broke ruTorrent. I would argue that taking arbitrary strings of bytes, and attempting to convert them with perfect accuracy into valid UTF8 (when those strings come to us mangled, corrupt and always with unknown encodings) isn't possible. Here's a handful of examples of byte sequences that choke the UTF8 repair code: ```php require('php/utility/json.php'); var_dump(JSON::safeEncode("\xC0\x80")); var_dump(JSON::safeEncode("\xED\xA0\x80")); var_dump(JSON::safeEncode("\xED\xBF\xBF")); var_dump(JSON::safeEncode("\xF5\x80\x80\x80")); var_dump(JSON::safeEncode("\xF7\xBF\xBF\xBF")); ``` The problem is that I think this list is nowhere near comprehensive, and even if we fix all of these, there will be many more corrupt sequences in the wild we can't predict. Instead of trying to repair all this corrupt data, let's just use the U+FFFD replacement character. Users who add files with corrupt strings will see the replacement character (adding some awareness), but that becomes a content problem and not a "ruTorrent is broken" problem. --- php/utility/json.php | 5 ++-- php/utility/utf.php | 55 -------------------------------------------- 2 files changed, 2 insertions(+), 58 deletions(-) diff --git a/php/utility/json.php b/php/utility/json.php index acb6bf2d8..5afad655f 100644 --- a/php/utility/json.php +++ b/php/utility/json.php @@ -6,7 +6,6 @@ class JSON { public static function safeEncode($value) { - $encoded = json_encode($value); - return(!function_exists('json_last_error') || json_last_error()==JSON_ERROR_NONE ? 
$encoded : json_encode(UTF::utf8ize($value))); + return json_encode($value, JSON_THROW_ON_ERROR|JSON_INVALID_UTF8_SUBSTITUTE); } -} \ No newline at end of file +} diff --git a/php/utility/utf.php b/php/utility/utf.php index 7091687ce..fe9aee9a3 100644 --- a/php/utility/utf.php +++ b/php/utility/utf.php @@ -107,61 +107,6 @@ public static function win2utf($str) return($outstr); } - private static function mix2utf($str, $inv = '_') - { - $len = strlen($str); - for($i = 0; $i < $len; $i++) - { - $c = ord($str[$i]); - if($c > 128) - { - $bytes = 0; - if(($c > 247)) $str[$i] = $inv; - elseif($c > 239) $bytes = 4; - elseif($c > 223) $bytes = 3; - elseif($c > 191) $bytes = 2; - else $str[$i] = $inv; - if($bytes) - { - if(($i + $bytes) > $len) $str[$i] = $inv; - else - { - $start = $i; - $cnt = $bytes; - while($bytes > 1) - { - $i++; - $b = ord($str[$i]); - if($b < 128 || $b > 191) - { - $str[$start] = $inv; - $i = $start; - break; - } - $bytes--; - } - } - } - } - } - return($str); - } - - public static function utf8ize($mixed) - { - if(is_array($mixed) || is_object($mixed)) - { - foreach($mixed as $key => $value) - { - $mixed[$key] = self::utf8ize($value); - } - } - else - if(is_string($mixed)) - $mixed = self::mix2utf($mixed); - return($mixed); - } - private static function maybe_mb_convert_to_utf8($string) { return function_exists('mb_convert_encoding') ? mb_convert_encoding($string, "UTF-8", "auto") : $string; }