deepin-community · Cherrling · Jan 18, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,17 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
+
+### Added
+- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
+- Support for Python 3.13 (#512)
+
+### Fixed
+- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
+- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
+- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
+
 ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
 
 ### Fixed

diff --git a/build-requirements.txt b/build-requirements.txt
@@ -1,6 +1,9 @@
 # in the meantime we migrate to pyproject.toml
 # this represent the minimum requirement to build (for the optional speedup)
-mypy==1.6.1; python_version >= '3.8'
+--find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295
+mypy==1.11.2; python_version >= '3.8' and python_version < '3.13'
+mypy==1.12.0; python_version >= '3.13'
 mypy==1.4.1; python_version < '3.8'
-build==0.10.0
-wheel==0.41.2
+build>=0.10.0,<2
+wheel==0.42.0
+setuptools>=68,<76
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
@@ -159,6 +159,8 @@ def from_bytes(
 
     results: CharsetMatches = CharsetMatches()
 
+    early_stop_results: CharsetMatches = CharsetMatches()
+
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
 
     if sig_encoding is not None:
@@ -221,16 +223,20 @@ def from_bytes(
         try:
             if is_too_large_sequence and is_multi_byte_decoder is False:
                 str(
-                    sequences[: int(50e4)]
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) : int(50e4)],
+                    (
+                        sequences[: int(50e4)]
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
                     encoding=encoding_iana,
                 )
             else:
                 decoded_payload = str(
-                    sequences
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) :],
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
                     encoding=encoding_iana,
                 )
         except (UnicodeDecodeError, LookupError) as e:
@@ -367,7 +373,13 @@ def from_bytes(
                 and not lazy_str_hard_failure
             ):
                 fallback_entry = CharsetMatch(
-                    sequences, encoding_iana, threshold, False, [], decoded_payload
+                    sequences,
+                    encoding_iana,
+                    threshold,
+                    False,
+                    [],
+                    decoded_payload,
+                    preemptive_declaration=specified_encoding,
                 )
                 if encoding_iana == specified_encoding:
                     fallback_specified = fallback_entry
@@ -421,28 +433,58 @@ def from_bytes(
                 ),
             )
 
-        results.append(
-            CharsetMatch(
-                sequences,
-                encoding_iana,
-                mean_mess_ratio,
-                bom_or_sig_available,
-                cd_ratios_merged,
-                decoded_payload,
-            )
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
         )
 
+        results.append(current_match)
+
         if (
             encoding_iana in [specified_encoding, "ascii", "utf_8"]
             and mean_mess_ratio < 0.1
         ):
+            # If md says nothing to worry about, then... stop immediately!
+            if mean_mess_ratio == 0.0:
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    current_match.encoding,
+                )
+                if explain:
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+                return CharsetMatches([current_match])
+
+            early_stop_results.append(current_match)
+
+        if (
+            len(early_stop_results)
+            and (specified_encoding is None or specified_encoding in tested)
+            and "ascii" in tested
+            and "utf_8" in tested
+        ):
+            probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
             logger.debug(
-                "Encoding detection: %s is most likely the one.", encoding_iana
+                "Encoding detection: %s is most likely the one.",
+                probable_result.encoding,
             )
             if explain:
                 logger.removeHandler(explain_handler)
                 logger.setLevel(previous_logger_level)
-            return CharsetMatches([results[encoding_iana]])
+
+            return CharsetMatches([probable_result])
 
         if encoding_iana == sig_encoding:
             logger.debug(

diff --git a/charset_normalizer/cli/__main__.py b/charset_normalizer/cli/__main__.py
@@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
         dest="force",
         help="Replace file without asking if you are sure, use this flag with caution.",
     )
+    parser.add_argument(
+        "-i",
+        "--no-preemptive",
+        action="store_true",
+        default=False,
+        dest="no_preemptive",
+        help="Disable looking at a charset declaration to hint the detector.",
+    )
     parser.add_argument(
         "-t",
         "--threshold",
@@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
     args = parser.parse_args(argv)
 
     if args.replace is True and args.normalize is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --replace in addition of --normalize only.", file=sys.stderr)
         return 1
 
     if args.force is True and args.replace is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --force in addition of --replace only.", file=sys.stderr)
         return 1
 
     if args.threshold < 0.0 or args.threshold > 1.0:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
         return 1
 
     x_ = []
 
     for my_file in args.files:
-        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
+        matches = from_fp(
+            my_file,
+            threshold=args.threshold,
+            explain=args.verbose,
+            preemptive_behaviour=args.no_preemptive is False,
+        )
 
         best_guess = matches.best()
 
         if best_guess is None:
             print(
                 'Unable to identify originating encoding for "{}". {}'.format(
                     my_file.name,
-                    "Maybe try increasing maximum amount of chaos."
-                    if args.threshold < 1.0
-                    else "",
+                    (
+                        "Maybe try increasing maximum amount of chaos."
+                        if args.threshold < 1.0
+                        else ""
+                    ),
                 ),
                 file=sys.stderr,
             )
@@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                 try:
                     x_[0].unicode_path = join(dir_path, ".".join(o_))
 
-                    with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
-                        fp.write(str(best_guess))
+                    with open(x_[0].unicode_path, "wb") as fp:
+                        fp.write(best_guess.output())
                 except IOError as e:
                     print(str(e), file=sys.stderr)
                     if my_file.closed is False:

diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py
@@ -544,6 +544,8 @@
     "|",
     '"',
     "-",
+    "(",
+    ")",
 }
 
 

diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py
@@ -1,13 +1,24 @@
-from typing import Any, Dict, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Optional
 from warnings import warn
 
 from .api import from_bytes
 from .constant import CHARDET_CORRESPONDENCE
 
+# TODO: remove this check when dropping Python 3.7 support
+if TYPE_CHECKING:
+    from typing_extensions import TypedDict
+
+    class ResultDict(TypedDict):
+        encoding: Optional[str]
+        language: str
+        confidence: Optional[float]
+
 
 def detect(
     byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
-) -> Dict[str, Optional[Union[str, float]]]:
+) -> ResultDict:
     """
     chardet legacy method
     Detect the encoding of the given byte string. It should be mostly backward-compatible.

diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
@@ -236,7 +236,7 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count <= 24:
+        if self._character_count <= 13:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
@@ -260,6 +260,7 @@ def __init__(self) -> None:
 
         self._buffer: str = ""
         self._buffer_accent_count: int = 0
+        self._buffer_glyph_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
@@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
                 and is_thai(character) is False
             ):
                 self._foreign_long_watch = True
+            if (
+                is_cjk(character)
+                or is_hangul(character)
+                or is_katakana(character)
+                or is_hiragana(character)
+                or is_thai(character)
+            ):
+                self._buffer_glyph_count += 1
             return
         if not self._buffer:
             return
@@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
             self._character_count += buffer_length
 
             if buffer_length >= 4:
-                if self._buffer_accent_count / buffer_length > 0.34:
+                if self._buffer_accent_count / buffer_length >= 0.5:
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if (
+                elif (
                     is_accentuated(self._buffer[-1])
                     and self._buffer[-1].isupper()
                     and all(_.isupper() for _ in self._buffer) is False
                 ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
+                elif self._buffer_glyph_count == 1:
+                    self._is_current_word_bad = True
+                    self._foreign_long_count += 1
             if buffer_length >= 24 and self._foreign_long_watch:
                 camel_case_dst = [
                     i
@@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
             self._foreign_long_watch = False
             self._buffer = ""
             self._buffer_accent_count = 0
+            self._buffer_glyph_count = 0
         elif (
             character not in {"<", ">", "-", "=", "~", "|", "_"}
             and character.isdigit() is False
-Original file line number
+Diff line change
@@ Expand Up / @@ -544,6 +544,8 @@ @@
         "|",
         '"',
         "-",
+        "(",
+        ")",
     }
@@ Expand Down @@