diff --git a/CHANGELOG.md b/CHANGELOG.md index de66da4..d7cd7e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08) + +### Added +- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints. +- Support for Python 3.13 (#512) + +### Fixed +- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch. +- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537) +- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381) + ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31) ### Fixed diff --git a/build-requirements.txt b/build-requirements.txt index 9313e96..ce978b2 100644 --- a/build-requirements.txt +++ b/build-requirements.txt @@ -1,6 +1,9 @@ # in the meantime we migrate to pyproject.toml # this represent the minimum requirement to build (for the optional speedup) -mypy==1.6.1; python_version >= '3.8' +--find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295 +mypy==1.11.2; python_version >= '3.8' and python_version < '3.13' +mypy==1.12.0; python_version >= '3.13' mypy==1.4.1; python_version < '3.8' -build==0.10.0 -wheel==0.41.2 +build>=0.10.0,<2 +wheel==0.42.0 +setuptools>=68,<76 diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 0ba08e3..e3f2283 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -159,6 +159,8 @@ def from_bytes( results: CharsetMatches = CharsetMatches() + early_stop_results: 
CharsetMatches = CharsetMatches() + sig_encoding, sig_payload = identify_sig_or_bom(sequences) if sig_encoding is not None: @@ -221,16 +223,20 @@ def from_bytes( try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[: int(50e4)] - if strip_sig_or_bom is False - else sequences[len(sig_payload) : int(50e4)], + ( + sequences[: int(50e4)] + if strip_sig_or_bom is False + else sequences[len(sig_payload) : int(50e4)] + ), encoding=encoding_iana, ) else: decoded_payload = str( - sequences - if strip_sig_or_bom is False - else sequences[len(sig_payload) :], + ( + sequences + if strip_sig_or_bom is False + else sequences[len(sig_payload) :] + ), encoding=encoding_iana, ) except (UnicodeDecodeError, LookupError) as e: @@ -367,7 +373,13 @@ def from_bytes( and not lazy_str_hard_failure ): fallback_entry = CharsetMatch( - sequences, encoding_iana, threshold, False, [], decoded_payload + sequences, + encoding_iana, + threshold, + False, + [], + decoded_payload, + preemptive_declaration=specified_encoding, ) if encoding_iana == specified_encoding: fallback_specified = fallback_entry @@ -421,28 +433,58 @@ def from_bytes( ), ) - results.append( - CharsetMatch( - sequences, - encoding_iana, - mean_mess_ratio, - bom_or_sig_available, - cd_ratios_merged, - decoded_payload, - ) + current_match = CharsetMatch( + sequences, + encoding_iana, + mean_mess_ratio, + bom_or_sig_available, + cd_ratios_merged, + ( + decoded_payload + if ( + is_too_large_sequence is False + or encoding_iana in [specified_encoding, "ascii", "utf_8"] + ) + else None + ), + preemptive_declaration=specified_encoding, ) + results.append(current_match) + if ( encoding_iana in [specified_encoding, "ascii", "utf_8"] and mean_mess_ratio < 0.1 ): + # If md says nothing to worry about, then... stop immediately! 
+ if mean_mess_ratio == 0.0: + logger.debug( + "Encoding detection: %s is most likely the one.", + current_match.encoding, + ) + if explain: + logger.removeHandler(explain_handler) + logger.setLevel(previous_logger_level) + return CharsetMatches([current_match]) + + early_stop_results.append(current_match) + + if ( + len(early_stop_results) + and (specified_encoding is None or specified_encoding in tested) + and "ascii" in tested + and "utf_8" in tested + ): + probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment] logger.debug( - "Encoding detection: %s is most likely the one.", encoding_iana + "Encoding detection: %s is most likely the one.", + probable_result.encoding, ) if explain: logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) - return CharsetMatches([results[encoding_iana]]) + + return CharsetMatches([probable_result]) if encoding_iana == sig_encoding: logger.debug( diff --git a/charset_normalizer/cli/__main__.py b/charset_normalizer/cli/__main__.py index f4bcbaa..e7edd0f 100644 --- a/charset_normalizer/cli/__main__.py +++ b/charset_normalizer/cli/__main__.py @@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: dest="force", help="Replace file without asking if you are sure, use this flag with caution.", ) + parser.add_argument( + "-i", + "--no-preemptive", + action="store_true", + default=False, + dest="no_preemptive", + help="Disable looking at a charset declaration to hint the detector.", + ) parser.add_argument( "-t", "--threshold", @@ -133,21 +141,35 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: args = parser.parse_args(argv) if args.replace is True and args.normalize is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --replace in addition of --normalize only.", file=sys.stderr) return 1 if args.force is True and args.replace is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --force in 
addition of --replace only.", file=sys.stderr) return 1 if args.threshold < 0.0 or args.threshold > 1.0: + if args.files: + for my_file in args.files: + my_file.close() print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr) return 1 x_ = [] for my_file in args.files: - matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose) + matches = from_fp( + my_file, + threshold=args.threshold, + explain=args.verbose, + preemptive_behaviour=args.no_preemptive is False, + ) best_guess = matches.best() @@ -155,9 +177,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: print( 'Unable to identify originating encoding for "{}". {}'.format( my_file.name, - "Maybe try increasing maximum amount of chaos." - if args.threshold < 1.0 - else "", + ( + "Maybe try increasing maximum amount of chaos." + if args.threshold < 1.0 + else "" + ), ), file=sys.stderr, ) @@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: try: x_[0].unicode_path = join(dir_path, ".".join(o_)) - with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: - fp.write(str(best_guess)) + with open(x_[0].unicode_path, "wb") as fp: + fp.write(best_guess.output()) except IOError as e: print(str(e), file=sys.stderr) if my_file.closed is False: diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index 8634904..f8f2a81 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -544,6 +544,8 @@ "|", '"', "-", + "(", + ")", } diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py index 43aad21..3f6d490 100644 --- a/charset_normalizer/legacy.py +++ b/charset_normalizer/legacy.py @@ -1,13 +1,24 @@ -from typing import Any, Dict, Optional, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Optional from warnings import warn from .api import from_bytes from .constant import CHARDET_CORRESPONDENCE +# TODO: remove this check when dropping Python 3.7 support +if 
TYPE_CHECKING: + from typing_extensions import TypedDict + + class ResultDict(TypedDict): + encoding: Optional[str] + language: str + confidence: Optional[float] + def detect( byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any -) -> Dict[str, Optional[Union[str, float]]]: +) -> ResultDict: """ chardet legacy method Detect the encoding of the given byte string. It should be mostly backward-compatible. diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 77897aa..d834db0 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -236,7 +236,7 @@ def reset(self) -> None: # pragma: no cover @property def ratio(self) -> float: - if self._character_count <= 24: + if self._character_count <= 13: return 0.0 ratio_of_suspicious_range_usage: float = ( @@ -260,6 +260,7 @@ def __init__(self) -> None: self._buffer: str = "" self._buffer_accent_count: int = 0 + self._buffer_glyph_count: int = 0 def eligible(self, character: str) -> bool: return True @@ -279,6 +280,14 @@ def feed(self, character: str) -> None: and is_thai(character) is False ): self._foreign_long_watch = True + if ( + is_cjk(character) + or is_hangul(character) + or is_katakana(character) + or is_hiragana(character) + or is_thai(character) + ): + self._buffer_glyph_count += 1 return if not self._buffer: return @@ -291,17 +300,20 @@ def feed(self, character: str) -> None: self._character_count += buffer_length if buffer_length >= 4: - if self._buffer_accent_count / buffer_length > 0.34: + if self._buffer_accent_count / buffer_length >= 0.5: self._is_current_word_bad = True # Word/Buffer ending with an upper case accentuated letter are so rare, # that we will consider them all as suspicious. Same weight as foreign_long suspicious. 
- if ( + elif ( is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper() and all(_.isupper() for _ in self._buffer) is False ): self._foreign_long_count += 1 self._is_current_word_bad = True + elif self._buffer_glyph_count == 1: + self._is_current_word_bad = True + self._foreign_long_count += 1 if buffer_length >= 24 and self._foreign_long_watch: camel_case_dst = [ i @@ -325,6 +337,7 @@ def feed(self, character: str) -> None: self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 + self._buffer_glyph_count = 0 elif ( character not in {"<", ">", "-", "=", "~", "|", "_"} and character.isdigit() is False diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index a760b9c..6f6b86b 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -1,9 +1,10 @@ from encodings.aliases import aliases from hashlib import sha256 from json import dumps +from re import sub from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from .constant import TOO_BIG_SEQUENCE +from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -16,6 +17,7 @@ def __init__( has_sig_or_bom: bool, languages: "CoherenceMatches", decoded_payload: Optional[str] = None, + preemptive_declaration: Optional[str] = None, ): self._payload: bytes = payload @@ -33,13 +35,13 @@ def __init__( self._string: Optional[str] = decoded_payload + self._preemptive_declaration: Optional[str] = preemptive_declaration + def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): - raise TypeError( - "__eq__ cannot be invoked on {} and {}.".format( - str(other.__class__), str(self.__class__) - ) - ) + if isinstance(other, str): + return iana_name(other) == self.encoding + return False return self.encoding == other.encoding and self.fingerprint == other.fingerprint def __lt__(self, other: object) -> bool: @@ -210,7 +212,24 @@ def output(self, 
encoding: str = "utf_8") -> bytes: """ if self._output_encoding is None or self._output_encoding != encoding: self._output_encoding = encoding - self._output_payload = str(self).encode(encoding, "replace") + decoded_string = str(self) + if ( + self._preemptive_declaration is not None + and self._preemptive_declaration.lower() + not in ["utf-8", "utf8", "utf_8"] + ): + patched_header = sub( + RE_POSSIBLE_ENCODING_INDICATION, + lambda m: m.string[m.span()[0] : m.span()[1]].replace( + m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type] + ), + decoded_string[:8192], + 1, + ) + + decoded_string = patched_header + decoded_string[8192:] + + self._output_payload = decoded_string.encode(encoding, "replace") return self._output_payload # type: ignore @@ -266,7 +285,7 @@ def append(self, item: CharsetMatch) -> None: ) ) # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage) - if len(item.raw) <= TOO_BIG_SEQUENCE: + if len(item.raw) < TOO_BIG_SEQUENCE: for match in self._results: if match.fingerprint == item.fingerprint and match.chaos == item.chaos: match.add_submatch(item) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 5a4da4f..699990e 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.3.2" +__version__ = "3.4.0" VERSION = __version__.split(".") diff --git a/debian/changelog b/debian/changelog index bb3e29d..15f5714 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,39 @@ +python-charset-normalizer (3.4.0-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * Refresh patches + + -- Michael R. Crusoe Wed, 23 Oct 2024 18:48:05 +0200 + +python-charset-normalizer (3.3.2-4) unstable; urgency=medium + + * Team upload. 
+ * Add patch to make the build reproducible + + -- Mattia Rizzolo Thu, 19 Sep 2024 17:35:46 +0200 + +python-charset-normalizer (3.3.2-3) unstable; urgency=medium + + * Team upload. + * Break dependency loop (us→sphinx→requests→us). + Thank you Samuel Thibault + & Adrian Bunk for the fixes. + Closes: #1078012 + * Standards-Version: 4.7.0 (routine-update) + * Remove trailing whitespace in debian/copyright (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, Repository-Browse, Security-Contact. + + -- Michael R. Crusoe Sat, 14 Sep 2024 10:54:30 +0200 + +python-charset-normalizer (3.3.2-2) unstable; urgency=medium + + * Team upload. + * Build binary versions using mypyc to increase performance; as + upstream does. + + -- Michael R. Crusoe Sat, 03 Aug 2024 16:31:55 +0200 + python-charset-normalizer (3.3.2-1) unstable; urgency=medium * Team upload diff --git a/debian/control b/debian/control index 9c7d0b7..c7c7ad2 100644 --- a/debian/control +++ b/debian/control @@ -8,13 +8,17 @@ Build-Depends: dh-sequence-python3, pybuild-plugin-pyproject, python3-all, + python3-all-dev, + python3-mypy, python3-pytest , python3-pytest-cov , python3-setuptools, + sphinx-common, +Build-Depends-Indep: python3-sphinx, - furo , + furo, Rules-Requires-Root: no -Standards-Version: 4.6.2 +Standards-Version: 4.7.0 Homepage: https://github.com/ousret/charset_normalizer Vcs-Git: https://salsa.debian.org/python-team/packages/python-charset-normalizer.git Vcs-Browser: https://salsa.debian.org/python-team/packages/python-charset-normalizer @@ -38,10 +42,11 @@ Description: charset, encoding and language detection for Python (Documentation) This package contains the documentation. 
Package: python3-charset-normalizer -Architecture: all +Architecture: any Depends: ${misc:Depends}, ${python3:Depends}, + ${shlibs:Depends} Suggests: python-charset-normalizer-doc, Description: charset, encoding and language detection (Python 3) diff --git a/debian/copyright b/debian/copyright index 8e439c6..383abc2 100644 --- a/debian/copyright +++ b/debian/copyright @@ -22,7 +22,7 @@ License: CC-BY-SA-3.0 SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION - PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. + PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. . License . diff --git a/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch new file mode 100644 index 0000000..db75909 --- /dev/null +++ b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch @@ -0,0 +1,23 @@ +From: Mattia Rizzolo +Date: Thu, 19 Sep 2024 17:31:55 +0200 +Subject: Make the build reproducible: assure stable order, + use a tuple instead of a set + +Signed-off-by: Mattia Rizzolo +--- + charset_normalizer/md.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py +index d834db0..546ed59 100644 +--- a/charset_normalizer/md.py ++++ b/charset_normalizer/md.py +@@ -339,7 +339,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): + self._buffer_accent_count = 0 + self._buffer_glyph_count = 0 + elif ( +- character not in {"<", ">", "-", "=", "~", "|", "_"} ++ character not in ("<", ">", "-", "=", "~", "|", "_") + and character.isdigit() is False + and is_symbol(character) + ): diff --git a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch 
b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch index e95aa5c..444e1ae 100644 --- a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch +++ b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch @@ -11,7 +11,7 @@ Foreward: Not-Needed 1 file changed, 5 deletions(-) diff --git a/docs/index.rst b/docs/index.rst -index 05d5f98..0762a03 100755 +index 19ca08a..392a132 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,11 +11,6 @@ All IANA character set names for which the Python core library provides codecs a diff --git a/debian/patches/mypyc-debug-symbols b/debian/patches/mypyc-debug-symbols new file mode 100644 index 0000000..b453b40 --- /dev/null +++ b/debian/patches/mypyc-debug-symbols @@ -0,0 +1,22 @@ +From: "Michael R. Crusoe" +Date: Thu, 19 Sep 2024 17:31:08 +0200 +Subject: Produce debugging symbols + +Forwarded: not-needed +--- + setup.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/setup.py b/setup.py +index 0ccc7e9..1fb2179 100644 +--- a/setup.py ++++ b/setup.py +@@ -27,7 +27,7 @@ if USE_MYPYC: + + MYPYC_MODULES = mypycify([ + "charset_normalizer/md.py", +- ], debug_level="0") ++ ], debug_level="1") + else: + MYPYC_MODULES = None + diff --git a/debian/patches/series b/debian/patches/series index 2ecd2c9..7349ea8 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1 +1,3 @@ +mypyc-debug-symbols docs-Prevent-possible-privacy-issue-to-a-linked-image.patch +Make-the-build-reproducible-assure-stable-order-use-a-tup.patch diff --git a/debian/rules b/debian/rules index ba918f9..136c1e5 100755 --- a/debian/rules +++ b/debian/rules @@ -11,13 +11,17 @@ SPHINXOPTS := -N -D html_last_updated_fmt="$(BUILD_DATE)" export PYBUILD_NAME=charset_normalizer export PYBUILD_BEFORE_TEST=cp -r {dir}/data {build_dir} export PYBUILD_AFTER_TEST=rm -rf {build_dir}/.coverage {build_dir}/data/ +export CHARSET_NORMALIZER_USE_MYPYC=1 %: dh $@ --with=sphinxdoc 
--buildsystem=pybuild -override_dh_sphinxdoc: -ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS))) +override_dh_sphinxdoc-arch: + +override_dh_auto_build-indep: + +override_dh_auto_test-indep: + +execute_before_dh_sphinxdoc-indep: PYTHONPATH=. python3 -m sphinx -b html $(SPHINXOPTS) docs/ $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html rm $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html/.nojekyll - dh_sphinxdoc -endif diff --git a/debian/upstream/metadata b/debian/upstream/metadata index 4b950f4..943919e 100644 --- a/debian/upstream/metadata +++ b/debian/upstream/metadata @@ -1,7 +1,8 @@ --- -Bug-Database: https://github.com/Ousret/charset_normalizer/issues -Bug-Submit: https://github.com/Ousret/charset_normalizer/issues/new +Bug-Database: https://github.com/jawah/charset_normalizer/issues +Bug-Submit: https://github.com/jawah/charset_normalizer/issues/new Documentation: https://charset-normalizer.readthedocs.io/en/latest FAQ: https://github.com/Ousret/charset_normalizer/blob/master/README.md -Repository: https://github.com/ousret/charset_normalizer.git -Repository-Browse: https://github.com/ousret/charset_normalizer +Repository: https://github.com/jawah/charset_normalizer.git +Repository-Browse: https://github.com/jawah/charset_normalizer +Security-Contact: https://github.com/ousret/charset_normalizer/tree/HEAD/SECURITY.md diff --git a/debian/watch b/debian/watch index 7f93aba..c17b32f 100644 --- a/debian/watch +++ b/debian/watch @@ -4,5 +4,5 @@ opts="mode=git, \ compression=gz, \ uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\.?\d*)$/$1~$2/, \ dversionmangle=s/\+(dfsg|ds)(\.?\d+)?$//" \ -https://github.com/Ousret/charset_normalizer.git \ +https://github.com/jawah/charset_normalizer.git \ refs/tags/@ANY_VERSION@ diff --git a/dev-requirements.txt b/dev-requirements.txt index c25bd74..12c2ebf 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,14 
+1,15 @@ +--find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295 flake8==5.0.4 chardet==5.1.0 isort==5.11.4 codecov==2.1.13 pytest-cov==4.1.0 -build==0.10.0 -wheel==0.41.2 - +build>=0.10.0,<2 +wheel==0.42.0 black==23.3.0 -mypy==1.6.1; python_version >= '3.8' +mypy==1.11.2; python_version >= '3.8' and python_version < '3.13' +mypy==1.12.0; python_version >= '3.13' mypy==1.4.1; python_version < '3.8' Flask==2.2.3 -pytest==7.4.3 +pytest>=7.4.4,<=8.3.3 requests==2.31.0 diff --git a/docs/community/featured.rst b/docs/community/featured.rst index 8d1814c..a704a0b 100644 --- a/docs/community/featured.rst +++ b/docs/community/featured.rst @@ -9,10 +9,7 @@ your level or opinions. Niquests -------- -Started as a simple though.. - -.. image:: https://i.imgflip.com/7xet0f.jpg - :width: 200 +Started as a simple thought.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not! Most of our programs that interact with HTTP server are built with ``requests`` and we aren't likely to switch without a substantial effort.
diff --git a/setup.cfg b/setup.cfg index 72fbc05..3eb71fa 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,7 +6,7 @@ long_description_content_type = text/markdown keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect url = https://github.com/Ousret/charset_normalizer license = MIT -author_email = ahmed.tahri@cloudnursery.dev +author_email = tahri.ahmed@proton.me author = Ahmed TAHRI project_urls = Bug Reports = https://github.com/Ousret/charset_normalizer/issues @@ -25,6 +25,7 @@ classifiers = Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 Programming Language :: Python :: Implementation :: PyPy Topic :: Text Processing :: Linguistic Topic :: Utilities diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py index 3180a50..e5d774d 100644 --- a/tests/test_base_detection.py +++ b/tests/test_base_detection.py @@ -123,3 +123,14 @@ def test_doc_example_short_cp1251(): ).best() assert best_guess.encoding == "cp1251" + + +def test_direct_cmp_charset_match(): + best_guess = from_bytes( + "😀 Hello World! How affairs are going? 
😀".encode("utf_8") + ).best() + + assert best_guess == "utf_8" + assert best_guess == "utf-8" + assert best_guess != 8 + assert best_guess != None diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py index f324664..6caa1c4 100644 --- a/tests/test_edge_case.py +++ b/tests/test_edge_case.py @@ -10,3 +10,33 @@ def test_unicode_edge_case(): assert best_guess is not None, "Payload should have given something, detection failure" assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected" + + +def test_issue_gh520(): + """Verify that minorities does not strip basic latin characters!""" + payload = b"/includes/webform.compon\xd2\xaants.inc/" + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "Basic Latin" in best_guess.alphabets + + +def test_issue_gh509(): + """Two common ASCII punctuations should render as-is.""" + payload = b");" + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "ascii" == best_guess.encoding + + +def test_issue_gh498(): + """This case was mistaken for utf-16-le, this should never happen again.""" + payload = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx' + + best_guess = from_bytes(payload).best() + + assert best_guess is not None, "Payload should have given something, detection failure" + assert "Cyrillic" in best_guess.alphabets diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py index 042415b..411bf45 100644 --- a/tests/test_preemptive_detection.py +++ b/tests/test_preemptive_detection.py @@ -1,6 +1,7 @@ import pytest from charset_normalizer.utils import any_specified_encoding +from charset_normalizer import CharsetMatch @pytest.mark.parametrize( @@ -24,3 +25,42 @@ def test_detect_most_common_body_encoding(payload, expected_encoding): ) assert specified_encoding == expected_encoding, "Unable to 
determine properly encoding from given body" + + +@pytest.mark.parametrize( + "payload, expected_outcome", + [ + (b'', b''), + (b'', b''), + (b'', b''), + (b'# coding: utf-8', b'# coding: utf-8'), + (b'', b''), + (b'', b''), + (b'', b''), + (b'', b''), + (b'', b''), + ] +) +def test_preemptive_mark_replacement(payload, expected_outcome): + """ + When generating (to Unicode converted) bytes, we want to change any potential declarative charset + to utf-8. This tests that. + """ + specified_encoding = any_specified_encoding( + payload + ) + + detected_encoding = specified_encoding if specified_encoding is not None else "utf-8" + + m = CharsetMatch( + payload, + detected_encoding, + 0., + False, + [], + preemptive_declaration=specified_encoding, + ) + + transformed_output = m.output() + + assert transformed_output == expected_outcome