From 712d720fa58ced8a0922aa107967f64d453ef857 Mon Sep 17 00:00:00 2001 From: "deepin-community-bot[bot]" <156989552+deepin-community-bot[bot]@users.noreply.github.com> Date: Tue, 30 Jun 2026 06:03:49 +0000 Subject: [PATCH] feat: update python-charset-normalizer to 3.4.2-1 --- .codecov.yml | 8 - .coveragerc | 28 ++- .pre-commit-config.yaml | 30 +++ .readthedocs.yaml | 4 +- CHANGELOG.md | 47 +++- CONTRIBUTING.md | 14 +- LICENSE | 4 +- MANIFEST.in | 2 +- README.md | 21 +- UPGRADE.md | 31 --- bin/bc.py | 73 +++--- bin/coverage.py | 80 +++--- bin/integration.py | 54 ---- bin/performance.py | 20 +- bin/run_autofix.sh | 12 - bin/run_checks.sh | 15 -- bin/serve.py | 37 --- build-requirements.txt | 6 - charset_normalizer/version.py | 6 - debian/.gitignore | 1 + debian/changelog | 50 ++++ debian/control | 11 +- debian/copyright | 2 +- ...ucible-assure-stable-order-use-a-tup.patch | 23 ++ ...ible-privacy-issue-to-a-linked-image.patch | 2 +- debian/patches/mypyc-debug-symbols | 22 ++ debian/patches/series | 2 + debian/rules | 14 +- debian/tests/upstream-tests | 2 +- debian/upstream/metadata | 9 +- debian/watch | 2 +- dev-requirements.txt | 16 +- docs/community/featured.rst | 5 +- docs/index.rst | 2 +- noxfile.py | 234 ++++++++++++++++++ pyproject.toml | 85 +++++++ setup.cfg | 70 ------ setup.py | 28 +-- .../charset_normalizer}/__init__.py | 4 +- .../charset_normalizer}/__main__.py | 2 + .../charset_normalizer}/api.py | 136 ++++++---- .../charset_normalizer}/cd.py | 68 ++--- .../charset_normalizer}/cli/__init__.py | 2 + .../charset_normalizer}/cli/__main__.py | 111 ++++++++- .../charset_normalizer}/constant.py | 46 +++- .../charset_normalizer}/legacy.py | 18 +- .../charset_normalizer}/md.py | 98 +++++--- .../charset_normalizer}/models.py | 106 ++++---- .../charset_normalizer}/py.typed | 0 .../charset_normalizer}/utils.py | 81 +++--- src/charset_normalizer/version.py | 8 + tests/__init__.py | 1 - tests/test_base_detection.py | 115 ++++++--- tests/test_cli.py | 122 +++------ tests/test_coherence_detection.py | 76 +++++- tests/test_detect_legacy.py | 76 ++---- tests/test_edge_case.py | 57 ++++- tests/test_full_detection.py | 65 ++--- tests/test_isbinary.py | 27 +- tests/test_large_payload.py | 36 ++- tests/test_logging.py | 25 +- tests/test_mess_detection.py | 51 ++-- tests/test_preemptive_detection.py | 86 ++++++- tests/test_utils.py | 6 +- 64 files changed, 1587 insertions(+), 908 deletions(-) delete mode 100644 .codecov.yml create mode 100644 .pre-commit-config.yaml delete mode 100644 UPGRADE.md delete mode 100644 bin/integration.py delete mode 100755 bin/run_autofix.sh delete mode 100755 bin/run_checks.sh delete mode 100644 bin/serve.py delete mode 100644 build-requirements.txt delete mode 100644 charset_normalizer/version.py create mode 100644 debian/.gitignore create mode 100644 debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch create mode 100644 debian/patches/mypyc-debug-symbols create mode 100644 noxfile.py create mode 100644 pyproject.toml delete mode 100644 setup.cfg rename {charset_normalizer => src/charset_normalizer}/__init__.py (97%) rename {charset_normalizer => src/charset_normalizer}/__main__.py (66%) rename {charset_normalizer => src/charset_normalizer}/api.py (84%) rename {charset_normalizer => src/charset_normalizer}/cd.py (86%) rename {charset_normalizer => src/charset_normalizer}/cli/__init__.py (73%) rename {charset_normalizer => src/charset_normalizer}/cli/__main__.py (71%) rename {charset_normalizer => src/charset_normalizer}/constant.py (93%) rename {charset_normalizer => src/charset_normalizer}/legacy.py (81%) rename {charset_normalizer => src/charset_normalizer}/md.py (87%) rename {charset_normalizer => src/charset_normalizer}/models.py (77%) rename {charset_normalizer => src/charset_normalizer}/py.typed (100%) rename {charset_normalizer => src/charset_normalizer}/utils.py (84%) create mode 100644 src/charset_normalizer/version.py diff --git a/.codecov.yml b/.codecov.yml deleted file mode 100644 index f307895..0000000 --- a/.codecov.yml +++ /dev/null @@ -1,8 +0,0 @@ -coverage: - status: - project: - default: - target: 88% - threshold: null - patch: false - changes: false diff --git a/.coveragerc b/.coveragerc index 723bfd0..8b59131 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,28 @@ [run] -source=charset_normalizer +source = + charset_normalizer +# Needed for Python 3.11 and lower +disable_warnings = no-sysmon + +[paths] +source = + src/charset_normalizer + */charset_normalizer + *\charset_normalizer + +[report] +omit = + src/charset_normalizer/__main__.py + +exclude_lines = + except ModuleNotFoundError: + except ImportError: + pass + import + raise NotImplementedError + .* # Platform-specific.* + .*:.* # Python \d.* + .* # Abstract + .* # Defensive: + if (?:typing.)?TYPE_CHECKING: + ^\s*?\.\.\.\s*$ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..93b904c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +exclude: 'docs/|data/|tests/' + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + args: [ --py37-plus, --keep-runtime-typing ] + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.9.1 + hooks: + # Run the linter. + - id: ruff + args: [ --fix ] + # Run the formatter. + - id: ruff-format + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.14.1 + hooks: + - id: mypy + args: [ --check-untyped-defs ] + exclude: 'tests/|noxfile.py|setup.py|bin/' diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 87d8ed3..b783a32 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,9 +1,9 @@ version: 2 build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: - python: "3.9" + python: "3.10" # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/CHANGELOG.md b/CHANGELOG.md index de66da4..9e78f38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,47 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02) + +### Fixed +- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591) +- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587) + +### Changed +- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8 + +## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24) + +### Changed +- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend. +- Enforce annotation delayed loading for a simpler and consistent types in the project. +- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8 + +### Added +- pre-commit configuration. +- noxfile. + +### Removed +- `build-requirements.txt` as per using `pyproject.toml` native build configuration. +- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile). +- `setup.cfg` in favor of `pyproject.toml` metadata configuration. +- Unused `utils.range_scan` function. + +### Fixed +- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572) +- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+ + +## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08) + +### Added +- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints. +- Support for Python 3.13 (#512) + +### Fixed +- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch. +- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537) +- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381) + ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31) ### Fixed @@ -170,7 +211,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12) ### Fixed -- ASCII miss-detection on rare cases (PR #170) +- ASCII miss-detection on rare cases (PR #170) ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30) @@ -202,7 +243,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124) - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122) - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129) -- Code style as refactored by Sourcery-AI (PR #131) +- Code style as refactored by Sourcery-AI (PR #131) - Minor adjustment on the MD around european words (PR #133) - Remove and replace SRTs from assets / tests (PR #139) - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135) @@ -275,7 +316,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15) ### Fixed -- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59) +- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59) ### Changed - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index abee674..40b19f0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ # Contribution Guidelines -If you’re reading this, you’re probably interested in contributing to Charset Normalizer. -Thank you very much! Open source projects live-and-die based on the support they receive from others, +If you’re reading this, you’re probably interested in contributing to Charset Normalizer. +Thank you very much! Open source projects live-and-die based on the support they receive from others, and the fact that you’re even considering contributing to this project is very generous of you. ## Questions -The GitHub issue tracker is for *bug reports* and *feature requests*. +The GitHub issue tracker is for *bug reports* and *feature requests*. Questions are allowed only when no answer are provided in docs. ## Good Bug Reports @@ -67,6 +67,10 @@ the backward-compatibility. ## How to run tests locally? It is essential that you run, prior to any submissions the mandatory checks. -Run the script `./bin/run_checks.sh` to verify that your modification are not breaking anything. -Also, make sure to run the `./bin/run_autofix.sh` to comply with the style format and import sorting. +```shell +pip install nox +nox -s test +nox -s lint +nox -s coverage +``` diff --git a/LICENSE b/LICENSE index ad82355..9725772 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 TAHRI Ahmed R. +Copyright (c) 2025 TAHRI Ahmed R. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index 3792f5b..8da2cd0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include LICENSE README.md CHANGELOG.md charset_normalizer/py.typed dev-requirements.txt +include LICENSE README.md CHANGELOG.md src/charset_normalizer/py.typed dev-requirements.txt SECURITY.md noxfile.py recursive-include data *.md recursive-include data *.txt recursive-include docs * diff --git a/README.md b/README.md index 13e6e14..ee5b2e7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@

Featured Packages
- Static Badge + Static Badge Static Badge @@ -55,8 +55,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector* Reading Normalized TextCat Reading Text

-*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
-Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html) +*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
## ⚡ Performance @@ -64,21 +63,23 @@ This package offer better performance than its counterpart Chardet. Here are som | Package | Accuracy | Mean per file (ms) | File per sec (est) | |-----------------------------------------------|:--------:|:------------------:|:------------------:| -| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec | +| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec | | charset-normalizer | **98 %** | **10 ms** | 100 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | |-----------------------------------------------|:---------------:|:---------------:|:---------------:| -| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms | +| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms | | charset-normalizer | 100 ms | 50 ms | 5 ms | +_updated as of december 2024 using CPython 3.12_ + Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload. > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows. > And yes, these results might change at any time. The dataset can be updated to include more files. > The actual delays heavily depends on your CPU capabilities. The factors should remain the same. > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability -> (eg. Supported Encoding) Challenge-them if you want. +> (e.g. Supported Encoding) Challenge-them if you want. ## ✨ Installation @@ -195,11 +196,11 @@ reliable alternative using a completely different method. Also! I never back dow I **don't care** about the **originating charset** encoding, because **two different tables** can produce **two identical rendered string.** -What I want is to get readable text, the best I can. +What I want is to get readable text, the best I can. In a way, **I'm brute forcing text decoding.** How cool is that ? 😎 -Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode. +Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode. ## 🍰 How @@ -211,7 +212,7 @@ Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is **Wait a minute**, what is noise/mess and coherence according to **YOU ?** *Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then -**I established** some ground rules about **what is obvious** when **it seems like** a mess. +**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text). I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to improve or rewrite it. @@ -255,3 +256,5 @@ from the experts who know it best, while seamlessly integrating with existing tools. [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme + +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297) diff --git a/UPGRADE.md b/UPGRADE.md deleted file mode 100644 index 4b8f7bb..0000000 --- a/UPGRADE.md +++ /dev/null @@ -1,31 +0,0 @@ -Guide to upgrade your code from v1 to v2 ----------------------------------------- - - * If you are using the legacy `detect` function, that is it. You have nothing to do. - -## Detection - -### Before - -```python -from charset_normalizer import CharsetNormalizerMatches - -results = CharsetNormalizerMatches.from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') -) -``` - -### After - -```python -from charset_normalizer import from_bytes - -results = from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') -) -``` - -Methods that once were staticmethods of the class `CharsetNormalizerMatches` are now basic functions. -`from_fp`, `from_bytes`, `from_fp` and `` are concerned. - -Staticmethods scheduled to be removed in version 3.0 diff --git a/bin/bc.py b/bin/bc.py index df28943..ba4ee41 100644 --- a/bin/bc.py +++ b/bin/bc.py @@ -1,13 +1,13 @@ -#!/bin/python +from __future__ import annotations + +import argparse from glob import glob from os.path import isdir from sys import argv -from typing import List -import argparse -from charset_normalizer import detect as tbt_detect from chardet import detect as chardet_detect +from charset_normalizer import detect as tbt_detect from charset_normalizer.utils import iana_name @@ -16,28 +16,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str): str_a = content.decode(cp_a) str_b = content.decode(cp_b) except UnicodeDecodeError: - return 0. + return 0.0 character_count = len(str_a) - diff_character_count = sum( - chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b) - ) + diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)) - return 1. - (diff_character_count / character_count) + return 1.0 - (diff_character_count / character_count) -def cli_bc(arguments: List[str]): +def cli_bc(arguments: list[str]): parser = argparse.ArgumentParser( description="BC script checker for Charset-Normalizer with Chardet" ) - parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage', - help="Define the minimum acceptable coverage to succeed") + parser.add_argument( + "-c", + "--coverage", + action="store", + default=85, + type=int, + dest="coverage", + help="Define the minimum acceptable coverage to succeed", + ) args = parser.parse_args(arguments) if not isdir("./char-dataset"): - print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory") + print( + "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory" + ) exit(1) success_count = 0 @@ -50,44 +57,52 @@ def cli_bc(arguments: List[str]): content = fp.read() chardet_result = chardet_detect(content) - chardet_encoding = chardet_result['encoding'] + chardet_encoding = chardet_result["encoding"] charset_normalizer_result = tbt_detect(content) - charset_normalizer_encoding = charset_normalizer_result['encoding'] + charset_normalizer_encoding = charset_normalizer_result["encoding"] if [chardet_encoding, charset_normalizer_encoding].count(None) == 1: - print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding)) + print( + f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')" + ) continue if charset_normalizer_encoding == chardet_encoding: success_count += 1 - print("✅✅ '{}' (BC)".format(tbt_path)) + print(f"✅✅ '{tbt_path}' (BC)") continue - if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)): + if (chardet_encoding is None and charset_normalizer_encoding is None) or ( + iana_name(chardet_encoding, False) + == iana_name(charset_normalizer_encoding, False) + ): success_count += 1 - print("✅✅ '{}' (BC)".format(tbt_path)) + print(f"✅✅ '{tbt_path}' (BC)") continue - calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding) + calc_eq = calc_equivalence( + content, chardet_encoding, charset_normalizer_encoding + ) if calc_eq >= 0.98: success_count += 1 - print("️✅ ️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3))) + print( + f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but " + f"eq {chardet_encoding} WITH {round(calc_eq * 100.0, 3)} %)" + ) continue - print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding)) + print( + f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')" + ) - success_ratio = round(success_count / total_count, 2) * 100. + success_ratio = round(success_count / total_count, 2) * 100.0 - print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count)) + print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)") return 0 if success_ratio >= args.coverage else 1 if __name__ == "__main__": - exit( - cli_bc( - argv[1:] - ) - ) + exit(cli_bc(argv[1:])) diff --git a/bin/coverage.py b/bin/coverage.py index e5f07bd..80e1822 100644 --- a/bin/coverage.py +++ b/bin/coverage.py @@ -1,43 +1,54 @@ -#!/bin/python +from __future__ import annotations + +import argparse from glob import glob +from os import sep from os.path import isdir from sys import argv -from typing import List -import argparse -from charset_normalizer import from_path, __version__ +from charset_normalizer import __version__, from_path from charset_normalizer.utils import iana_name -from os import sep - def calc_equivalence(content: bytes, cp_a: str, cp_b: str): str_a = content.decode(cp_a) str_b = content.decode(cp_b) character_count = len(str_a) - diff_character_count = sum( - chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b) - ) - + diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)) - return 1. - (diff_character_count / character_count) + return 1.0 - (diff_character_count / character_count) -def cli_coverage(arguments: List[str]): +def cli_coverage(arguments: list[str]): parser = argparse.ArgumentParser( description="Embedded detection success coverage script checker for Charset-Normalizer" ) - parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive', - help='Enable the preemptive scan behaviour during coverage check') - parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage', - help="Define the minimum acceptable coverage to succeed") + parser.add_argument( + "-p", + "--with-preemptive", + action="store_true", + default=False, + dest="preemptive", + help="Enable the preemptive scan behaviour during coverage check", + ) + parser.add_argument( + "-c", + "--coverage", + action="store", + default=90, + type=int, + dest="coverage", + help="Define the minimum acceptable coverage to succeed", + ) args = parser.parse_args(arguments) if not isdir("./char-dataset"): - print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory") + print( + "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory" + ) exit(1) print(f"> using charset-normalizer {__version__}") @@ -46,28 +57,27 @@ def cli_coverage(arguments: List[str]): total_count = 0 for tbt_path in sorted(glob("./char-dataset/**/*.*")): - expected_encoding = tbt_path.split(sep)[-2] total_count += 1 - results = from_path( - tbt_path, - preemptive_behaviour=args.preemptive - ) + results = from_path(tbt_path, preemptive_behaviour=args.preemptive) if expected_encoding == "None" and len(results) == 0: - print("✅✅ '{}'".format(tbt_path)) + print(f"✅✅ '{tbt_path}'") success_count += 1 continue if len(results) == 0: - print("⚡⚡ '{}' (nothing)".format(tbt_path)) + print(f"⚡⚡ '{tbt_path}' (nothing)") continue result = results.best() - if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset: - print("✅✅ '{}'".format(tbt_path)) + if ( + expected_encoding in result.could_be_from_charset + or iana_name(expected_encoding) in result.could_be_from_charset + ): + print(f"✅✅ '{tbt_path}'") success_count += 1 continue @@ -75,21 +85,21 @@ def cli_coverage(arguments: List[str]): if calc_eq >= 0.98: success_count += 1 - print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3))) + print( + f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100.0, 3)} %)" + ) continue - print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding)) + print(f"⚡ '{tbt_path}' (got '{result.encoding}')") - success_ratio = round(success_count / total_count, 2) * 100. + success_ratio = round(success_count / total_count, 2) * 100.0 - print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count)) + print( + f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)" + ) return 0 if success_ratio >= args.coverage else 1 if __name__ == "__main__": - exit( - cli_coverage( - argv[1:] - ) - ) + exit(cli_coverage(argv[1:])) diff --git a/bin/integration.py b/bin/integration.py deleted file mode 100644 index b186313..0000000 --- a/bin/integration.py +++ /dev/null @@ -1,54 +0,0 @@ -from requests import get, __version__ -from typing import List -from charset_normalizer import detect, __version__ as __version_cn__ - -if __name__ == "__main__": - - print(f"requests {__version__}") - print(f"charset_normalizer {__version_cn__}") - - files: List[str] = get("http://127.0.0.1:8080/").json() - - print("## Testing with actual files") - - for file in files: - r = get( - "http://127.0.0.1:8080/" + file - ) - - if r.ok is False: - print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}") - exit(1) - - expected_encoding = detect(r.content)["encoding"] - - if expected_encoding != r.apparent_encoding: - print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'") - exit(1) - - print(f"✅✅ '{file}' OK") - - print("## Testing with edge cases") - - # Should NOT crash - get("http://127.0.0.1:8080/edge/empty/json").json() - - print("✅✅ Empty JSON OK") - - if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8": - print("Empty payload SHOULD not return apparent_encoding != UTF-8") - exit(1) - - print("✅✅ Empty Plain Text OK") - - r = get("http://127.0.0.1:8080/edge/gb18030/json") - - if r.apparent_encoding != "GB18030": - print("JSON Basic Detection FAILURE (/edge/gb18030/json)") - exit(1) - - r.json() - - print("✅✅ GB18030 JSON Encoded OK") - - print("Integration tests passed!") diff --git a/bin/performance.py b/bin/performance.py index ff715fd..3a55c18 100644 --- a/bin/performance.py +++ b/bin/performance.py @@ -1,15 +1,16 @@ -#!/bin/python -from glob import glob -from time import perf_counter_ns +from __future__ import annotations + import argparse -from sys import argv +from glob import glob +from math import ceil from os.path import isdir +from statistics import mean, stdev +from sys import argv +from time import perf_counter_ns -from charset_normalizer import detect from chardet import detect as chardet_detect -from statistics import mean, stdev -from math import ceil +from charset_normalizer import detect def calc_percentile(data, percentile): @@ -66,7 +67,8 @@ def performance_compare(arguments): charset_normalizer_time = charset_normalizer_time or 0.000005 cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100 print( - f"{idx+1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %" + f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} " + f"CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %" ) # Print the top 10 rows with the slowest execution time @@ -78,7 +80,7 @@ def performance_compare(arguments): ) for idx, time in sorted_results[:10]: tbt_path = file_list[idx] - print(f"{idx+1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}") + print(f"{idx + 1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}") # Print charset normalizer statistics min_time = min(charset_normalizer_results) diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh deleted file mode 100755 index f8a6381..0000000 --- a/bin/run_autofix.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -e - -export PREFIX="" -if [ -d 'venv' ] ; then - export PREFIX="venv/bin/" -fi - -set -x - -${PREFIX}pip install -r ./dev-requirements.txt -${PREFIX}black --target-version=py37 charset_normalizer -${PREFIX}isort charset_normalizer diff --git a/bin/run_checks.sh b/bin/run_checks.sh deleted file mode 100755 index 951611f..0000000 --- a/bin/run_checks.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -e - -export PREFIX="" -if [ -d 'venv' ] ; then - export PREFIX="venv/bin/" -fi - -set -x - -${PREFIX}pip install -r ./dev-requirements.txt -${PREFIX}pytest -${PREFIX}black --check --diff --target-version=py37 charset_normalizer -${PREFIX}flake8 charset_normalizer -${PREFIX}mypy charset_normalizer -${PREFIX}isort --check --diff charset_normalizer diff --git a/bin/serve.py b/bin/serve.py deleted file mode 100644 index 0b055ef..0000000 --- a/bin/serve.py +++ /dev/null @@ -1,37 +0,0 @@ -from flask import Flask, jsonify, send_from_directory -from glob import glob - -app = Flask(__name__) - - -@app.route('/raw/') -def read_file(path): - return send_from_directory('../char-dataset', path, as_attachment=True), 200, {"Content-Type": "text/plain"} - - -@app.route("/") -def read_targets(): - return jsonify( - [ - el.replace("./char-dataset", "/raw").replace("\\", "/") for el in sorted(glob("./char-dataset/**/*")) - ] - ) - - -@app.route("/edge/empty/plain") -def read_empty_response_plain(): - return b"", 200, {"Content-Type": "text/plain"} - - -@app.route("/edge/empty/json") -def read_empty_response_json(): - return b"{}", 200, {"Content-Type": "application/json"} - - -@app.route("/edge/gb18030/json") -def read_gb18030_response_json(): - return '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode("gb18030"), 200, {"Content-Type": "application/json"} - - -if __name__ == "__main__": - app.run(host="127.0.0.1", port=8080) diff --git a/build-requirements.txt b/build-requirements.txt deleted file mode 100644 index 9313e96..0000000 --- a/build-requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -# in the meantime we migrate to pyproject.toml -# this represent the minimum requirement to build (for the optional speedup) -mypy==1.6.1; python_version >= '3.8' -mypy==1.4.1; python_version < '3.8' -build==0.10.0 -wheel==0.41.2 diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py deleted file mode 100644 index 5a4da4f..0000000 --- a/charset_normalizer/version.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -Expose version -""" - -__version__ = "3.3.2" -VERSION = __version__.split(".") diff --git a/debian/.gitignore b/debian/.gitignore new file mode 100644 index 0000000..2c8afeb --- /dev/null +++ b/debian/.gitignore @@ -0,0 +1 @@ +/files diff --git a/debian/changelog b/debian/changelog index bb3e29d..029ed53 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,53 @@ +python-charset-normalizer (3.4.2-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + + -- Colin Watson Mon, 05 May 2025 11:40:18 +0100 + +python-charset-normalizer (3.4.1-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + + -- Colin Watson Tue, 18 Feb 2025 13:02:04 +0000 + +python-charset-normalizer (3.4.0-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * Refresh patches + + -- Michael R. Crusoe Wed, 23 Oct 2024 18:48:05 +0200 + +python-charset-normalizer (3.3.2-4) unstable; urgency=medium + + * Team upload. + * Add patch to make the build reproducible + + -- Mattia Rizzolo Thu, 19 Sep 2024 17:35:46 +0200 + +python-charset-normalizer (3.3.2-3) unstable; urgency=medium + + * Team upload. + * Break dependency loop (us→sphinx→requests→us). + Thank you Samuel Thibault + & Adrian Bunk for the fixes. + Closes: #1078012 + * Standards-Version: 4.7.0 (routine-update) + * Remove trailing whitespace in debian/copyright (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, Repository-Browse, Security-Contact. + + -- Michael R. Crusoe Sat, 14 Sep 2024 10:54:30 +0200 + +python-charset-normalizer (3.3.2-2) unstable; urgency=medium + + * Team upload. + * Build binary versions using mypyc to increase performance; as + upstream does. + + -- Michael R. Crusoe Sat, 03 Aug 2024 16:31:55 +0200 + python-charset-normalizer (3.3.2-1) unstable; urgency=medium * Team upload diff --git a/debian/control b/debian/control index 9c7d0b7..c7c7ad2 100644 --- a/debian/control +++ b/debian/control @@ -8,13 +8,17 @@ Build-Depends: dh-sequence-python3, pybuild-plugin-pyproject, python3-all, + python3-all-dev, + python3-mypy, python3-pytest , python3-pytest-cov , python3-setuptools, + sphinx-common, +Build-Depends-Indep: python3-sphinx, - furo , + furo, Rules-Requires-Root: no -Standards-Version: 4.6.2 +Standards-Version: 4.7.0 Homepage: https://github.com/ousret/charset_normalizer Vcs-Git: https://salsa.debian.org/python-team/packages/python-charset-normalizer.git Vcs-Browser: https://salsa.debian.org/python-team/packages/python-charset-normalizer @@ -38,10 +42,11 @@ Description: charset, encoding and language detection for Python (Documentation) This package contains the documentation. Package: python3-charset-normalizer -Architecture: all +Architecture: any Depends: ${misc:Depends}, ${python3:Depends}, + ${shlibs:Depends} Suggests: python-charset-normalizer-doc, Description: charset, encoding and language detection (Python 3) diff --git a/debian/copyright b/debian/copyright index 8e439c6..383abc2 100644 --- a/debian/copyright +++ b/debian/copyright @@ -22,7 +22,7 @@ License: CC-BY-SA-3.0 SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION - PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. + PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. . License . diff --git a/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch new file mode 100644 index 0000000..7461c23 --- /dev/null +++ b/debian/patches/Make-the-build-reproducible-assure-stable-order-use-a-tup.patch @@ -0,0 +1,23 @@ +From: Mattia Rizzolo +Date: Thu, 19 Sep 2024 17:31:55 +0200 +Subject: Make the build reproducible: assure stable order, + use a tuple instead of a set + +Signed-off-by: Mattia Rizzolo +--- + src/charset_normalizer/md.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/charset_normalizer/md.py b/src/charset_normalizer/md.py +index 12ce024..9f002be 100644 +--- a/src/charset_normalizer/md.py ++++ b/src/charset_normalizer/md.py +@@ -341,7 +341,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): + self._buffer_accent_count = 0 + self._buffer_glyph_count = 0 + elif ( +- character not in {"<", ">", "-", "=", "~", "|", "_"} ++ character not in ("<", ">", "-", "=", "~", "|", "_") + and character.isdigit() is False + and is_symbol(character) + ): diff --git a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch index e95aa5c..fd50579 100644 --- a/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch +++ b/debian/patches/docs-Prevent-possible-privacy-issue-to-a-linked-image.patch @@ -11,7 +11,7 @@ Foreward: Not-Needed 1 file changed, 5 deletions(-) diff --git a/docs/index.rst b/docs/index.rst -index 05d5f98..0762a03 100755 +index da8a9b6..c0c418b 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,11 +11,6 @@ All IANA character set names for which the Python core library provides codecs a diff --git a/debian/patches/mypyc-debug-symbols b/debian/patches/mypyc-debug-symbols new file mode 100644 index 0000000..d4bc918 --- /dev/null +++ b/debian/patches/mypyc-debug-symbols @@ -0,0 +1,22 @@ +From: "Michael R. Crusoe" +Date: Thu, 19 Sep 2024 17:31:08 +0200 +Subject: Produce debugging symbols + +Forwarded: not-needed +--- + setup.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/setup.py b/setup.py +index da2a69f..d521f07 100644 +--- a/setup.py ++++ b/setup.py +@@ -21,7 +21,7 @@ if USE_MYPYC: + [ + "src/charset_normalizer/md.py", + ], +- debug_level="0", ++ debug_level="1", + opt_level="3", + ) + else: diff --git a/debian/patches/series b/debian/patches/series index 2ecd2c9..7349ea8 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1 +1,3 @@ +mypyc-debug-symbols docs-Prevent-possible-privacy-issue-to-a-linked-image.patch +Make-the-build-reproducible-assure-stable-order-use-a-tup.patch diff --git a/debian/rules b/debian/rules index ba918f9..7583222 100755 --- a/debian/rules +++ b/debian/rules @@ -11,13 +11,17 @@ SPHINXOPTS := -N -D html_last_updated_fmt="$(BUILD_DATE)" export PYBUILD_NAME=charset_normalizer export PYBUILD_BEFORE_TEST=cp -r {dir}/data {build_dir} export PYBUILD_AFTER_TEST=rm -rf {build_dir}/.coverage {build_dir}/data/ +export CHARSET_NORMALIZER_USE_MYPYC=1 %: dh $@ --with=sphinxdoc --buildsystem=pybuild -override_dh_sphinxdoc: -ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS))) - PYTHONPATH=. python3 -m sphinx -b html $(SPHINXOPTS) docs/ $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html +override_dh_sphinxdoc-arch: + +override_dh_auto_build-indep: + +override_dh_auto_test-indep: + +execute_before_dh_sphinxdoc-indep: + PYTHONPATH=src python3 -m sphinx -b html $(SPHINXOPTS) docs/ $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html rm $(CURDIR)/debian/python-charset-normalizer-doc/usr/share/doc/python-charset-normalizer-doc/html/.nojekyll - dh_sphinxdoc -endif diff --git a/debian/tests/upstream-tests b/debian/tests/upstream-tests index e0d3d08..7cb5a3d 100644 --- a/debian/tests/upstream-tests +++ b/debian/tests/upstream-tests @@ -2,7 +2,7 @@ set -e -u -cp -a charset_normalizer data setup.cfg tests "${AUTOPKGTEST_TMP}" +cp -a src data pyproject.toml tests "${AUTOPKGTEST_TMP}" for py3vers in $(py3versions -s); do echo diff --git a/debian/upstream/metadata b/debian/upstream/metadata index 4b950f4..943919e 100644 --- a/debian/upstream/metadata +++ b/debian/upstream/metadata @@ -1,7 +1,8 @@ --- -Bug-Database: https://github.com/Ousret/charset_normalizer/issues -Bug-Submit: https://github.com/Ousret/charset_normalizer/issues/new +Bug-Database: https://github.com/jawah/charset_normalizer/issues +Bug-Submit: https://github.com/jawah/charset_normalizer/issues/new Documentation: https://charset-normalizer.readthedocs.io/en/latest FAQ: https://github.com/Ousret/charset_normalizer/blob/master/README.md -Repository: https://github.com/ousret/charset_normalizer.git -Repository-Browse: https://github.com/ousret/charset_normalizer +Repository: https://github.com/jawah/charset_normalizer.git +Repository-Browse: https://github.com/jawah/charset_normalizer +Security-Contact: https://github.com/ousret/charset_normalizer/tree/HEAD/SECURITY.md diff --git a/debian/watch b/debian/watch index 7f93aba..c17b32f 100644 --- a/debian/watch +++ b/debian/watch @@ -4,5 +4,5 @@ opts="mode=git, \ compression=gz, \ uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\.?\d*)$/$1~$2/, \ dversionmangle=s/\+(dfsg|ds)(\.?\d+)?$//" \ -https://github.com/Ousret/charset_normalizer.git \ +https://github.com/jawah/charset_normalizer.git \ refs/tags/@ANY_VERSION@ diff --git a/dev-requirements.txt b/dev-requirements.txt index c25bd74..9c85e11 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,14 +1,2 @@ -flake8==5.0.4 -chardet==5.1.0 -isort==5.11.4 -codecov==2.1.13 -pytest-cov==4.1.0 -build==0.10.0 -wheel==0.41.2 - -black==23.3.0 -mypy==1.6.1; python_version >= '3.8' -mypy==1.4.1; python_version < '3.8' -Flask==2.2.3 -pytest==7.4.3 -requests==2.31.0 +coverage>=7.2.7,<7.9 +pytest>=7.4.4,<9 diff --git a/docs/community/featured.rst b/docs/community/featured.rst index 8d1814c..a704a0b 100644 --- a/docs/community/featured.rst +++ b/docs/community/featured.rst @@ -9,10 +9,7 @@ your level or opinions. Niquests -------- -Started as a simple though.. - -.. image:: https://i.imgflip.com/7xet0f.jpg - :width: 200 +Started as a simple though.. IE 11 has built-in HTTP/2 support while Requests 2.32 does not! Most of our programs that interact with HTTP server are built with ``requests`` and we aren't likely to switch without a substantial effort. diff --git a/docs/index.rst b/docs/index.rst index 19ca08a..da8a9b6 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,7 +20,7 @@ It aims to be as generic as possible. It is released under MIT license, see LICENSE for more details. Be aware that no warranty of any kind is provided with this package. -Copyright (C) 2023 Ahmed TAHRI +Copyright (C) 2025 Ahmed TAHRI Introduction ============ diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 0000000..bc60155 --- /dev/null +++ b/noxfile.py @@ -0,0 +1,234 @@ +from __future__ import annotations + +import os +import shutil + +import nox + + +def test_impl( + session: nox.Session, + use_mypyc: bool = False, +): + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install( + ".", + silent=False, + env={"CHARSET_NORMALIZER_USE_MYPYC": "1" if use_mypyc else "0"}, + ) + + # Show the pip version. + session.run("pip", "--version") + # Print the Python version and bytesize. + session.run("python", "--version") + # Show charset-normalizer cli info + session.run("normalizer", "--version") + + # Inspired from https://hynek.me/articles/ditch-codecov-python/ + # We use parallel mode and then combine in a later CI step + session.run( + "python", + "-m", + "coverage", + "run", + "--parallel-mode", + "-m", + "pytest", + "-v", + "-ra", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + "--tb=native", + "--durations=10", + "--strict-config", + "--strict-markers", + *(session.posargs or ("tests/",)), + env={ + "PYTHONWARNINGS": "always::DeprecationWarning", + "COVERAGE_CORE": "sysmon", + }, + ) + + +@nox.session( + python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14", "pypy"] +) +def test(session: nox.Session) -> None: + test_impl(session) + + +@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]) +def test_mypyc(session: nox.Session) -> None: + test_impl(session, True) + + +def git_clone(session: nox.Session, git_url: str) -> None: + """We either clone the target repository or if already exist + simply reset the state and pull. + """ + expected_directory = git_url.split("/")[-1] + + if expected_directory.endswith(".git"): + expected_directory = expected_directory[:-4] + + if not os.path.isdir(expected_directory): + session.run("git", "clone", "--depth", "1", git_url, external=True) + else: + session.run( + "git", "-C", expected_directory, "reset", "--hard", "HEAD", external=True + ) + session.run("git", "-C", expected_directory, "pull", external=True) + + +@nox.session() +def backward_compatibility(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install(".", silent=False) + session.install("chardet") + + session.run( + "python", + "bin/bc.py", + *(session.posargs or ("--coverage=85",)), + ) + + +@nox.session() +def coverage(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install(".", silent=False) + + # Show the pip version. + session.run("pip", "--version") + # Print the Python version and bytesize. + session.run("python", "--version") + # Show charset-normalizer cli info + session.run("normalizer", "--version") + + session.run( + "python", + "-m", + "coverage", + "run", + "--parallel-mode", + "bin/coverage.py", + *(session.posargs or ("--coverage=90", "--with-preemptive")), + ) + + +@nox.session() +def performance(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install("chardet") + session.install(".", silent=False, env={"CHARSET_NORMALIZER_USE_MYPYC": "1"}) + + session.run( + "python", + "bin/performance.py", + *(session.posargs or ()), + ) + + +@nox.session() +def downstream_niquests(session: nox.Session) -> None: + root = os.getcwd() + tmp_dir = session.create_tmp() + + session.cd(tmp_dir) + git_clone(session, "https://github.com/jawah/niquests") + session.chdir("niquests") + + session.run("git", "rev-parse", "HEAD", external=True) + session.install(".[socks]", silent=False) + session.install("-r", "requirements-dev.txt", silent=False) + + session.cd(root) + session.install(".", silent=False) + session.cd(f"{tmp_dir}/niquests") + + session.run( + "python", + "-c", + "import charset_normalizer; print(charset_normalizer.__version__)", + ) + session.run( + "python", + "-m", + "pytest", + "-v", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + *(session.posargs or ("tests/",)), + env={"NIQUESTS_STRICT_OCSP": "1"}, + ) + + +@nox.session() +def downstream_requests(session: nox.Session) -> None: + root = os.getcwd() + tmp_dir = session.create_tmp() + + session.cd(tmp_dir) + git_clone(session, "https://github.com/psf/requests") + session.chdir("requests") + + session.run("git", "rev-parse", "HEAD", external=True) + session.install(".[socks]", silent=False) + session.install("-r", "requirements-dev.txt", silent=False) + + session.cd(root) + session.install(".", silent=False) + session.cd(f"{tmp_dir}/requests") + + session.run( + "python", + "-c", + "import charset_normalizer; print(charset_normalizer.__version__)", + ) + session.run( + "python", + "-m", + "pytest", + "-v", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + *(session.posargs or ("tests/",)), + ) + + +@nox.session() +def format(session: nox.Session) -> None: + """Run code formatters.""" + lint(session) + + +@nox.session +def lint(session: nox.Session) -> None: + session.install("pre-commit") + session.run("pre-commit", "run", "--all-files") + + +@nox.session +def docs(session: nox.Session) -> None: + session.install("-r", "docs/requirements.txt") + session.install(".") + + session.chdir("docs") + if os.path.exists("_build"): + shutil.rmtree("_build") + session.run("sphinx-build", "-b", "html", "-W", ".", "_build/html") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..34245fa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,85 @@ +[build-system] +requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.15.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "charset-normalizer" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +license = {text = "MIT"} +keywords = ["encoding", "charset", "charset-detector", "detector", "normalization", "unicode", "chardet", "detect"] +authors = [ + {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"}, +] +maintainers = [ + {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"}, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Text Processing :: Linguistic", + "Topic :: Utilities", + "Typing :: Typed", +] +requires-python = ">=3.7" +dynamic = ["version", "readme"] + +[project.optional-dependencies] +unicode_backport = [] + +[tool.setuptools] +package-dir = {"" = "src"} +packages = ["charset_normalizer", "charset_normalizer.cli", ] + +[tool.setuptools.dynamic] +version = {attr = "charset_normalizer.__version__"} +readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"], content-type = "text/markdown"} + +[project.scripts] +normalizer = "charset_normalizer:cli.cli_detect" + +[project.urls] +"Changelog" = "https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md" +"Documentation" = "https://charset-normalizer.readthedocs.io/" +"Code" = "https://github.com/jawah/charset_normalizer" +"Issue tracker" = "https://github.com/jawah/charset_normalizer/issues" + +[tool.pytest.ini_options] +log_level = "DEBUG" +filterwarnings = [ + "error", +] + +[tool.isort] +profile = "black" +add_imports = "from __future__ import annotations" + +[tool.mypy] +check_untyped_defs = true +disallow_any_generics = true +disallow_incomplete_defs = true +disallow_subclassing_any = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true +no_implicit_optional = true +no_implicit_reexport = true +show_error_codes = true +strict_equality = true +warn_redundant_casts = true +warn_return_any = true +warn_unused_configs = true +warn_unused_ignores = false diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 72fbc05..0000000 --- a/setup.cfg +++ /dev/null @@ -1,70 +0,0 @@ -[metadata] -name = charset-normalizer -description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. -long_description = file: README.md, CHANGELOG.md, LICENSE -long_description_content_type = text/markdown -keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect -url = https://github.com/Ousret/charset_normalizer -license = MIT -author_email = ahmed.tahri@cloudnursery.dev -author = Ahmed TAHRI -project_urls = - Bug Reports = https://github.com/Ousret/charset_normalizer/issues - Documentation = https://charset-normalizer.readthedocs.io/en/latest -classifiers = - Development Status :: 5 - Production/Stable - License :: OSI Approved :: MIT License - Intended Audience :: Developers - Topic :: Software Development :: Libraries :: Python Modules - Operating System :: OS Independent - Programming Language :: Python - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Programming Language :: Python :: Implementation :: PyPy - Topic :: Text Processing :: Linguistic - Topic :: Utilities - Typing :: Typed - -[options.packages.find] -exclude = - tests - *.tests - *.tests.* - tests.* - docs* - data* - -[options.extras_require] -unicode_backport = - -[options.entry_points] -console_scripts = - normalizer = charset_normalizer.cli:cli_detect - -[options] -packages = find: -include_package_data = True -python_requires = >=3.7.0 - -[options.package_data] -charset_normalizer = py.typed - -[tool:pytest] -addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs - -[flake8] -ignore = W503, E203, B305 -max-line-length = 120 - -[mypy] -disallow_untyped_defs = True -ignore_missing_imports = True - -[tool:isort] -profile = black -combine_as_imports = True diff --git a/setup.py b/setup.py index 0ccc7e9..da2a69f 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- +from __future__ import annotations import os import sys -from re import search from setuptools import setup - -def get_version(): - with open('charset_normalizer/version.py') as version_file: - return search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", - version_file.read()).group('version') - - USE_MYPYC = False if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc": sys.argv.pop(1) USE_MYPYC = True -if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1": +elif os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1": USE_MYPYC = True if USE_MYPYC: from mypyc.build import mypycify - MYPYC_MODULES = mypycify([ - "charset_normalizer/md.py", - ], debug_level="0") + MYPYC_MODULES = mypycify( + [ + "src/charset_normalizer/md.py", + ], + debug_level="0", + opt_level="3", + ) else: MYPYC_MODULES = None -setup( - name="charset-normalizer", - version=get_version(), - ext_modules=MYPYC_MODULES -) +setup(name="charset-normalizer", ext_modules=MYPYC_MODULES) diff --git a/charset_normalizer/__init__.py b/src/charset_normalizer/__init__.py similarity index 97% rename from charset_normalizer/__init__.py rename to src/charset_normalizer/__init__.py index 55991fc..0d3a379 100644 --- a/charset_normalizer/__init__.py +++ b/src/charset_normalizer/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Charset-Normalizer ~~~~~~~~~~~~~~ @@ -19,6 +18,9 @@ :copyright: (c) 2021 by Ahmed TAHRI :license: MIT, see LICENSE for more details. """ + +from __future__ import annotations + import logging from .api import from_bytes, from_fp, from_path, is_binary diff --git a/charset_normalizer/__main__.py b/src/charset_normalizer/__main__.py similarity index 66% rename from charset_normalizer/__main__.py rename to src/charset_normalizer/__main__.py index beae2ef..e0e76f7 100644 --- a/charset_normalizer/__main__.py +++ b/src/charset_normalizer/__main__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .cli import cli_detect if __name__ == "__main__": diff --git a/charset_normalizer/api.py b/src/charset_normalizer/api.py similarity index 84% rename from charset_normalizer/api.py rename to src/charset_normalizer/api.py index 0ba08e3..2c8c061 100644 --- a/charset_normalizer/api.py +++ b/src/charset_normalizer/api.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import logging from os import PathLike -from typing import BinaryIO, List, Optional, Set, Union +from typing import BinaryIO from .cd import ( coherence_ratio, @@ -21,8 +23,6 @@ should_strip_sig_or_bom, ) -# Will most likely be controversial -# logging.addLevelName(TRACE, "TRACE") logger = logging.getLogger("charset_normalizer") explain_handler = logging.StreamHandler() explain_handler.setFormatter( @@ -31,12 +31,12 @@ def from_bytes( - sequences: Union[bytes, bytearray], + sequences: bytes | bytearray, steps: int = 5, chunk_size: int = 512, threshold: float = 0.2, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -62,7 +62,7 @@ def from_bytes( if not isinstance(sequences, (bytearray, bytes)): raise TypeError( - "Expected object of type bytes or bytearray, got: {0}".format( + "Expected object of type bytes or bytearray, got: {}".format( type(sequences) ) ) @@ -76,7 +76,7 @@ def from_bytes( if length == 0: logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.") - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level or logging.WARNING) return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")]) @@ -135,9 +135,9 @@ def from_bytes( ), ) - prioritized_encodings: List[str] = [] + prioritized_encodings: list[str] = [] - specified_encoding: Optional[str] = ( + specified_encoding: str | None = ( any_specified_encoding(sequences) if preemptive_behaviour else None ) @@ -149,16 +149,18 @@ def from_bytes( specified_encoding, ) - tested: Set[str] = set() - tested_but_hard_failure: List[str] = [] - tested_but_soft_failure: List[str] = [] + tested: set[str] = set() + tested_but_hard_failure: list[str] = [] + tested_but_soft_failure: list[str] = [] - fallback_ascii: Optional[CharsetMatch] = None - fallback_u8: Optional[CharsetMatch] = None - fallback_specified: Optional[CharsetMatch] = None + fallback_ascii: CharsetMatch | None = None + fallback_u8: CharsetMatch | None = None + fallback_specified: CharsetMatch | None = None results: CharsetMatches = CharsetMatches() + early_stop_results: CharsetMatches = CharsetMatches() + sig_encoding, sig_payload = identify_sig_or_bom(sequences) if sig_encoding is not None: @@ -187,7 +189,7 @@ def from_bytes( tested.add(encoding_iana) - decoded_payload: Optional[str] = None + decoded_payload: str | None = None bom_or_sig_available: bool = sig_encoding == encoding_iana strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom( encoding_iana @@ -221,16 +223,20 @@ def from_bytes( try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[: int(50e4)] - if strip_sig_or_bom is False - else sequences[len(sig_payload) : int(50e4)], + ( + sequences[: int(50e4)] + if strip_sig_or_bom is False + else sequences[len(sig_payload) : int(50e4)] + ), encoding=encoding_iana, ) else: decoded_payload = str( - sequences - if strip_sig_or_bom is False - else sequences[len(sig_payload) :], + ( + sequences + if strip_sig_or_bom is False + else sequences[len(sig_payload) :] + ), encoding=encoding_iana, ) except (UnicodeDecodeError, LookupError) as e: @@ -286,7 +292,7 @@ def from_bytes( early_stop_count: int = 0 lazy_str_hard_failure = False - md_chunks: List[str] = [] + md_chunks: list[str] = [] md_ratios = [] try: @@ -367,7 +373,13 @@ def from_bytes( and not lazy_str_hard_failure ): fallback_entry = CharsetMatch( - sequences, encoding_iana, threshold, False, [], decoded_payload + sequences, + encoding_iana, + threshold, + False, + [], + decoded_payload, + preemptive_declaration=specified_encoding, ) if encoding_iana == specified_encoding: fallback_specified = fallback_entry @@ -385,7 +397,7 @@ def from_bytes( ) if not is_multi_byte_decoder: - target_languages: List[str] = encoding_languages(encoding_iana) + target_languages: list[str] = encoding_languages(encoding_iana) else: target_languages = mb_encoding_languages(encoding_iana) @@ -421,28 +433,58 @@ def from_bytes( ), ) - results.append( - CharsetMatch( - sequences, - encoding_iana, - mean_mess_ratio, - bom_or_sig_available, - cd_ratios_merged, - decoded_payload, - ) + current_match = CharsetMatch( + sequences, + encoding_iana, + mean_mess_ratio, + bom_or_sig_available, + cd_ratios_merged, + ( + decoded_payload + if ( + is_too_large_sequence is False + or encoding_iana in [specified_encoding, "ascii", "utf_8"] + ) + else None + ), + preemptive_declaration=specified_encoding, ) + results.append(current_match) + if ( encoding_iana in [specified_encoding, "ascii", "utf_8"] and mean_mess_ratio < 0.1 ): + # If md says nothing to worry about, then... stop immediately! + if mean_mess_ratio == 0.0: + logger.debug( + "Encoding detection: %s is most likely the one.", + current_match.encoding, + ) + if explain: # Defensive: ensure exit path clean handler + logger.removeHandler(explain_handler) + logger.setLevel(previous_logger_level) + return CharsetMatches([current_match]) + + early_stop_results.append(current_match) + + if ( + len(early_stop_results) + and (specified_encoding is None or specified_encoding in tested) + and "ascii" in tested + and "utf_8" in tested + ): + probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment] logger.debug( - "Encoding detection: %s is most likely the one.", encoding_iana + "Encoding detection: %s is most likely the one.", + probable_result.encoding, ) - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) - return CharsetMatches([results[encoding_iana]]) + + return CharsetMatches([probable_result]) if encoding_iana == sig_encoding: logger.debug( @@ -450,7 +492,7 @@ def from_bytes( "the beginning of the sequence.", encoding_iana, ) - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) return CharsetMatches([results[encoding_iana]]) @@ -504,8 +546,8 @@ def from_fp( steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -530,12 +572,12 @@ def from_fp( def from_path( - path: Union[str, bytes, PathLike], # type: ignore[type-arg] + path: str | bytes | PathLike, # type: ignore[type-arg] steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -561,12 +603,12 @@ def from_path( def is_binary( - fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg] + fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg] steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, diff --git a/charset_normalizer/cd.py b/src/charset_normalizer/cd.py similarity index 86% rename from charset_normalizer/cd.py rename to src/charset_normalizer/cd.py index 4ea6760..71a3ed5 100644 --- a/charset_normalizer/cd.py +++ b/src/charset_normalizer/cd.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import importlib from codecs import IncrementalDecoder from collections import Counter from functools import lru_cache -from typing import Counter as TypeCounter, Dict, List, Optional, Tuple +from typing import Counter as TypeCounter from .constant import ( FREQUENCIES, @@ -22,26 +24,24 @@ ) -def encoding_unicode_range(iana_name: str) -> List[str]: +def encoding_unicode_range(iana_name: str) -> list[str]: """ Return associated unicode ranges in a single byte code page. """ if is_multi_byte_encoding(iana_name): - raise IOError("Function not supported on multi-byte code page") + raise OSError("Function not supported on multi-byte code page") - decoder = importlib.import_module( - "encodings.{}".format(iana_name) - ).IncrementalDecoder + decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder p: IncrementalDecoder = decoder(errors="ignore") - seen_ranges: Dict[str, int] = {} + seen_ranges: dict[str, int] = {} character_count: int = 0 for i in range(0x40, 0xFF): chunk: str = p.decode(bytes([i])) if chunk: - character_range: Optional[str] = unicode_range(chunk) + character_range: str | None = unicode_range(chunk) if character_range is None: continue @@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]: ) -def unicode_range_languages(primary_range: str) -> List[str]: +def unicode_range_languages(primary_range: str) -> list[str]: """ Return inferred languages used with a unicode range. """ - languages: List[str] = [] + languages: list[str] = [] for language, characters in FREQUENCIES.items(): for character in characters: @@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]: @lru_cache() -def encoding_languages(iana_name: str) -> List[str]: +def encoding_languages(iana_name: str) -> list[str]: """ Single-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. """ - unicode_ranges: List[str] = encoding_unicode_range(iana_name) - primary_range: Optional[str] = None + unicode_ranges: list[str] = encoding_unicode_range(iana_name) + primary_range: str | None = None for specified_range in unicode_ranges: if "Latin" not in specified_range: @@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]: @lru_cache() -def mb_encoding_languages(iana_name: str) -> List[str]: +def mb_encoding_languages(iana_name: str) -> list[str]: """ Multi-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. @@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]: @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT) -def get_target_features(language: str) -> Tuple[bool, bool]: +def get_target_features(language: str) -> tuple[bool, bool]: """ Determine main aspects from a supported language if it contains accents and if is pure Latin. """ @@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]: def alphabet_languages( - characters: List[str], ignore_non_latin: bool = False -) -> List[str]: + characters: list[str], ignore_non_latin: bool = False +) -> list[str]: """ Return associated languages associated to given characters. """ - languages: List[Tuple[str, float]] = [] + languages: list[tuple[str, float]] = [] source_have_accents = any(is_accentuated(character) for character in characters) @@ -170,7 +170,7 @@ def alphabet_languages( def characters_popularity_compare( - language: str, ordered_characters: List[str] + language: str, ordered_characters: list[str] ) -> float: """ Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language. @@ -178,7 +178,7 @@ def characters_popularity_compare( Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.) """ if language not in FREQUENCIES: - raise ValueError("{} not available".format(language)) + raise ValueError(f"{language} not available") character_approved_count: int = 0 FREQUENCIES_language_set = set(FREQUENCIES[language]) @@ -214,14 +214,14 @@ def characters_popularity_compare( character_approved_count += 1 continue - characters_before_source: List[str] = FREQUENCIES[language][ + characters_before_source: list[str] = FREQUENCIES[language][ 0:character_rank_in_language ] - characters_after_source: List[str] = FREQUENCIES[language][ + characters_after_source: list[str] = FREQUENCIES[language][ character_rank_in_language: ] - characters_before: List[str] = ordered_characters[0:character_rank] - characters_after: List[str] = ordered_characters[character_rank:] + characters_before: list[str] = ordered_characters[0:character_rank] + characters_after: list[str] = ordered_characters[character_rank:] before_match_count: int = len( set(characters_before) & set(characters_before_source) @@ -249,24 +249,24 @@ def characters_popularity_compare( return character_approved_count / len(ordered_characters) -def alpha_unicode_split(decoded_sequence: str) -> List[str]: +def alpha_unicode_split(decoded_sequence: str) -> list[str]: """ Given a decoded text sequence, return a list of str. Unicode range / alphabet separation. Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; One containing the latin letters and the other hebrew. """ - layers: Dict[str, str] = {} + layers: dict[str, str] = {} for character in decoded_sequence: if character.isalpha() is False: continue - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: continue - layer_target_range: Optional[str] = None + layer_target_range: str | None = None for discovered_range in layers: if ( @@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]: return list(layers.values()) -def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: +def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches: """ This function merge results previously given by the function coherence_ratio. The return type is the same as coherence_ratio. """ - per_language_ratios: Dict[str, List[float]] = {} + per_language_ratios: dict[str, list[float]] = {} for result in results: for sub_result in result: language, ratio = sub_result @@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: We shall NOT return "English—" in CoherenceMatches because it is an alternative of "English". This function only keeps the best match and remove the em-dash in it. """ - index_results: Dict[str, List[float]] = dict() + index_results: dict[str, list[float]] = dict() for result in results: language, ratio = result @@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: @lru_cache(maxsize=2048) def coherence_ratio( - decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None + decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None ) -> CoherenceMatches: """ Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers. A layer = Character extraction by alphabets/ranges. """ - results: List[Tuple[str, float]] = [] + results: list[tuple[str, float]] = [] ignore_non_latin: bool = False sufficient_match_count: int = 0 @@ -371,7 +371,7 @@ def coherence_ratio( if character_count <= TOO_SMALL_SEQUENCE: continue - popular_character_ordered: List[str] = [c for c, o in most_common] + popular_character_ordered: list[str] = [c for c, o in most_common] for language in lg_inclusion_list or alphabet_languages( popular_character_ordered, ignore_non_latin diff --git a/charset_normalizer/cli/__init__.py b/src/charset_normalizer/cli/__init__.py similarity index 73% rename from charset_normalizer/cli/__init__.py rename to src/charset_normalizer/cli/__init__.py index d95fedf..543a5a4 100644 --- a/charset_normalizer/cli/__init__.py +++ b/src/charset_normalizer/cli/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .__main__ import cli_detect, query_yes_no __all__ = ( diff --git a/charset_normalizer/cli/__main__.py b/src/charset_normalizer/cli/__main__.py similarity index 71% rename from charset_normalizer/cli/__main__.py rename to src/charset_normalizer/cli/__main__.py index f4bcbaa..cb64156 100644 --- a/charset_normalizer/cli/__main__.py +++ b/src/charset_normalizer/cli/__main__.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import argparse import sys +import typing from json import dumps from os.path import abspath, basename, dirname, join, realpath from platform import python_version -from typing import List, Optional from unicodedata import unidata_version import charset_normalizer.md as md_module @@ -42,10 +44,69 @@ def query_yes_no(question: str, default: str = "yes") -> bool: elif choice in valid: return valid[choice] else: - sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") + sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") + + +class FileType: + """Factory for creating file object types + Instances of FileType are typically passed as type= arguments to the + ArgumentParser add_argument() method. -def cli_detect(argv: Optional[List[str]] = None) -> int: + Keyword Arguments: + - mode -- A string indicating how the file is to be opened. Accepts the + same values as the builtin open() function. + - bufsize -- The file's desired buffer size. Accepts the same values as + the builtin open() function. + - encoding -- The file's encoding. Accepts the same values as the + builtin open() function. + - errors -- A string indicating how encoding and decoding errors are to + be handled. Accepts the same value as the builtin open() function. + + Backported from CPython 3.12 + """ + + def __init__( + self, + mode: str = "r", + bufsize: int = -1, + encoding: str | None = None, + errors: str | None = None, + ): + self._mode = mode + self._bufsize = bufsize + self._encoding = encoding + self._errors = errors + + def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg] + # the special argument "-" means sys.std{in,out} + if string == "-": + if "r" in self._mode: + return sys.stdin.buffer if "b" in self._mode else sys.stdin + elif any(c in self._mode for c in "wax"): + return sys.stdout.buffer if "b" in self._mode else sys.stdout + else: + msg = f'argument "-" with mode {self._mode}' + raise ValueError(msg) + + # all other arguments are used as file names + try: + return open(string, self._mode, self._bufsize, self._encoding, self._errors) + except OSError as e: + message = f"can't open '{string}': {e}" + raise argparse.ArgumentTypeError(message) + + def __repr__(self) -> str: + args = self._mode, self._bufsize + kwargs = [("encoding", self._encoding), ("errors", self._errors)] + args_str = ", ".join( + [repr(arg) for arg in args if arg != -1] + + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None] + ) + return f"{type(self).__name__}({args_str})" + + +def cli_detect(argv: list[str] | None = None) -> int: """ CLI assistant using ARGV and ArgumentParser :param argv: @@ -58,7 +119,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: ) parser.add_argument( - "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed" + "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed" ) parser.add_argument( "-v", @@ -109,6 +170,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: dest="force", help="Replace file without asking if you are sure, use this flag with caution.", ) + parser.add_argument( + "-i", + "--no-preemptive", + action="store_true", + default=False, + dest="no_preemptive", + help="Disable looking at a charset declaration to hint the detector.", + ) parser.add_argument( "-t", "--threshold", @@ -116,7 +185,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: default=0.2, type=float, dest="threshold", - help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.", + help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.", ) parser.add_argument( "--version", @@ -133,21 +202,35 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: args = parser.parse_args(argv) if args.replace is True and args.normalize is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --replace in addition of --normalize only.", file=sys.stderr) return 1 if args.force is True and args.replace is False: + if args.files: + for my_file in args.files: + my_file.close() print("Use --force in addition of --replace only.", file=sys.stderr) return 1 if args.threshold < 0.0 or args.threshold > 1.0: + if args.files: + for my_file in args.files: + my_file.close() print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr) return 1 x_ = [] for my_file in args.files: - matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose) + matches = from_fp( + my_file, + threshold=args.threshold, + explain=args.verbose, + preemptive_behaviour=args.no_preemptive is False, + ) best_guess = matches.best() @@ -155,9 +238,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: print( 'Unable to identify originating encoding for "{}". {}'.format( my_file.name, - "Maybe try increasing maximum amount of chaos." - if args.threshold < 1.0 - else "", + ( + "Maybe try increasing maximum amount of chaos." + if args.threshold < 1.0 + else "" + ), ), file=sys.stderr, ) @@ -235,7 +320,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: dir_path = dirname(realpath(my_file.name)) file_name = basename(realpath(my_file.name)) - o_: List[str] = file_name.split(".") + o_: list[str] = file_name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) @@ -258,9 +343,9 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: try: x_[0].unicode_path = join(dir_path, ".".join(o_)) - with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: - fp.write(str(best_guess)) - except IOError as e: + with open(x_[0].unicode_path, "wb") as fp: + fp.write(best_guess.output()) + except OSError as e: print(str(e), file=sys.stderr) if my_file.closed is False: my_file.close() diff --git a/charset_normalizer/constant.py b/src/charset_normalizer/constant.py similarity index 93% rename from charset_normalizer/constant.py rename to src/charset_normalizer/constant.py index 8634904..cc71a01 100644 --- a/charset_normalizer/constant.py +++ b/src/charset_normalizer/constant.py @@ -1,11 +1,12 @@ -# -*- coding: utf-8 -*- +from __future__ import annotations + from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE from encodings.aliases import aliases -from re import IGNORECASE, compile as re_compile -from typing import Dict, List, Set, Union +from re import IGNORECASE +from re import compile as re_compile # Contain for each eligible encoding a list of/item bytes SIG/BOM -ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { +ENCODING_MARKS: dict[str, bytes | list[bytes]] = { "utf_8": BOM_UTF8, "utf_7": [ b"\x2b\x2f\x76\x38", @@ -25,7 +26,7 @@ UTF8_MAXIMAL_ALLOCATION: int = 1_112_064 # Up-to-date Unicode ucd/15.0.0 -UNICODE_RANGES_COMBINED: Dict[str, range] = { +UNICODE_RANGES_COMBINED: dict[str, range] = { "Control character": range(32), "Basic Latin": range(32, 128), "Latin-1 Supplement": range(128, 256), @@ -357,7 +358,7 @@ } -UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [ +UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [ "Supplement", "Extended", "Extensions", @@ -392,7 +393,7 @@ "koi8_u", ] -IANA_SUPPORTED: List[str] = sorted( +IANA_SUPPORTED: list[str] = sorted( filter( lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, @@ -403,7 +404,7 @@ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED) # pre-computed code page that are similar using the function cp_similarity. -IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = { +IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = { "cp037": ["cp1026", "cp1140", "cp273", "cp500"], "cp1026": ["cp037", "cp1140", "cp273", "cp500"], "cp1125": ["cp866"], @@ -492,7 +493,7 @@ } -CHARDET_CORRESPONDENCE: Dict[str, str] = { +CHARDET_CORRESPONDENCE: dict[str, str] = { "iso2022_kr": "ISO-2022-KR", "iso2022_jp": "ISO-2022-JP", "euc_kr": "EUC-KR", @@ -528,7 +529,7 @@ } -COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { +COMMON_SAFE_ASCII_CHARACTERS: set[str] = { "<", ">", "=", @@ -544,11 +545,30 @@ "|", '"', "-", + "(", + ")", } +# Sample character sets — replace with full lists if needed +COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞" + +COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨" + +COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生" + +# Combine all into a set +COMMON_CJK_CHARACTERS = set( + "".join( + [ + COMMON_CHINESE_CHARACTERS, + COMMON_JAPANESE_CHARACTERS, + COMMON_KOREAN_CHARACTERS, + ] + ) +) -KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} -ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} +KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"} +ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"} # Logging LEVEL below DEBUG TRACE: int = 5 @@ -556,7 +576,7 @@ # Language label that contain the em dash "—" # character are to be considered alternative seq to origin -FREQUENCIES: Dict[str, List[str]] = { +FREQUENCIES: dict[str, list[str]] = { "English": [ "e", "a", diff --git a/charset_normalizer/legacy.py b/src/charset_normalizer/legacy.py similarity index 81% rename from charset_normalizer/legacy.py rename to src/charset_normalizer/legacy.py index 43aad21..e221bec 100644 --- a/charset_normalizer/legacy.py +++ b/src/charset_normalizer/legacy.py @@ -1,13 +1,24 @@ -from typing import Any, Dict, Optional, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any from warnings import warn from .api import from_bytes from .constant import CHARDET_CORRESPONDENCE +# TODO: remove this check when dropping Python 3.7 support +if TYPE_CHECKING: + from typing_extensions import TypedDict + + class ResultDict(TypedDict): + encoding: str | None + language: str + confidence: float | None + def detect( byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any -) -> Dict[str, Optional[Union[str, float]]]: +) -> ResultDict: """ chardet legacy method Detect the encoding of the given byte string. It should be mostly backward-compatible. @@ -26,8 +37,7 @@ def detect( if not isinstance(byte_str, (bytearray, bytes)): raise TypeError( # pragma: nocover - "Expected object of type bytes or bytearray, got: " - "{0}".format(type(byte_str)) + f"Expected object of type bytes or bytearray, got: {type(byte_str)}" ) if isinstance(byte_str, bytearray): diff --git a/charset_normalizer/md.py b/src/charset_normalizer/md.py similarity index 87% rename from charset_normalizer/md.py rename to src/charset_normalizer/md.py index 77897aa..12ce024 100644 --- a/charset_normalizer/md.py +++ b/src/charset_normalizer/md.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from functools import lru_cache from logging import getLogger -from typing import List, Optional from .constant import ( COMMON_SAFE_ASCII_CHARACTERS, @@ -25,6 +26,7 @@ is_unprintable, remove_accent, unicode_range, + is_cjk_uncommon, ) @@ -68,7 +70,7 @@ def __init__(self) -> None: self._symbol_count: int = 0 self._character_count: int = 0 - self._last_printable_char: Optional[str] = None + self._last_printable_char: str | None = None self._frenzy_symbol_in_word: bool = False def eligible(self, character: str) -> bool: @@ -92,7 +94,7 @@ def feed(self, character: str) -> None: self._last_printable_char = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._punctuation_count = 0 self._character_count = 0 self._symbol_count = 0 @@ -123,7 +125,7 @@ def feed(self, character: str) -> None: if is_accentuated(character): self._accentuated_count += 1 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._accentuated_count = 0 @@ -149,7 +151,7 @@ def feed(self, character: str) -> None: self._unprintable_count += 1 self._character_count += 1 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._unprintable_count = 0 @property @@ -165,7 +167,7 @@ def __init__(self) -> None: self._successive_count: int = 0 self._character_count: int = 0 - self._last_latin_character: Optional[str] = None + self._last_latin_character: str | None = None def eligible(self, character: str) -> bool: return character.isalpha() and is_latin(character) @@ -184,7 +186,7 @@ def feed(self, character: str) -> None: self._successive_count += 1 self._last_latin_character = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._successive_count = 0 self._character_count = 0 self._last_latin_character = None @@ -201,7 +203,7 @@ class SuspiciousRange(MessDetectorPlugin): def __init__(self) -> None: self._suspicious_successive_range_count: int = 0 self._character_count: int = 0 - self._last_printable_seen: Optional[str] = None + self._last_printable_seen: str | None = None def eligible(self, character: str) -> bool: return character.isprintable() @@ -221,22 +223,22 @@ def feed(self, character: str) -> None: self._last_printable_seen = character return - unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen) - unicode_range_b: Optional[str] = unicode_range(character) + unicode_range_a: str | None = unicode_range(self._last_printable_seen) + unicode_range_b: str | None = unicode_range(character) if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 self._last_printable_seen = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._suspicious_successive_range_count = 0 self._last_printable_seen = None @property def ratio(self) -> float: - if self._character_count <= 24: + if self._character_count <= 13: return 0.0 ratio_of_suspicious_range_usage: float = ( @@ -260,6 +262,7 @@ def __init__(self) -> None: self._buffer: str = "" self._buffer_accent_count: int = 0 + self._buffer_glyph_count: int = 0 def eligible(self, character: str) -> bool: return True @@ -279,6 +282,14 @@ def feed(self, character: str) -> None: and is_thai(character) is False ): self._foreign_long_watch = True + if ( + is_cjk(character) + or is_hangul(character) + or is_katakana(character) + or is_hiragana(character) + or is_thai(character) + ): + self._buffer_glyph_count += 1 return if not self._buffer: return @@ -291,17 +302,20 @@ def feed(self, character: str) -> None: self._character_count += buffer_length if buffer_length >= 4: - if self._buffer_accent_count / buffer_length > 0.34: + if self._buffer_accent_count / buffer_length >= 0.5: self._is_current_word_bad = True # Word/Buffer ending with an upper case accentuated letter are so rare, # that we will consider them all as suspicious. Same weight as foreign_long suspicious. - if ( + elif ( is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper() and all(_.isupper() for _ in self._buffer) is False ): self._foreign_long_count += 1 self._is_current_word_bad = True + elif self._buffer_glyph_count == 1: + self._is_current_word_bad = True + self._foreign_long_count += 1 if buffer_length >= 24 and self._foreign_long_watch: camel_case_dst = [ i @@ -325,6 +339,7 @@ def feed(self, character: str) -> None: self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 + self._buffer_glyph_count = 0 elif ( character not in {"<", ">", "-", "=", "~", "|", "_"} and character.isdigit() is False @@ -333,7 +348,7 @@ def feed(self, character: str) -> None: self._is_current_word_bad = True self._buffer += character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._buffer = "" self._is_current_word_bad = False self._foreign_long_watch = False @@ -351,35 +366,39 @@ def ratio(self) -> float: return self._bad_character_count / self._character_count -class CjkInvalidStopPlugin(MessDetectorPlugin): +class CjkUncommonPlugin(MessDetectorPlugin): """ - GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and - can be easily detected. Searching for the overuse of '丅' and '丄'. + Detect messy CJK text that probably means nothing. """ def __init__(self) -> None: - self._wrong_stop_count: int = 0 - self._cjk_character_count: int = 0 + self._character_count: int = 0 + self._uncommon_count: int = 0 def eligible(self, character: str) -> bool: - return True + return is_cjk(character) def feed(self, character: str) -> None: - if character in {"丅", "丄"}: - self._wrong_stop_count += 1 + self._character_count += 1 + + if is_cjk_uncommon(character): + self._uncommon_count += 1 return - if is_cjk(character): - self._cjk_character_count += 1 - def reset(self) -> None: # pragma: no cover - self._wrong_stop_count = 0 - self._cjk_character_count = 0 + def reset(self) -> None: # Abstract + self._character_count = 0 + self._uncommon_count = 0 @property def ratio(self) -> float: - if self._cjk_character_count < 16: + if self._character_count < 8: return 0.0 - return self._wrong_stop_count / self._cjk_character_count + + uncommon_form_usage: float = self._uncommon_count / self._character_count + + # we can be pretty sure it's garbage when uncommon characters are widely + # used. otherwise it could just be traditional chinese for example. + return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0 class ArchaicUpperLowerPlugin(MessDetectorPlugin): @@ -393,7 +412,7 @@ def __init__(self) -> None: self._character_count: int = 0 - self._last_alpha_seen: Optional[str] = None + self._last_alpha_seen: str | None = None self._current_ascii_only: bool = True def eligible(self, character: str) -> bool: @@ -441,7 +460,7 @@ def feed(self, character: str) -> None: self._character_count_since_last_sep += 1 self._last_alpha_seen = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._character_count_since_last_sep = 0 self._successive_upper_lower_count = 0 @@ -463,7 +482,7 @@ def __init__(self) -> None: self._character_count: int = 0 self._isolated_form_count: int = 0 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._isolated_form_count = 0 @@ -488,7 +507,7 @@ def ratio(self) -> float: @lru_cache(maxsize=1024) def is_suspiciously_successive_range( - unicode_range_a: Optional[str], unicode_range_b: Optional[str] + unicode_range_a: str | None, unicode_range_b: str | None ) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. @@ -512,9 +531,10 @@ def is_suspiciously_successive_range( ): return False - keywords_range_a, keywords_range_b = unicode_range_a.split( - " " - ), unicode_range_b.split(" ") + keywords_range_a, keywords_range_b = ( + unicode_range_a.split(" "), + unicode_range_b.split(" "), + ) for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: @@ -567,7 +587,7 @@ def mess_ratio( Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ - detectors: List[MessDetectorPlugin] = [ + detectors: list[MessDetectorPlugin] = [ md_class() for md_class in MessDetectorPlugin.__subclasses__() ] @@ -609,7 +629,7 @@ def mess_ratio( logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") - for dt in detectors: # pragma: nocover + for dt in detectors: logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3) diff --git a/charset_normalizer/models.py b/src/charset_normalizer/models.py similarity index 77% rename from charset_normalizer/models.py rename to src/charset_normalizer/models.py index a760b9c..1042758 100644 --- a/charset_normalizer/models.py +++ b/src/charset_normalizer/models.py @@ -1,9 +1,12 @@ +from __future__ import annotations + from encodings.aliases import aliases from hashlib import sha256 from json import dumps -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from re import sub +from typing import Any, Iterator, List, Tuple -from .constant import TOO_BIG_SEQUENCE +from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -14,8 +17,9 @@ def __init__( guessed_encoding: str, mean_mess_ratio: float, has_sig_or_bom: bool, - languages: "CoherenceMatches", - decoded_payload: Optional[str] = None, + languages: CoherenceMatches, + decoded_payload: str | None = None, + preemptive_declaration: str | None = None, ): self._payload: bytes = payload @@ -23,23 +27,23 @@ def __init__( self._mean_mess_ratio: float = mean_mess_ratio self._languages: CoherenceMatches = languages self._has_sig_or_bom: bool = has_sig_or_bom - self._unicode_ranges: Optional[List[str]] = None + self._unicode_ranges: list[str] | None = None - self._leaves: List[CharsetMatch] = [] + self._leaves: list[CharsetMatch] = [] self._mean_coherence_ratio: float = 0.0 - self._output_payload: Optional[bytes] = None - self._output_encoding: Optional[str] = None + self._output_payload: bytes | None = None + self._output_encoding: str | None = None + + self._string: str | None = decoded_payload - self._string: Optional[str] = decoded_payload + self._preemptive_declaration: str | None = preemptive_declaration def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): - raise TypeError( - "__eq__ cannot be invoked on {} and {}.".format( - str(other.__class__), str(self.__class__) - ) - ) + if isinstance(other, str): + return iana_name(other) == self.encoding + return False return self.encoding == other.encoding and self.fingerprint == other.fingerprint def __lt__(self, other: object) -> bool: @@ -75,9 +79,9 @@ def __str__(self) -> str: return self._string def __repr__(self) -> str: - return "".format(self.encoding, self.fingerprint) + return f"" - def add_submatch(self, other: "CharsetMatch") -> None: + def add_submatch(self, other: CharsetMatch) -> None: if not isinstance(other, CharsetMatch) or other == self: raise ValueError( "Unable to add instance <{}> as a submatch of a CharsetMatch".format( @@ -93,11 +97,11 @@ def encoding(self) -> str: return self._encoding @property - def encoding_aliases(self) -> List[str]: + def encoding_aliases(self) -> list[str]: """ Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. """ - also_known_as: List[str] = [] + also_known_as: list[str] = [] for u, p in aliases.items(): if self.encoding == u: also_known_as.append(p) @@ -114,7 +118,7 @@ def byte_order_mark(self) -> bool: return self._has_sig_or_bom @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: """ Return the complete list of possible languages found in decoded sequence. Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. @@ -175,7 +179,7 @@ def raw(self) -> bytes: return self._payload @property - def submatch(self) -> List["CharsetMatch"]: + def submatch(self) -> list[CharsetMatch]: return self._leaves @property @@ -183,19 +187,17 @@ def has_submatch(self) -> bool: return len(self._leaves) > 0 @property - def alphabets(self) -> List[str]: + def alphabets(self) -> list[str]: if self._unicode_ranges is not None: return self._unicode_ranges # list detected ranges - detected_ranges: List[Optional[str]] = [ - unicode_range(char) for char in str(self) - ] + detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)] # filter and sort self._unicode_ranges = sorted(list({r for r in detected_ranges if r})) return self._unicode_ranges @property - def could_be_from_charset(self) -> List[str]: + def could_be_from_charset(self) -> list[str]: """ The complete list of encoding that output the exact SAME str result and therefore could be the originating encoding. @@ -210,7 +212,25 @@ def output(self, encoding: str = "utf_8") -> bytes: """ if self._output_encoding is None or self._output_encoding != encoding: self._output_encoding = encoding - self._output_payload = str(self).encode(encoding, "replace") + decoded_string = str(self) + if ( + self._preemptive_declaration is not None + and self._preemptive_declaration.lower() + not in ["utf-8", "utf8", "utf_8"] + ): + patched_header = sub( + RE_POSSIBLE_ENCODING_INDICATION, + lambda m: m.string[m.span()[0] : m.span()[1]].replace( + m.groups()[0], + iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type] + ), + decoded_string[:8192], + count=1, + ) + + decoded_string = patched_header + decoded_string[8192:] + + self._output_payload = decoded_string.encode(encoding, "replace") return self._output_payload # type: ignore @@ -228,13 +248,13 @@ class CharsetMatches: Act like a list(iterable) but does not implements all related methods. """ - def __init__(self, results: Optional[List[CharsetMatch]] = None): - self._results: List[CharsetMatch] = sorted(results) if results else [] + def __init__(self, results: list[CharsetMatch] | None = None): + self._results: list[CharsetMatch] = sorted(results) if results else [] def __iter__(self) -> Iterator[CharsetMatch]: yield from self._results - def __getitem__(self, item: Union[int, str]) -> CharsetMatch: + def __getitem__(self, item: int | str) -> CharsetMatch: """ Retrieve a single item either by its position or encoding name (alias may be used here). Raise KeyError upon invalid index or encoding not present in results. @@ -266,7 +286,7 @@ def append(self, item: CharsetMatch) -> None: ) ) # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage) - if len(item.raw) <= TOO_BIG_SEQUENCE: + if len(item.raw) < TOO_BIG_SEQUENCE: for match in self._results: if match.fingerprint == item.fingerprint and match.chaos == item.chaos: match.add_submatch(item) @@ -274,7 +294,7 @@ def append(self, item: CharsetMatch) -> None: self._results.append(item) self._results = sorted(self._results) - def best(self) -> Optional["CharsetMatch"]: + def best(self) -> CharsetMatch | None: """ Simply return the first match. Strict equivalent to matches[0]. """ @@ -282,7 +302,7 @@ def best(self) -> Optional["CharsetMatch"]: return None return self._results[0] - def first(self) -> Optional["CharsetMatch"]: + def first(self) -> CharsetMatch | None: """ Redundant method, call the method best(). Kept for BC reasons. """ @@ -297,31 +317,31 @@ class CliDetectionResult: def __init__( self, path: str, - encoding: Optional[str], - encoding_aliases: List[str], - alternative_encodings: List[str], + encoding: str | None, + encoding_aliases: list[str], + alternative_encodings: list[str], language: str, - alphabets: List[str], + alphabets: list[str], has_sig_or_bom: bool, chaos: float, coherence: float, - unicode_path: Optional[str], + unicode_path: str | None, is_preferred: bool, ): self.path: str = path - self.unicode_path: Optional[str] = unicode_path - self.encoding: Optional[str] = encoding - self.encoding_aliases: List[str] = encoding_aliases - self.alternative_encodings: List[str] = alternative_encodings + self.unicode_path: str | None = unicode_path + self.encoding: str | None = encoding + self.encoding_aliases: list[str] = encoding_aliases + self.alternative_encodings: list[str] = alternative_encodings self.language: str = language - self.alphabets: List[str] = alphabets + self.alphabets: list[str] = alphabets self.has_sig_or_bom: bool = has_sig_or_bom self.chaos: float = chaos self.coherence: float = coherence self.is_preferred: bool = is_preferred @property - def __dict__(self) -> Dict[str, Any]: # type: ignore + def __dict__(self) -> dict[str, Any]: # type: ignore return { "path": self.path, "encoding": self.encoding, diff --git a/charset_normalizer/py.typed b/src/charset_normalizer/py.typed similarity index 100% rename from charset_normalizer/py.typed rename to src/charset_normalizer/py.typed diff --git a/charset_normalizer/utils.py b/src/charset_normalizer/utils.py similarity index 84% rename from charset_normalizer/utils.py rename to src/charset_normalizer/utils.py index e5cbbf4..6bf0384 100644 --- a/charset_normalizer/utils.py +++ b/src/charset_normalizer/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib import logging import unicodedata @@ -5,9 +7,11 @@ from encodings.aliases import aliases from functools import lru_cache from re import findall -from typing import Generator, List, Optional, Set, Tuple, Union +from typing import Generator -from _multibytecodec import MultibyteIncrementalDecoder +from _multibytecodec import ( # type: ignore[import-not-found,import] + MultibyteIncrementalDecoder, +) from .constant import ( ENCODING_MARKS, @@ -16,6 +20,7 @@ UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, UTF8_MAXIMAL_ALLOCATION, + COMMON_CJK_CHARACTERS, ) @@ -23,7 +28,7 @@ def is_accentuated(character: str) -> bool: try: description: str = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return ( "WITH GRAVE" in description @@ -43,13 +48,13 @@ def remove_accent(character: str) -> str: if not decomposed: return character - codes: List[str] = decomposed.split(" ") + codes: list[str] = decomposed.split(" ") return chr(int(codes[0], 16)) @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) -def unicode_range(character: str) -> Optional[str]: +def unicode_range(character: str) -> str | None: """ Retrieve the Unicode range official name from a single character. """ @@ -66,7 +71,7 @@ def unicode_range(character: str) -> Optional[str]: def is_latin(character: str) -> bool: try: description: str = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "LATIN" in description @@ -78,7 +83,7 @@ def is_punctuation(character: str) -> bool: if "P" in character_category: return True - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -93,7 +98,7 @@ def is_symbol(character: str) -> bool: if "S" in character_category or "N" in character_category: return True - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -103,7 +108,7 @@ def is_symbol(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_emoticon(character: str) -> bool: - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -130,7 +135,7 @@ def is_case_variable(character: str) -> bool: def is_cjk(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "CJK" in character_name @@ -140,7 +145,7 @@ def is_cjk(character: str) -> bool: def is_hiragana(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "HIRAGANA" in character_name @@ -150,7 +155,7 @@ def is_hiragana(character: str) -> bool: def is_katakana(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "KATAKANA" in character_name @@ -160,7 +165,7 @@ def is_katakana(character: str) -> bool: def is_hangul(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "HANGUL" in character_name @@ -170,7 +175,7 @@ def is_hangul(character: str) -> bool: def is_thai(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "THAI" in character_name @@ -180,7 +185,7 @@ def is_thai(character: str) -> bool: def is_arabic(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "ARABIC" in character_name @@ -190,12 +195,17 @@ def is_arabic(character: str) -> bool: def is_arabic_isolated_form(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "ARABIC" in character_name and "ISOLATED FORM" in character_name +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_cjk_uncommon(character: str) -> bool: + return character not in COMMON_CJK_CHARACTERS + + @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED)) def is_unicode_range_secondary(range_name: str) -> bool: return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD) @@ -206,13 +216,13 @@ def is_unprintable(character: str) -> bool: return ( character.isspace() is False # includes \n \t \r \v and character.isprintable() is False - and character != "\x1A" # Why? Its the ASCII substitute character. + and character != "\x1a" # Why? Its the ASCII substitute character. and character != "\ufeff" # bug discovered in Python, # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space. ) -def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]: +def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. """ @@ -221,7 +231,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional seq_len: int = len(sequence) - results: List[str] = findall( + results: list[str] = findall( RE_POSSIBLE_ENCODING_INDICATION, sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"), ) @@ -260,18 +270,18 @@ def is_multi_byte_encoding(name: str) -> bool: "utf_32_be", "utf_7", } or issubclass( - importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, + importlib.import_module(f"encodings.{name}").IncrementalDecoder, MultibyteIncrementalDecoder, ) -def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]: +def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]: """ Identify and extract SIG/BOM in given sequence. """ for iana_encoding in ENCODING_MARKS: - marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding] + marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding] if isinstance(marks, bytes): marks = [marks] @@ -288,6 +298,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool: def iana_name(cp_name: str, strict: bool = True) -> str: + """Returns the Python normalized encoding name (Not the IANA official name).""" cp_name = cp_name.lower().replace("-", "_") encoding_alias: str @@ -298,35 +309,17 @@ def iana_name(cp_name: str, strict: bool = True) -> str: return encoding_iana if strict: - raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name)) + raise ValueError(f"Unable to retrieve IANA for '{cp_name}'") return cp_name -def range_scan(decoded_sequence: str) -> List[str]: - ranges: Set[str] = set() - - for character in decoded_sequence: - character_range: Optional[str] = unicode_range(character) - - if character_range is None: - continue - - ranges.add(character_range) - - return list(ranges) - - def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b): return 0.0 - decoder_a = importlib.import_module( - "encodings.{}".format(iana_name_a) - ).IncrementalDecoder - decoder_b = importlib.import_module( - "encodings.{}".format(iana_name_b) - ).IncrementalDecoder + decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder + decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder id_a: IncrementalDecoder = decoder_a(errors="ignore") id_b: IncrementalDecoder = decoder_b(errors="ignore") @@ -374,7 +367,7 @@ def cut_sequence_chunks( strip_sig_or_bom: bool, sig_payload: bytes, is_multi_byte_decoder: bool, - decoded_payload: Optional[str] = None, + decoded_payload: str | None = None, ) -> Generator[str, None, None]: if decoded_payload and is_multi_byte_decoder is False: for i in offsets: diff --git a/src/charset_normalizer/version.py b/src/charset_normalizer/version.py new file mode 100644 index 0000000..e5687e3 --- /dev/null +++ b/src/charset_normalizer/version.py @@ -0,0 +1,8 @@ +""" +Expose version +""" + +from __future__ import annotations + +__version__ = "3.4.2" +VERSION = __version__.split(".") diff --git a/tests/__init__.py b/tests/__init__.py index 8b13789..e69de29 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py index 3180a50..e4fb5fd 100644 --- a/tests/test_base_detection.py +++ b/tests/test_base_detection.py @@ -1,40 +1,52 @@ -from charset_normalizer.api import from_bytes -from charset_normalizer.models import CharsetMatches +from __future__ import annotations import pytest +from charset_normalizer.api import from_bytes +from charset_normalizer.models import CharsetMatches + def test_empty(): - best_guess = from_bytes(b'').best() + best_guess = from_bytes(b"").best() assert best_guess is not None, "Empty bytes payload SHOULD NOT return None" - assert best_guess.encoding == "utf_8", "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)" + assert ( + best_guess.encoding == "utf_8" + ), "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)" assert len(best_guess.alphabets) == 0, "" def test_bool_matches(): - guesses_not_empty = from_bytes(b'') + guesses_not_empty = from_bytes(b"") guesses_empty = CharsetMatches([]) - assert bool(guesses_not_empty) is True, "Bool behaviour of CharsetMatches altered, should be True" - assert bool(guesses_empty) is False, "Bool behaviour of CharsetMatches altered, should be False" + assert ( + bool(guesses_not_empty) is True + ), "Bool behaviour of CharsetMatches altered, should be True" + assert ( + bool(guesses_empty) is False + ), "Bool behaviour of CharsetMatches altered, should be False" @pytest.mark.parametrize( "payload, expected_encoding", [ - (b'\xfe\xff', 'utf_16'), - ('\uFEFF'.encode('gb18030'), 'gb18030'), - (b'\xef\xbb\xbf', 'utf_8'), - ("".encode('utf_32'), "utf_32") - ] + (b"\xfe\xff", "utf_16"), + ("\uFEFF".encode("gb18030"), "gb18030"), + (b"\xef\xbb\xbf", "utf_8"), + ("".encode("utf_32"), "utf_32"), + ], ) def test_empty_but_with_bom_or_sig(payload, expected_encoding): best_guess = from_bytes(payload).best() assert best_guess is not None, "Empty detection but with SIG/BOM has failed!" - assert best_guess.encoding == expected_encoding, "Empty detection but with SIG/BOM is wrongly detected!" - assert best_guess.raw == payload, "The RAW property should contain the original payload given for detection." + assert ( + best_guess.encoding == expected_encoding + ), "Empty detection but with SIG/BOM is wrongly detected!" + assert ( + best_guess.raw == payload + ), "The RAW property should contain the original payload given for detection." assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True" assert str(best_guess) == "", "The cast to str SHOULD be empty" @@ -42,16 +54,27 @@ def test_empty_but_with_bom_or_sig(payload, expected_encoding): @pytest.mark.parametrize( "payload, expected_encoding", [ - ((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030'), "gb18030",), - ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_32'), "utf_32",), - ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8_sig'), "utf_8",), - ] + ( + ("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"), + "gb18030", + ), + ( + "我没有埋怨,磋砣的只是一些时间。".encode("utf_32"), + "utf_32", + ), + ( + "我没有埋怨,磋砣的只是一些时间。".encode("utf_8_sig"), + "utf_8", + ), + ], ) def test_content_with_bom_or_sig(payload, expected_encoding): best_guess = from_bytes(payload).best() assert best_guess is not None, "Detection but with SIG/BOM has failed!" - assert best_guess.encoding == expected_encoding, "Detection but with SIG/BOM is wrongly detected!" + assert ( + best_guess.encoding == expected_encoding + ), "Detection but with SIG/BOM is wrongly detected!" assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True" @@ -63,42 +86,49 @@ def test_content_with_bom_or_sig(payload, expected_encoding): b'{"token": "g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY"}', b"81f4ab054b39cb0e12701e734077d84264308f5fc79494fc5f159fa2ebc07b73c8cc0e98e009664a20986706f90146e8eefcb929ce1f74a8eab21369fdc70198", b"{}", - ] + ], ) def test_obviously_ascii_content(payload): best_guess = from_bytes(payload).best() assert best_guess is not None, "Dead-simple ASCII detection has failed!" - assert best_guess.encoding == "ascii", "Dead-simple ASCII detection is wrongly detected!" + assert ( + best_guess.encoding == "ascii" + ), "Dead-simple ASCII detection is wrongly detected!" @pytest.mark.parametrize( "payload", [ - '\u020d\x1b'.encode('utf-8'), - 'h\xe9llo world!\n'.encode('utf_8'), - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8'), - 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8'), - 'Bсеки човек има право на образование.'.encode('utf_8'), - "(° ͜ʖ °), creepy face, smiley 😀".encode("utf_8"), - """["Financiën", "La France"]""".encode("utf_8"), - "Qu'est ce que une étoile?".encode("utf_8"), - """Financiën""".encode("utf_8"), - "😀".encode("utf_8") - ] + "\u020d\x1b".encode(), + "h\xe9llo world!\n".encode(), + "我没有埋怨,磋砣的只是一些时间。".encode(), + "Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.".encode(), + "Bсеки човек има право на образование.".encode(), + "(° ͜ʖ °), creepy face, smiley 😀".encode(), + """["Financiën", "La France"]""".encode(), + "Qu'est ce que une étoile?".encode(), + """Financiën""".encode(), + "😀".encode(), + ], ) def test_obviously_utf8_content(payload): best_guess = from_bytes(payload).best() assert best_guess is not None, "Dead-simple UTF-8 detection has failed!" - assert best_guess.encoding == "utf_8", "Dead-simple UTF-8 detection is wrongly detected!" + assert ( + best_guess.encoding == "utf_8" + ), "Dead-simple UTF-8 detection is wrongly detected!" def test_mb_cutting_chk(): # This payload should be wrongfully split and the autofix should ran automatically # on chunks extraction. - payload = b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " \ - b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " * 128 + payload = ( + b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " + b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " + * 128 + ) guesses = from_bytes(payload, cp_isolation=["cp949"]) best_guess = guesses.best() @@ -108,9 +138,7 @@ def test_mb_cutting_chk(): def test_alphabets_property(): - best_guess = from_bytes( - "😀 Hello World! How affairs are going? 😀".encode("utf_8") - ).best() + best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best() assert "Basic Latin" in best_guess.alphabets assert "Emoticons range(Emoji)" in best_guess.alphabets @@ -119,7 +147,16 @@ def test_alphabets_property(): def test_doc_example_short_cp1251(): best_guess = from_bytes( - 'Bсеки човек има право на образование.'.encode('cp1251') + "Bсеки човек има право на образование.".encode("cp1251") ).best() assert best_guess.encoding == "cp1251" + + +def test_direct_cmp_charset_match(): + best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best() + + assert best_guess == "utf_8" + assert best_guess == "utf-8" + assert best_guess != 8 + assert best_guess != None diff --git a/tests/test_cli.py b/tests/test_cli.py index b73fb61..5f2777c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,70 +1,46 @@ +from __future__ import annotations + import unittest -from charset_normalizer.cli import cli_detect, query_yes_no -from unittest.mock import patch +from os import pardir, path, remove from os.path import exists -from os import remove, path, pardir +from unittest.mock import patch + +from charset_normalizer.cli import cli_detect, query_yes_no -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) class TestCommandLineInterface(unittest.TestCase): - - @patch('builtins.input', lambda *args: 'y') + @patch("builtins.input", lambda *args: "y") def test_simple_yes_input(self): - self.assertTrue( - query_yes_no('Are u willing to chill a little bit ?') - ) + self.assertTrue(query_yes_no("Are u willing to chill a little bit ?")) - @patch('builtins.input', lambda *args: 'N') + @patch("builtins.input", lambda *args: "N") def test_simple_no_input(self): - self.assertFalse( - query_yes_no('Are u willing to chill a little bit ?') - ) + self.assertFalse(query_yes_no("Are u willing to chill a little bit ?")) def test_single_file(self): - - self.assertEqual( - 0, - cli_detect( - [DIR_PATH + '/data/sample-arabic-1.txt'] - ) - ) + self.assertEqual(0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt"])) def test_version_output_success(self): with self.assertRaises(SystemExit): - cli_detect( - ['--version'] - ) + cli_detect(["--version"]) def test_single_file_normalize(self): self.assertEqual( - 0, - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--normalize' - ] - ) + 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--normalize"]) ) - self.assertTrue( - exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') - ) + self.assertTrue(exists(DIR_PATH + "/data/sample-arabic-1.cp1256.txt")) try: - remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') + remove(DIR_PATH + "/data/sample-arabic-1.cp1256.txt") except: pass def test_single_verbose_file(self): self.assertEqual( - 0, - cli_detect( - [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose'] - ) + 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--verbose"]) ) def test_multiple_file(self): @@ -72,11 +48,11 @@ def test_multiple_file(self): 0, cli_detect( [ - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_alternative(self): @@ -84,12 +60,12 @@ def test_with_alternative(self): 0, cli_detect( [ - '-a', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-a", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_minimal_output(self): @@ -97,12 +73,12 @@ def test_with_minimal_output(self): 0, cli_detect( [ - '-m', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-m", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_minimal_and_alt(self): @@ -110,47 +86,31 @@ def test_with_minimal_and_alt(self): 0, cli_detect( [ - '-m', - '-a', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-m", + "-a", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_non_existent_file(self): - with self.assertRaises(SystemExit) as cm: - cli_detect( - [DIR_PATH + '/data/not_found_data.txt'] - ) + cli_detect([DIR_PATH + "/data/not_found_data.txt"]) self.assertEqual(cm.exception.code, 2) def test_replace_without_normalize(self): - self.assertEqual( - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--replace' - ] - ), - 1 + cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--replace"]), 1 ) def test_force_replace_without_replace(self): self.assertEqual( - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--force' - ] - ), - 1 + cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--force"]), 1 ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py index 7e39913..e5952d6 100644 --- a/tests/test_coherence_detection.py +++ b/tests/test_coherence_detection.py @@ -1,5 +1,14 @@ +from __future__ import annotations + import pytest -from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches + +from charset_normalizer.cd import ( + encoding_languages, + filter_alt_coherence_matches, + get_target_features, + is_multi_byte_encoding, + mb_encoding_languages, +) @pytest.mark.parametrize( @@ -13,14 +22,20 @@ ("johab", ["Korean"]), ("shift_jis", ["Japanese"]), ("mac_greek", ["Greek"]), - ("iso2022_jp", ["Japanese"]) - ] + ("iso2022_jp", ["Japanese"]), + ], ) def test_infer_language_from_cp(iana_encoding, expected_languages): - languages = mb_encoding_languages(iana_encoding) if is_multi_byte_encoding(iana_encoding) else encoding_languages(iana_encoding) + languages = ( + mb_encoding_languages(iana_encoding) + if is_multi_byte_encoding(iana_encoding) + else encoding_languages(iana_encoding) + ) for expected_language in expected_languages: - assert expected_language in languages, "Wrongly detected language for given code page" + assert ( + expected_language in languages + ), "Wrongly detected language for given code page" @pytest.mark.parametrize( @@ -31,8 +46,8 @@ def test_infer_language_from_cp(iana_encoding, expected_languages): ("Hebrew", False, False), ("Arabic", False, False), ("Vietnamese", True, True), - ("Turkish", True, True) - ] + ("Turkish", True, True), + ], ) def test_target_features(language, expected_have_accents, expected_pure_latin): target_have_accents, target_pure_latin = get_target_features(language) @@ -44,11 +59,48 @@ def test_target_features(language, expected_have_accents, expected_pure_latin): @pytest.mark.parametrize( "matches, expected_return", [ - ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]), - ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]), - ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]), - ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]), - ] + ( + [ + ( + "English", + 0.88, + ), + ("English—", 0.99), + ], + [("English", 0.99)], + ), + ( + [ + ( + "English", + 0.88, + ), + ("English—", 0.99), + ("English——", 0.999), + ], + [("English", 0.999)], + ), + ( + [ + ( + "English", + 0.88, + ), + ("English—", 0.77), + ], + [("English", 0.88)], + ), + ( + [ + ( + "English", + 0.88, + ), + ("Italian", 0.77), + ], + [("English", 0.88), ("Italian", 0.77)], + ), + ], ) def test_filter_alt_coherence_matches(matches, expected_return): results = filter_alt_coherence_matches(matches) diff --git a/tests/test_detect_legacy.py b/tests/test_detect_legacy.py index ec45aa7..bd2b035 100644 --- a/tests/test_detect_legacy.py +++ b/tests/test_detect_legacy.py @@ -1,75 +1,43 @@ +from __future__ import annotations + import unittest + from charset_normalizer.legacy import detect class TestDetectLegacy(unittest.TestCase): - def test_detect_dict_keys(self): + r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030")) - r = detect( - (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') - ) + with self.subTest("encoding key present"): + self.assertIn("encoding", r.keys()) - with self.subTest('encoding key present'): - self.assertIn( - 'encoding', - r.keys() - ) + with self.subTest("language key present"): + self.assertIn("language", r.keys()) - with self.subTest('language key present'): - self.assertIn( - 'language', - r.keys() - ) - - with self.subTest('confidence key present'): - self.assertIn( - 'confidence', - r.keys() - ) + with self.subTest("confidence key present"): + self.assertIn("confidence", r.keys()) def test_detect_dict_value_type(self): + r = detect("我没有埋怨,磋砣的只是一些时间。".encode()) - r = detect( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') - ) - - with self.subTest('encoding instance of str'): - self.assertIsInstance( - r['encoding'], - str - ) + with self.subTest("encoding instance of str"): + self.assertIsInstance(r["encoding"], str) - with self.subTest('language instance of str'): - self.assertIsInstance( - r['language'], - str - ) + with self.subTest("language instance of str"): + self.assertIsInstance(r["language"], str) - with self.subTest('confidence instance of float'): - self.assertIsInstance( - r['confidence'], - float - ) + with self.subTest("confidence instance of float"): + self.assertIsInstance(r["confidence"], float) def test_detect_dict_value(self): - r = detect( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') - ) + r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32")) - with self.subTest('encoding is equal to utf_32'): - self.assertEqual( - r['encoding'], - 'UTF-32' - ) + with self.subTest("encoding is equal to utf_32"): + self.assertEqual(r["encoding"], "UTF-32") def test_utf8_sig_not_striped(self): - r = detect( - "Hello World".encode('utf-8-sig') - ) + r = detect("Hello World".encode("utf-8-sig")) with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"): - self.assertEqual( - r['encoding'], - "UTF-8-SIG" - ) + self.assertEqual(r["encoding"], "UTF-8-SIG") diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py index f324664..5b763ba 100644 --- a/tests/test_edge_case.py +++ b/tests/test_edge_case.py @@ -1,12 +1,59 @@ -from charset_normalizer import from_bytes -import pytest +from __future__ import annotations + import platform -@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)") +import pytest + +from charset_normalizer import from_bytes + + +@pytest.mark.xfail( + platform.python_version_tuple()[0] == "3" + and platform.python_version_tuple()[1] == "7", + reason="Unicode database is too old for this case (Python 3.7)", +) def test_unicode_edge_case(): - payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3' + payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3" best_guess = from_bytes(payload).best() - assert best_guess is not None, "Payload should have given something, detection failure" + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected" + + +def test_issue_gh520(): + """Verify that minorities does not strip basic latin characters!""" + payload = b"/includes/webform.compon\xd2\xaants.inc/" + + best_guess = from_bytes(payload).best() + + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" + assert "Basic Latin" in best_guess.alphabets + + +def test_issue_gh509(): + """Two common ASCII punctuations should render as-is.""" + payload = b");" + + best_guess = from_bytes(payload).best() + + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" + assert "ascii" == best_guess.encoding + + +def test_issue_gh498(): + """This case was mistaken for utf-16-le, this should never happen again.""" + payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx" + + best_guess = from_bytes(payload).best() + + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" + assert "Cyrillic" in best_guess.alphabets diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index adff880..ff91e12 100644 --- a/tests/test_full_detection.py +++ b/tests/test_full_detection.py @@ -1,43 +1,50 @@ -from charset_normalizer.api import from_path +from __future__ import annotations + +from os import pardir, path + import pytest -from os import path, pardir -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +from charset_normalizer.api import from_path + +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) @pytest.mark.parametrize( "input_data_file, expected_charset, expected_language", [ - ('sample-arabic-1.txt', 'cp1256', 'Arabic'), - ('sample-french-1.txt', 'cp1252', 'French'), - ('sample-arabic.txt', 'utf_8', 'Arabic'), - ('sample-russian-3.txt', 'utf_8', 'Russian'), - ('sample-french.txt', 'utf_8', 'French'), - ('sample-chinese.txt', 'big5', 'Chinese'), - ('sample-greek.txt', 'cp1253', 'Greek'), - ('sample-greek-2.txt', 'cp1253', 'Greek'), - ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'), - ('sample-hebrew-3.txt', 'cp1255', 'Hebrew'), - ('sample-bulgarian.txt', 'utf_8', 'Bulgarian'), - ('sample-english.bom.txt', 'utf_8', 'English'), - ('sample-spanish.txt', 'utf_8', 'Spanish'), - ('sample-korean.txt', 'cp949', 'Korean'), - ('sample-turkish.txt', 'cp1254', 'Turkish'), - ('sample-russian-2.txt', 'utf_8', 'Russian'), - ('sample-russian.txt', 'mac_cyrillic', 'Russian'), - ('sample-polish.txt', 'utf_8', 'Polish'), - ] + ("sample-arabic-1.txt", "cp1256", "Arabic"), + ("sample-french-1.txt", "cp1252", "French"), + ("sample-arabic.txt", "utf_8", "Arabic"), + ("sample-russian-3.txt", "utf_8", "Russian"), + ("sample-french.txt", "utf_8", "French"), + ("sample-chinese.txt", "big5", "Chinese"), + ("sample-greek.txt", "cp1253", "Greek"), + ("sample-greek-2.txt", "cp1253", "Greek"), + ("sample-hebrew-2.txt", "cp1255", "Hebrew"), + ("sample-hebrew-3.txt", "cp1255", "Hebrew"), + ("sample-bulgarian.txt", "utf_8", "Bulgarian"), + ("sample-english.bom.txt", "utf_8", "English"), + ("sample-spanish.txt", "utf_8", "Spanish"), + ("sample-korean.txt", "cp949", "Korean"), + ("sample-turkish.txt", "cp1254", "Turkish"), + ("sample-russian-2.txt", "utf_8", "Russian"), + ("sample-russian.txt", "mac_cyrillic", "Russian"), + ("sample-polish.txt", "utf_8", "Polish"), + ], ) def test_elementary_detection( input_data_file: str, expected_charset: str, expected_language: str, ): - best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best() + best_guess = from_path(DIR_PATH + f"/data/{input_data_file}").best() - assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file) - assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file) - assert best_guess.language == expected_language, "Elementary language detection has failed upon '{}'".format(input_data_file) + assert ( + best_guess is not None + ), f"Elementary detection has failed upon '{input_data_file}'" + assert ( + best_guess.encoding == expected_charset + ), f"Elementary charset detection has failed upon '{input_data_file}'" + assert ( + best_guess.language == expected_language + ), f"Elementary language detection has failed upon '{input_data_file}'" diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py index b134a8a..841474f 100644 --- a/tests/test_isbinary.py +++ b/tests/test_isbinary.py @@ -1,28 +1,29 @@ -import pytest +from __future__ import annotations + import typing -from io import BytesIO from base64 import b64decode +from io import BytesIO +from os import pardir, path + +import pytest + from charset_normalizer import is_binary -from os import path, pardir -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) @pytest.mark.parametrize( "raw, expected", [ - (b'\x00\x5f\x2f\xff'*50, True), + (b"\x00\x5f\x2f\xff" * 50, True), (b64decode("R0lGODlhAQABAAAAACw="), True), (BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True), - ('sample-polish.txt', False), - ('sample-arabic.txt', False) - ] + ("sample-polish.txt", False), + ("sample-arabic.txt", False), + ], ) -def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None: +def test_isbinary(raw: bytes | typing.BinaryIO | str, expected: bool) -> None: if isinstance(raw, str): - raw = DIR_PATH + "/data/{}".format(raw) + raw = DIR_PATH + f"/data/{raw}" assert is_binary(raw) is expected diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py index 04526d3..7fc28fa 100644 --- a/tests/test_large_payload.py +++ b/tests/test_large_payload.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from charset_normalizer import from_bytes @@ -5,29 +7,43 @@ def test_large_payload_u8_sig_basic_entry(): - payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8_sig") + payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig") best_guess = from_bytes(payload).best() assert best_guess is not None, "Large U8 payload case detection completely failed" - assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!" + assert ( + best_guess.encoding == "utf_8" + ), "Large U8 payload case detection wrongly detected!" assert best_guess.bom is True, "SIG/BOM property should be True" - assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw" - assert best_guess._string is not None, "str should be decoded before direct access (sig available)" + assert len(best_guess.raw) == len( + payload + ), "Large payload should remain untouched when accessed through .raw" + assert ( + best_guess._string is not None + ), "str should be decoded before direct access (sig available)" def test_large_payload_ascii_basic_entry(): - payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8") + payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8") best_guess = from_bytes(payload).best() - assert best_guess is not None, "Large ASCII payload case detection completely failed" - assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!" + assert ( + best_guess is not None + ), "Large ASCII payload case detection completely failed" + assert ( + best_guess.encoding == "ascii" + ), "Large ASCII payload case detection wrongly detected!" assert best_guess.bom is False, "SIG/BOM property should be False" - assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw" + assert len(best_guess.raw) == len( + payload + ), "Large payload should remain untouched when accessed through .raw" assert best_guess._string is None, "str should not be decoded until direct access" def test_misleading_large_sequence(): - content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')) .encode('utf_8') + content = ( + ("hello simple ascii " * TOO_BIG_SEQUENCE) + ("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。") + ).encode("utf_8") guesses = from_bytes(content) @@ -35,5 +51,5 @@ def test_misleading_large_sequence(): match = guesses.best() assert match is not None assert match._string is not None, "str should be cached as only match" - assert match.encoding == 'utf_8' + assert match.encoding == "utf_8" assert str(match) is not None diff --git a/tests/test_logging.py b/tests/test_logging.py index f44820e..ad2413e 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -1,9 +1,12 @@ -import pytest +from __future__ import annotations + import logging -from charset_normalizer.utils import set_logging_handler -from charset_normalizer.api import from_bytes, explain_handler +import pytest + +from charset_normalizer.api import explain_handler, from_bytes from charset_normalizer.constant import TRACE +from charset_normalizer.utils import set_logging_handler class TestLogBehaviorClass: @@ -14,34 +17,32 @@ def setup_method(self): self.logger.level = logging.WARNING def test_explain_true_behavior(self, caplog): - test_sequence = b'This is a test sequence of bytes that should be sufficient' + test_sequence = b"This is a test sequence of bytes that should be sufficient" from_bytes(test_sequence, steps=1, chunk_size=50, explain=True) assert explain_handler not in self.logger.handlers for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] def test_explain_false_handler_set_behavior(self, caplog): - test_sequence = b'This is a test sequence of bytes that should be sufficient' + test_sequence = b"This is a test sequence of bytes that should be sufficient" set_logging_handler(level=TRACE, format_string="%(message)s") from_bytes(test_sequence, steps=1, chunk_size=50, explain=False) - assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers) + assert any( + isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers + ) for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] assert "Encoding detection: ascii is most likely the one." in caplog.text def test_set_stream_handler(self, caplog): - set_logging_handler( - "charset_normalizer", level=logging.DEBUG - ) + set_logging_handler("charset_normalizer", level=logging.DEBUG) self.logger.debug("log content should log with default format") for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] assert "log content should log with default format" in caplog.text def test_set_stream_handler_format(self, caplog): - set_logging_handler( - "charset_normalizer", format_string="%(message)s" - ) + set_logging_handler("charset_normalizer", format_string="%(message)s") self.logger.info("log content should only be this message") assert caplog.record_tuples == [ ( diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py index d70fee4..4089f82 100644 --- a/tests/test_mess_detection.py +++ b/tests/test_mess_detection.py @@ -1,27 +1,48 @@ +from __future__ import annotations + import pytest + from charset_normalizer.md import mess_ratio @pytest.mark.parametrize( "content, min_expected_ratio, max_expected_ratio", [ - ('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。', 0., 0.), - ('العقلية , التنويم المغناطيسي و / أو الاقتراح', 0., 0.), - ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0., 0.), + ( + "典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。", + 0.0, + 0.0, + ), + ("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0), + ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0), ("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5), - ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.), - ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5), - ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5), - ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† Ø§Ų„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…Ø§ ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊ؈) ŲˆØ§Ų„ØŪØ§ØŠŲ…""", 0.8, 3.0), + ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0), + ( + "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", + 0.1, + 0.5, + ), + ( + "Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", + 0.01, + 0.5, + ), + ( + """ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† Ø§Ų„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…Ø§ ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊ؈) ŲˆØ§Ų„ØŪØ§ØŠŲ…""", + 0.8, + 3.0, + ), ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5), - ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0) - - ] + ( + """hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", + 0.5, + 2.0, + ), + ], ) def test_mess_detection(content, min_expected_ratio, max_expected_ratio): - calculated_mess_ratio = mess_ratio( - content, - maximum_threshold=1. - ) + calculated_mess_ratio = mess_ratio(content, maximum_threshold=1.0) - assert min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio, "The mess detection ratio calculated for given content is not well adjusted!" + assert ( + min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio + ), "The mess detection ratio calculated for given content is not well adjusted!" diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py index 042415b..e56c4a1 100644 --- a/tests/test_preemptive_detection.py +++ b/tests/test_preemptive_detection.py @@ -1,5 +1,8 @@ +from __future__ import annotations + import pytest +from charset_normalizer import CharsetMatch from charset_normalizer.utils import any_specified_encoding @@ -9,18 +12,81 @@ (b'', "euc_jp"), (b'', "utf_8"), (b'', None), - (b'# coding: utf-8', "utf_8"), - (b'', 'utf_8'), - (b'', 'ascii'), - (b'', 'johab'), - (b'', 'cp037'), - (b'', "cp1252"), + (b"# coding: utf-8", "utf_8"), + (b'', "utf_8"), + (b'', "ascii"), + (b'', "johab"), + (b'', "cp037"), + (b"", "cp1252"), (b'', "cp1256"), - ] + ], ) def test_detect_most_common_body_encoding(payload, expected_encoding): - specified_encoding = any_specified_encoding( - payload + specified_encoding = any_specified_encoding(payload) + + assert ( + specified_encoding == expected_encoding + ), "Unable to determine properly encoding from given body" + + +@pytest.mark.parametrize( + "payload, expected_outcome", + [ + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + (b"# coding: utf-8", b"# coding: utf-8"), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b"", + b"", + ), + ( + b'', + b'', + ), + ], +) +def test_preemptive_mark_replacement(payload, expected_outcome): + """ + When generating (to Unicode converted) bytes, we want to change any potential declarative charset + to utf-8. This test that. + """ + specified_encoding = any_specified_encoding(payload) + + detected_encoding = ( + specified_encoding if specified_encoding is not None else "utf-8" ) - assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body" + m = CharsetMatch( + payload, + detected_encoding, + 0.0, + False, + [], + preemptive_declaration=specified_encoding, + ) + + transformed_output = m.output() + + assert transformed_output == expected_outcome diff --git a/tests/test_utils.py b/tests/test_utils.py index 5c603b3..a0cc088 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import logging + import pytest -from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler + +from charset_normalizer.utils import cp_similarity, is_accentuated, set_logging_handler @pytest.mark.parametrize(